In [37]:
import os
import pandas as pd
import numpy as np
import random
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb 

# Plot styling: use Matplotlib's 'seaborn-v0_8-bright' style sheet and render inline figures as SVG.
plt.style.use('seaborn-v0_8-bright')
%config InlineBackend.figure_format = 'svg'

# FOLDER_PATH is the folder the input CSVs are read from;
# OUTPUT_PATH is the folder the preprocessed CSVs are written to further down.
FOLDER_PATH = './champs-scalar-coupling/'
OUTPUT_PATH = "./preprocessed/"
# List the input folder's contents as a quick check of which files are available.
files = os.listdir(FOLDER_PATH)
print(files)

['scalar_coupling_contributions.csv', '.DS_Store', 'mulliken_charges.csv', 'structures', 'structures.csv', 'test.csv', 'train.csv', 'magnetic_shielding_tensors.csv', 'dipole_moments.csv', 'sample_submission.csv', 'potential_energy.csv']


In [38]:
# Load the coupling table (train.csv) and the per-atom coordinates (structures.csv)
# from the input folder in a single unpacking assignment.
df, structures_df = (
    pd.read_csv(os.path.join(FOLDER_PATH, csv_name))
    for csv_name in ('train.csv', 'structures.csv')
)

In [39]:
# Count the distinct molecules present in the coupling table (missing names are not counted).
print(df['molecule_name'].nunique(dropna=True))

85012


### Build the final dataframe with all information

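New features created: `dist` (the Euclidean distance between atom 0 and atom 1), `dist_x`, `dist_y`, `dist_z` (the absolute per-axis differences), the position (x, y, z) of each of the two atoms, and one one-hot indicator column per coupling `type`.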

In [40]:
# Attach the element symbol and Cartesian coordinates of both atoms of every coupling pair.
for atom_idx in [0, 1]:
    # validate='many_to_one' makes pandas raise a MergeError if structures_df ever holds
    # duplicate (molecule_name, atom_index) keys, instead of silently duplicating rows of df.
    df = pd.merge(df, structures_df, how='left',
                  left_on=['molecule_name', f'atom_index_{atom_idx}'],
                  right_on=['molecule_name', 'atom_index'],
                  validate='many_to_one')

    # The right-hand key duplicates atom_index_{atom_idx}; the remaining merged columns
    # get a per-atom suffix so the second pass does not collide with the first.
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x{atom_idx}',
                            'y': f'y{atom_idx}',
                            'z': f'z{atom_idx}'})

p0 = df[['x0', 'y0', 'z0']].values
p1 = df[['x1', 'y1', 'z1']].values

# Euclidean distance between the two atoms, one value per row.
df['dist'] = np.linalg.norm(p0 - p1, axis=1)
# Absolute per-axis separations (what sqrt(delta ** 2) computes, written directly).
df['dist_x'] = (df['x0'] - df['x1']).abs()
df['dist_y'] = (df['y0'] - df['y1']).abs()
df['dist_z'] = (df['z0'] - df['z1']).abs()

# One-hot encode the coupling type. The dummies are aligned to the expected column
# names by label (reindex) rather than by position, so a frame that lacks some type
# (e.g. a subsample) still gets all eight columns, each under the right name.
# Types outside this list get no column.
coupling_types = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
type_dummies = pd.get_dummies(df['type']).reindex(columns=coupling_types, fill_value=False)
df[coupling_types] = type_dummies

In [41]:
# Display the enriched frame to inspect the merged coordinates and the newly added feature columns.
df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x0,y0,z0,...,dist_y,dist_z,1JHC,1JHN,2JHC,2JHH,2JHN,3JHC,3JHH,3JHN
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807600,H,0.002150,-0.006031,0.001976,...,1.091835,0.006025,True,False,False,False,False,False,False,False
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000,H,0.002150,-0.006031,0.001976,...,1.469782,0.001700,False,False,False,True,False,False,False,False
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.254800,H,0.002150,-0.006031,0.001976,...,1.453558,0.878620,False,False,False,True,False,False,False,False
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.254300,H,0.002150,-0.006031,0.001976,...,1.443964,0.904421,False,False,False,True,False,False,False,False
4,4,dsgdb9nsd_000001,2,0,1JHC,84.807400,H,1.011731,1.463751,0.000277,...,0.377947,0.007724,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4659071,4659071,dsgdb9nsd_133884,17,4,2JHC,3.543450,H,1.126550,-1.348733,-1.933838,...,0.601497,2.169100,False,False,True,False,False,False,False,False
4659072,4659072,dsgdb9nsd_133884,17,5,3JHC,0.568997,H,1.126550,-1.348733,-1.933838,...,1.969506,2.872960,False,False,False,False,False,True,False,False
4659073,4659073,dsgdb9nsd_133884,17,6,3JHC,1.173370,H,1.126550,-1.348733,-1.933838,...,2.095765,2.412345,False,False,False,False,False,True,False,False
4659074,4659074,dsgdb9nsd_133884,17,7,2JHC,4.762010,H,1.126550,-1.348733,-1.933838,...,1.705716,0.923642,False,False,True,False,False,False,False,False


In [42]:
# Split on molecule names rather than on rows, so that all couplings of a given
# molecule end up on the same side of the 80/20 split (fixed seed for repeatability).
train_molecule_names, test_molecule_names = train_test_split(
    df['molecule_name'].unique(),
    test_size=0.2,
    random_state=42,
)

# Select the rows of each side and give both frames a fresh 0..n-1 index.
train_df = df.loc[df['molecule_name'].isin(train_molecule_names)].reset_index(drop=True)
test_df = df.loc[df['molecule_name'].isin(test_molecule_names)].reset_index(drop=True)

In [43]:
# Print the number of distinct molecules on each side of the split, train first, one per line.
print(
    *(split['molecule_name'].nunique() for split in (train_df, test_df)),
    sep='\n',
)

68009
17003


In [44]:
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Persist both sides of the split; the index carries no information, so it is omitted.
train_df.to_csv(os.path.join(OUTPUT_PATH, 'train_df.csv'), index=False)
test_df.to_csv(os.path.join(OUTPUT_PATH, 'test_df.csv'), index=False)

In [45]:
# Count the distinct molecules described in the structures table (missing names are not counted).
print(structures_df['molecule_name'].nunique(dropna=True))

130789


In [48]:
# One-hot encode the element symbol of every atom. The dummies are aligned to the
# expected column names by label (reindex) rather than by position, so a structures
# table that lacks one of the elements still gets all five columns, each under the
# right name. Elements outside this list get no column.
atom_symbols = ['C', 'F', 'H', 'N', 'O']
atom_dummies = pd.get_dummies(structures_df['atom']).reindex(columns=atom_symbols, fill_value=False)
structures_df[atom_symbols] = atom_dummies

# Keep only the atoms of molecules that belong to the train / test side of the molecule split.
train_structures = structures_df[structures_df["molecule_name"].isin(train_molecule_names)]
test_structures = structures_df[structures_df["molecule_name"].isin(test_molecule_names)]

In [49]:
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Persist the per-atom structure tables of both sides of the split, without the index.
train_structures.to_csv(os.path.join(OUTPUT_PATH, 'train_structures_df.csv'), index=False)
test_structures.to_csv(os.path.join(OUTPUT_PATH, 'test_structures_df.csv'), index=False)