In [3]:
import os
import pandas as pd
import numpy as np
import random
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb 

# Global matplotlib style applied to every figure drawn after this point.
plt.style.use('seaborn-v0_8-bright')
# Ask the inline backend to render figures as SVG.
%config InlineBackend.figure_format = 'svg'

# Directory holding the dataset files; list it to see what is available.
FOLDER_PATH = './champs-scalar-coupling/'
files = os.listdir(FOLDER_PATH)
print(files)

['scalar_coupling_contributions.csv', 'angles.csv', '.DS_Store', 'mulliken_charges.csv', 'champs-basic-graph', 'structures', 'structures.csv', 'dataset', 'test.csv', 'train.csv', 'magnetic_shielding_tensors.csv', 'dipole_moments.csv', 'sample_submission.csv', 'potential_energy.csv']


In [4]:
# Resolve both input files under FOLDER_PATH, then load them.
train_csv = os.path.join(FOLDER_PATH, 'train.csv')
structures_csv = os.path.join(FOLDER_PATH, 'structures.csv')

df = pd.read_csv(train_csv)
structures_df = pd.read_csv(structures_csv)

In [9]:
# Number of distinct molecules present in the loaded frame.
molecule_count = df['molecule_name'].nunique()
print(molecule_count)

85012


### Build the final dataframe with all information

New features created: the distance between atom 0 and atom 1 (`dist`), the absolute per-axis differences `dist_x`, `dist_y` and `dist_z`, and the position (x, y, z) of each of the two atoms.

In [5]:
# This cell overwrites `df` in place, so running it twice would merge the
# structure columns a second time and produce duplicate column names.
# Fail early with a clear message instead of silently corrupting the frame.
if 'dist' in df.columns:
    raise RuntimeError(
        "df already contains the geometry features; re-run the cell that "
        "loads train.csv before running this cell again"
    )

# Attach the element symbol and coordinates of both atoms of every pair.
# structures_df is joined twice: once on atom_index_0, once on atom_index_1.
for atom_idx in (0, 1):
    df = pd.merge(
        df, structures_df, how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
        # Each (molecule, atom_index) key must match at most one structure
        # row; otherwise the left frame would silently gain duplicate rows.
        validate='many_to_one',
    )

    # 'atom_index' duplicates atom_index_{atom_idx}; suffix the rest so the
    # second pass does not collide with the first.
    df = df.drop(columns='atom_index').rename(columns={
        'atom': f'atom_{atom_idx}',
        'x': f'x{atom_idx}',
        'y': f'y{atom_idx}',
        'z': f'z{atom_idx}',
    })

p0 = df[['x0', 'y0', 'z0']].to_numpy()
p1 = df[['x1', 'y1', 'z1']].to_numpy()
delta = p0 - p1

# Euclidean distance between the two atoms, plus the absolute difference
# along each axis (abs(d) is what sqrt(d ** 2) evaluates to).
df['dist'] = np.linalg.norm(delta, axis=1)
df['dist_x'] = np.abs(delta[:, 0])
df['dist_y'] = np.abs(delta[:, 1])
df['dist_z'] = np.abs(delta[:, 2])

map_atoms = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
map_types = {'1JHC': 0, '2JHH': 1, '1JHN': 2, '2JHN': 3, '2JHC': 4, '3JHH': 5, '3JHC': 6, '3JHN': 7}

# Integer-encode the categorical columns with Series.map rather than
# Series.replace: replace() on an object column relies on an implicit
# object -> int downcast that pandas 2.2 deprecates, so the result can stay
# object dtype. map() builds a numeric column directly.
for column, mapping in (('atom_0', map_atoms), ('atom_1', map_atoms), ('type', map_types)):
    encoded = df[column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # map() yields NaN for labels missing from the mapping (and for rows
        # the left merge could not match); report them instead of letting
        # NaNs flow into the saved dataset.
        unknown = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(f"column '{column}' has values without an integer code: {unknown}")
    df[column] = encoded.astype('int64')

In [6]:
# Show the merged frame; the rich display elides the middle rows.
df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x0,y0,z0,atom_1,x1,y1,z1,dist,dist_x,dist_y,dist_z
0,0,dsgdb9nsd_000001,1,0,0,84.807600,0,0.002150,-0.006031,0.001976,1,-0.012698,1.085804,0.008001,1.091953,0.014849,1.091835,0.006025
1,1,dsgdb9nsd_000001,1,2,1,-11.257000,0,0.002150,-0.006031,0.001976,0,1.011731,1.463751,0.000277,1.783120,1.009580,1.469782,0.001700
2,2,dsgdb9nsd_000001,1,3,1,-11.254800,0,0.002150,-0.006031,0.001976,0,-0.540815,1.447527,-0.876644,1.783147,0.542965,1.453558,0.878620
3,3,dsgdb9nsd_000001,1,4,1,-11.254300,0,0.002150,-0.006031,0.001976,0,-0.523814,1.437933,0.906397,1.783157,0.525964,1.443964,0.904421
4,4,dsgdb9nsd_000001,2,0,0,84.807400,0,1.011731,1.463751,0.000277,1,-0.012698,1.085804,0.008001,1.091952,1.024429,0.377947,0.007724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4659071,4659071,dsgdb9nsd_133884,17,4,4,3.543450,0,1.126550,-1.348733,-1.933838,1,1.629865,-0.747236,0.235262,2.306538,0.503315,0.601497,2.169100
4659072,4659072,dsgdb9nsd_133884,17,5,6,0.568997,0,1.126550,-1.348733,-1.933838,1,1.415947,0.620773,0.939122,3.495226,0.289397,1.969506,2.872960
4659073,4659073,dsgdb9nsd_133884,17,6,6,1.173370,0,1.126550,-1.348733,-1.933838,1,-0.027076,0.747033,0.478506,3.397424,1.153626,2.095765,2.412345
4659074,4659074,dsgdb9nsd_133884,17,7,4,4.762010,0,1.126550,-1.348733,-1.933838,1,-0.131901,0.356983,-1.010196,2.312202,1.258451,1.705716,0.923642


In [7]:
# Split on molecule names rather than on rows, so every coupling pair of a
# given molecule lands on the same side of the split (20% of names held out).
unique_molecules = df['molecule_name'].unique()
train_molecule_names, test_molecule_names = train_test_split(
    unique_molecules, test_size=0.2, random_state=42
)

in_train = df['molecule_name'].isin(train_molecule_names)
in_test = df['molecule_name'].isin(test_molecule_names)

# Renumber rows from 0 in each subset.
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)

In [8]:
# Distinct molecule count per subset: train first, then test.
for split_df in (train_df, test_df):
    print(split_df['molecule_name'].nunique())

68009
17003


In [10]:
# Write both subsets to CSV in the working directory, without the row index.
for split_df, csv_name in ((train_df, 'train_df.csv'), (test_df, 'test_df.csv')):
    split_df.to_csv(csv_name, index=False)