In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [98]:
# Pick the data/output directories: relative paths when the current working
# directory contains 'ygong', otherwise the hard-coded absolute paths.
filepath, dir_out = (
    ("../data", "../output")
    if 'ygong' in os.getcwd()
    else ("/home/gong/Documents/Kaggle_July2019/data",
          "/home/gong/Documents/Kaggle_July2019/output")
)

def load_data(filepath):
    """Read the four CSV inputs found in ``filepath`` and report their shapes.

    Reads ``train.csv``, ``test.csv``, ``sample_submission.csv`` and
    ``structures.csv`` (in that order), prints one line per file with its row
    and column count, and returns the loaded frames.

    Parameters
    ----------
    filepath : str
        Directory that contains the CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, submit, structures)``.
    """
    file_names = ('train.csv', 'test.csv', 'sample_submission.csv', 'structures.csv')
    frames = tuple(pd.read_csv(os.path.join(filepath, name)) for name in file_names)

    # Message templates are kept verbatim, including their uneven spacing.
    templates = (
        'Train dataset shape is -> rows: {} cols:{}',
        'Test dataset shape is  -> rows: {} cols:{}',
        'Sample submission dataset shape is  -> rows: {} cols:{}',
        'Structures dataset shape is  -> rows: {} cols:{}',
    )
    for template, frame in zip(templates, frames):
        n_rows, n_cols = frame.shape
        print(template.format(n_rows, n_cols))
    print('\n')

    return frames

# Load the four input tables from the configured data directory.
train, test, submit, structures = load_data(filepath)

Train dataset shape is -> rows: 4658147 cols:6
Test dataset shape is  -> rows: 2505542 cols:5
Sample submission dataset shape is  -> rows: 2505542 cols:2
Structures dataset shape is  -> rows: 2358657 cols:6




In [99]:
# split number of intervening bonds and bond combination
def extract_n_bonds(row):
    """Return the first character of ``row['type']`` converted to ``int``.

    For example, a type of ``'2JHH'`` gives ``2``.
    """
    coupling_type = row['type']
    leading_char = coupling_type[0]
    return int(leading_char)
def extract_type_bond(row):
    """Return ``row['type']`` with its first two characters stripped off.

    For example, a type of ``'1JHC'`` gives ``'HC'``.
    """
    coupling_type = row['type']
    atom_pair = coupling_type[2:]
    return atom_pair
def check_covalent(row):
    """Return 0 when ``row['type']`` contains ``'JHH'``, otherwise 1."""
    return 0 if 'JHH' in row['type'] else 1

def _add_type_features(df):
    """Return a copy of ``df`` with three columns parsed from ``df['type']``.

    Added columns (example values for a type of ``'1JHC'``):
      * ``n_bonds``   - first character as an integer (``1``)
      * ``type_bond`` - everything after the second character (``'HC'``)
      * ``cov``       - 0 when the type contains ``'JHH'``, otherwise 1

    Vectorized ``.str`` operations are used instead of a row-wise
    ``agg(..., axis=1)``, which invokes a Python function once per row and is
    very slow on frames with millions of rows. ``assign`` overwrites columns
    that already exist, so re-running the cell does not append duplicates.
    """
    coupling_type = df['type']
    return df.assign(
        n_bonds=coupling_type.str[0].astype('int64'),
        type_bond=coupling_type.str[2:],
        # regex=False: 'JHH' is matched as a plain substring.
        cov=(~coupling_type.str.contains('JHH', regex=False)).astype('int64'),
    )

train = _add_type_features(train)
test = _add_type_features(test)

In [101]:
# calculate bond length adopted from @Chanran Kim - Kaggle kernel
def map_atom_info(df, atom_idx):
    """Left-join the module-level ``structures`` table onto ``df`` for one atom.

    Rows are matched on ``molecule_name`` and on ``atom_index_{atom_idx}``
    (from ``df``) against ``atom_index`` (from ``structures``). The joined
    ``atom``, ``x``, ``y`` and ``z`` columns are renamed with an
    ``_{atom_idx}`` suffix and the redundant ``atom_index`` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``molecule_name`` and ``atom_index_{atom_idx}`` columns.
    atom_idx : int
        Which atom of the pair to look up (the notebook uses 0 and 1).

    Returns
    -------
    pandas.DataFrame
        A new frame; rows without a match get missing values (left join).
    """
    suffixed = {name: f'{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')}
    merged = df.merge(structures, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
    return merged.drop(columns='atom_index').rename(columns=suffixed)

# Join the structures data for both atoms (index 0 and 1) of every pair.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)
    test = map_atom_info(test, atom_idx)

# Coordinate arrays for the first (_0) and second (_1) atom of each row.
xyz_0 = ['x_0', 'y_0', 'z_0']
xyz_1 = ['x_1', 'y_1', 'z_1']
train_p_0, train_p_1 = train[xyz_0].values, train[xyz_1].values
test_p_0, test_p_1 = test[xyz_0].values, test[xyz_1].values

# Euclidean distance between the two atoms, one value per row.
train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

In [110]:
# Persist the enriched frames. Each path is built once and reused for both
# the write and the log line, so the message always names the file that was
# actually written (previously the message reported a "_0729" filename that
# did not match the file on disk).
os.makedirs(dir_out, exist_ok=True)  # to_csv does not create missing directories

train_out_path = os.path.join(dir_out, "_train.csv")
train.to_csv(train_out_path, index=False)
print("Saved training dataset to {}".format(train_out_path))

test_out_path = os.path.join(dir_out, "_test.csv")
test.to_csv(test_out_path, index=False)
print("Saved test dataset to {}".format(test_out_path))

Saved training dataset to /home/gong/Documents/Kaggle_July2019/output/_train_0729.csv
Saved test dataset to /home/gong/Documents/Kaggle_July2019/output/_test_0729.csv


In [109]:
# Display the first rows of the training frame to inspect the added columns.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,n_bonds,type_bond,cov,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1,HC,1,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.091953
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,2,HH,0,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.78312
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,2,HH,0,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1.783147
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,2,HH,0,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1.783157
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1,HC,1,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.091952
