In [None]:
import numpy as np
import pandas as pd

In [None]:
# Primary tables: each name is bound to the CSV listed at the same position.
train, test, sub, structures = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
        'data/structures.csv',
    )
)
# additional metadata
(
    potential_energy,
    mulliken_charges,
    scalar_coupling_contributions,
    magnetic_shielding_tensors,
    dipole_moments,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/potential_energy.csv',
        'data/mulliken_charges.csv',
        'data/scalar_coupling_contributions.csv',
        'data/magnetic_shielding_tensors.csv',
        'data/dipole_moments.csv',
    )
)

In [None]:
def merge_rename_drop(df_left, df_right, rename_suffix, left_cols, right_cols, to_rename, to_drop):
    """Left-join ``df_right`` onto ``df_left``, then suffix and prune columns.

    Parameters
    ----------
    df_left, df_right : pandas.DataFrame
        Frames to join. A left join is used, so every row of ``df_left``
        is retained.
    rename_suffix : str
        Text appended (after an underscore) to each name in ``to_rename``.
    left_cols, right_cols : list of str
        Join keys, forwarded as ``left_on`` / ``right_on``.
    to_rename : iterable of str
        Columns of the merged frame renamed to ``<name>_<rename_suffix>``.
    to_drop : str or list of str
        Column label(s) removed after the rename. Any falsy value
        (e.g. ``''``) skips the drop step.

    Returns
    -------
    pandas.DataFrame
        A new frame; the inputs are not modified.
    """
    merged = df_left.merge(df_right, how='left', left_on=left_cols, right_on=right_cols)

    # Tag the freshly joined columns so that a later join against the same
    # right-hand table does not collide with them.
    suffixed_names = {name: name + '_' + rename_suffix for name in to_rename}
    merged = merged.rename(columns=suffixed_names)

    if not to_drop:
        return merged
    return merged.drop(to_drop, axis=1)

In [None]:
#merge with structures
# For each atom of the pair, attach the matching structures row (x, y, z, atom).
# The suffix ('0' / '1') identifies which atom_index_* column was used as the
# join key and is appended to the joined column names.
structure_cols = ['x','y','z','atom']
for suffix in ('0', '1'):
    atom_key = ['molecule_name', 'atom_index_' + suffix]
    train = merge_rename_drop(train, structures, suffix, atom_key, ['molecule_name','atom_index'], structure_cols, 'atom_index')
    test = merge_rename_drop(test, structures, suffix, atom_key, ['molecule_name','atom_index'], structure_cols, 'atom_index')

# NOTE(review): this cell previously rewrote 'type' to its first character
# (train['type'].apply(lambda x: x[0])) and then dropped the column on the very
# next line, so the row-by-row apply was wasted work. The dead step is removed
# here and the resulting frames are unchanged. If the intent was to KEEP the
# truncated type as a feature, restore the apply and delete the drops below.
train = train.drop(columns='type')
test = test.drop(columns='type')

train.head()

In [None]:
#join all metadata
# Start from the first four columns of train.
# NOTE(review): this selection is positional, so it depends on train's column
# order at this point (after 'type' was dropped) -- confirm these are the
# identifier / key columns expected by the joins below.
metadata = train.iloc[:, :4]

# Per-pair and per-molecule tables join directly on their shared key columns.
metadata = metadata.merge(scalar_coupling_contributions, how='left', on=['molecule_name','atom_index_0','atom_index_1'])
metadata = metadata.merge(potential_energy, how='left', on=['molecule_name'])

# Per-atom tables are joined twice, once per atom of the pair. The outer loop
# keeps the resulting column order: both mulliken columns, then both tensors.
tensor_cols = ['XX','YX', 'ZX', 'XY','YY', 'ZY', 'XZ', 'YZ', 'ZZ']
atom_keys = (('0', 'atom_index_0'), ('1', 'atom_index_1'))
for per_atom_table, value_cols in (
    (mulliken_charges, ['mulliken_charge']),
    (magnetic_shielding_tensors, tensor_cols),
):
    for suffix, index_col in atom_keys:
        metadata = merge_rename_drop(metadata, per_atom_table, suffix, ['molecule_name', index_col], ['molecule_name','atom_index'], value_cols, 'atom_index')

# Joined on molecule_name alone; the empty to_drop means no column is removed.
metadata = merge_rename_drop(metadata, dipole_moments, 'dpm', ['molecule_name'], ['molecule_name'], ['X','Y', 'Z'], '')
metadata.head()

In [None]:
# Write each assembled frame to CSV, omitting the index column.
for frame, csv_path in (
    (metadata, 'data/metadata.csv'),
    (train, 'data/train-structure.csv'),
    (test, 'data/test-structure.csv'),
):
    frame.to_csv(csv_path, index=False)