In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras.layers import Dense, Input, Activation, BatchNormalization, Add
from keras.optimizers import Adam
from keras.models import Model

from keras import backend as K
from tqdm import tqdm_notebook as tqdm

pd.set_option('display.max_colwidth', -1)


### Importing
Import all of the files as dataframe
In the test set we will be given a molecule name, the indices of the two atoms, and the scalar coupling constant which we are trying to predict

note that the four extra files are NOT given with the testing data so we should be able to extrapolate how to calculate it from the training data

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_struct = pd.read_csv('structures.csv')

df_potential = pd.read_csv('potential_energy.csv')
df_dipole = pd.read_csv('dipole_moments.csv')
df_mulliken = pd.read_csv('mulliken_charges.csv')
df_magnetic = pd.read_csv('magnetic_shielding_tensors.csv')

df_scc = pd.read_csv('scalar_coupling_contributions.csv')

In [9]:
# not my function

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)
df_struct = reduce_mem_usage(df_struct)
df_mulliken = reduce_mem_usage(df_mulliken)
df_magnetic = reduce_mem_usage(df_magnetic)

Mem. usage decreased to 124.39 Mb (0.0% reduction)
Mem. usage decreased to 62.13 Mb (0.0% reduction)
Mem. usage decreased to 51.74 Mb (0.0% reduction)
Mem. usage decreased to 16.09 Mb (0.0% reduction)
Mem. usage decreased to 39.49 Mb (0.0% reduction)


In [5]:
df_train.head(11)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8125
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8125


All of the atoms are either H, C or N so electronegativities could be useful

In [11]:
electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

def add_electronegativity(df):
    df['en_0'] = df['type'].map(lambda x: electronegativity[str(x)[2]])
    df['en_1'] = df['type'].map(lambda x: electronegativity[str(x)[3]])
    # could add the difference in electronegativites because larger diff -> stronger bond but some 
    # x-x interactions are also strong

add_electronegativity(df_train)
add_electronegativity(df_test)

In [12]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)
df_struct = reduce_mem_usage(df_struct)
df_mulliken = reduce_mem_usage(df_mulliken)
df_magnetic = reduce_mem_usage(df_magnetic)

df_train.head(10)

Mem. usage decreased to 124.39 Mb (30.0% reduction)
Mem. usage decreased to 62.13 Mb (31.6% reduction)
Mem. usage decreased to 51.74 Mb (0.0% reduction)
Mem. usage decreased to 16.09 Mb (0.0% reduction)
Mem. usage decreased to 39.49 Mb (0.0% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,en_0,en_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,2.199219,2.550781
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,2.199219,2.199219
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,2.199219,2.199219
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,2.199219,2.199219
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,2.199219,2.550781
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,2.199219,2.199219
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,2.199219,2.199219
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,2.199219,2.550781
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,2.199219,2.199219
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,2.199219,2.550781


In [7]:
# now we have the electronegativities