In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the competition tables from the local `data/` directory.
# `train` and `test` are the frames the features below are built on;
# `sub` is the submission template whose `scalar_coupling_constant`
# column is overwritten with model predictions at the end of the notebook.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sub = pd.read_csv('data/sample_submission.csv')

In [None]:
# Distinct values of the `type` column in the training data.
train['type'].unique()

In [None]:
# Per-atom lookup table. `map_atom_info` joins it on
# (`molecule_name`, `atom_index`) and reads its `atom`, `x`, `y`, `z` columns.
structures = pd.read_csv('data/structures.csv')

def map_atom_info(df, atom_idx):
    """Attach the element and coordinates of one atom of each pair to `df`.

    Looks up, in the module-level `structures` table, the atom referenced by
    the `atom_index_<atom_idx>` column of `df` (within the same
    `molecule_name`) and appends its data as `atom_<atom_idx>`,
    `x_<atom_idx>`, `y_<atom_idx>` and `z_<atom_idx>`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `molecule_name` and `atom_index_<atom_idx>` columns.
    atom_idx : int or str
        Which atom of the pair to resolve; used as the column suffix.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified. The join is a left join, so
        every row of `df` is kept and unmatched rows get NaN in the new columns.
    """
    suffix = str(atom_idx)
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', 'atom_index_' + suffix],
        right_on=['molecule_name', 'atom_index'],
    )
    # The join key from `structures` duplicates `atom_index_<suffix>`; discard
    # it and tag the remaining looked-up columns with the atom suffix.
    suffixed = {name: name + '_' + suffix for name in ('atom', 'x', 'y', 'z')}
    return merged.drop(columns='atom_index').rename(columns=suffixed)

# Resolve both atoms of every pair (suffixes _0 and _1) for train and test.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)
    test = map_atom_info(test, atom_idx)

In [None]:
# Coordinates of the first and second atom of every pair, three columns each.
train_p_0 = train[['x_0', 'y_0', 'z_0']].to_numpy()
train_p_1 = train[['x_1', 'y_1', 'z_1']].to_numpy()
test_p_0 = test[['x_0', 'y_0', 'z_0']].to_numpy()
test_p_1 = test[['x_1', 'y_1', 'z_1']].to_numpy()

for frame, first_atom, second_atom in (
    (train, train_p_0, train_p_1),
    (test, test_p_0, test_p_1),
):
    # Euclidean distance between the two atoms of the pair.
    frame['dist'] = np.linalg.norm(first_atom - second_atom, axis=1)
    # Despite the `dist_` prefix these are *squared* per-axis differences.
    for axis in 'xyz':
        frame['dist_' + axis] = (frame[axis + '_0'] - frame[axis + '_1']) ** 2

In [None]:
# Shapes of train and test after the distance features were added.
print(train.shape, test.shape, sep='\n')

In [None]:
# Ratio of each pair's distance to the mean distance of its group.
# Maps the new feature name to the column the mean is grouped by. Note that
# the `type_0` / `type_1` features are grouped by the atom element columns,
# not by a coupling type.
ratio_features = {
    'dist_to_type_mean': 'type',
    'dist_to_type_0_mean': 'atom_0',
    'dist_to_type_1_mean': 'atom_1',
}

# Group means are computed separately within each frame (test uses its own).
for frame in (train, test):
    for feature_name, group_column in ratio_features.items():
        group_mean = frame.groupby(group_column)['dist'].transform('mean')
        frame[feature_name] = frame['dist'] / group_mean


In [None]:
# Keep only the first character of `type` as `type_0`, then drop the original
# column. The one-hot columns derived from it later are type_1/type_2/type_3.
train = (
    train
    .assign(type_0=train['type'].map(lambda label: label[0]))
    .drop(columns='type')
)
test = (
    test
    .assign(type_0=test['type'].map(lambda label: label[0]))
    .drop(columns='type')
)

In [None]:
# Preview the training frame with the engineered features.
train.head()

### Basic Neural Network implementation

In [None]:
import tensorflow as tf

In [None]:
import os
from tensorflow.python.client import device_lib
# Print the devices TensorFlow reports as locally available.
print(device_lib.list_local_devices())

# Optional: uncomment to set CUDA_VISIBLE_DEVICES before TensorFlow
# initialises (presumably to pin training to the first GPU; verify on the target machine).
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# One-hot encode the categorical columns. Each source column is replaced by
# indicator columns named `<prefix>_<value>` appended at the end of the frame.
# `type_0` uses the shorter prefix `type` (giving type_1, type_2, ...).
dummy_prefixes = {'atom_0': 'atom_0', 'atom_1': 'atom_1', 'type_0': 'type'}

# NOTE(review): train and test are encoded independently, so a category that
# is missing from one frame yields a different set of indicator columns.
train = pd.get_dummies(train, columns=list(dummy_prefixes), prefix=dummy_prefixes)
test = pd.get_dummies(test, columns=list(dummy_prefixes), prefix=dummy_prefixes)

In [None]:
# Shapes and column names after one-hot encoding, followed by a preview.
print(train.shape, test.shape, train.columns, sep='\n')
train.head()

In [None]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

# The 20 model input columns. This list is also reused further down to select
# the network's features, so its length must match the model's input size.
columns_to_normalize = [ 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1',
       'dist', 'dist_x', 'dist_y', 'dist_z', 'dist_to_type_mean',
       'dist_to_type_0_mean', 'dist_to_type_1_mean', 'atom_0_H', 'atom_1_C',
       'atom_1_H', 'atom_1_N', 'type_1', 'type_2', 'type_3']

# Learn each column's min/max from the training data and scale it.
train[columns_to_normalize] = min_max_scaler.fit_transform(train[columns_to_normalize])

# Scale test with the statistics learned from train. Re-fitting on test would
# map the same raw value to different scaled values in the two frames, so the
# model would be evaluated on inputs it was never trained to interpret.
test[columns_to_normalize] = min_max_scaler.transform(test[columns_to_normalize])



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation (fixed seed for a
# repeatable split). `train` is rebound to the remaining 75%.
train, test_internal = train_test_split(train, test_size=0.25, random_state=42)

target_column = 'scalar_coupling_constant'

# Features / target for fitting ...
x_train, y_train = train[columns_to_normalize], train[target_column]
# ... and for validation.
x_test, y_test = test_internal[columns_to_normalize], test_internal[target_column]

x_train.head()


In [None]:
# Report, for each split in turn, whether it contains any missing value.
for split in (x_train, x_test, y_train, y_test):
    print(split.isnull().values.any())
print(y_train.head())

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once logged accuracy exceeds 0.9.

    The accuracy is read from the epoch logs under the key 'acc', falling back
    to 'accuracy' (the name used by newer Keras releases). If neither key is
    present the epoch is skipped instead of raising a TypeError on a
    `None > 0.9` comparison.

    NOTE(review): the model in this notebook is a regressor, so accuracy is
    unlikely to be a meaningful stopping criterion; confirm the intent.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Request early stopping when this epoch's accuracy is above 0.9.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metric values reported by Keras for this epoch.
        """
        # Avoid a shared mutable default argument.
        logs = logs or {}
        accuracy = logs.get('acc', logs.get('accuracy'))
        if accuracy is not None and accuracy > 0.9:
            print("\nReached 90% accuracy so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()

# Fully connected regression network built layer by layer. The input width
# (20) matches the 20 entries of `columns_to_normalize`; the three widest
# hidden layers are each followed by 20% dropout; the single linear output
# unit predicts the scalar coupling constant.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(20, input_shape=(20,), activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(.2))
model.add(tf.keras.layers.Dense(30, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(.2))
model.add(tf.keras.layers.Dense(50, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(.2))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(5, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.linear))

# Optimise mean absolute error with Adam; accuracy is tracked as well because
# the early-stopping callback reads it.
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['accuracy', 'mean_absolute_error'],
)

# Train for up to 20 epochs, validating on the held-out split each epoch.
history = model.fit(
    x=x_train.values,
    y=y_train.values,
    epochs=20,
    validation_data=(x_test.values, y_test.values),
    callbacks=[callbacks],
)

In [None]:
# Print the layer-by-layer summary of the network built above.
model.summary()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Per-epoch training and validation loss recorded by `model.fit`.
# The accuracy series are deliberately not read here: they were never plotted,
# and their key name ('acc' vs 'accuracy') differs between Keras versions,
# which made this cell fail with a KeyError.
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(loss))

plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (mean absolute error)')
plt.legend(loc=0)

plt.show()

In [None]:
# Predict on the competition test frame, selecting the same feature columns
# (in the same order) that the model was trained on.
predictions = model.predict(test[columns_to_normalize])

In [None]:
# `model.predict` returns one column per output unit, i.e. shape (n_rows, 1)
# for this single-output network; flatten it to 1-D so the assignment does not
# depend on pandas' handling of 2-D values.
sub['scalar_coupling_constant'] = predictions.ravel()

# Preview the filled-in submission (must be the last expression to display).
sub.head()

In [None]:
# Write the submission file to the working directory without the index column.
sub.to_csv('submission.csv', index=False)