In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import math
import gc
import copy

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor

In [2]:
ATOMIC_NUMBERS = {
    'H': 1,
    'C': 6,
    'N': 7,
    'O': 8,
    'F': 9
}

In [3]:
# pd.set_option('display.max_colwidth', -1)
# pd.set_option('display.max_rows', 120)
# pd.set_option('display.max_columns', 120)

In [4]:
# train_dtypes = {
#     'molecule_name': 'category',
#     'atom_index_0': 'int8',
#     'atom_index_1': 'int8',
#     'type': 'category',
#     'scalar_coupling_constant': 'float32'
# }
train_csv = pd.read_csv('train.csv', index_col='id')
train_csv['molecule_index'] = train_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
train_csv = train_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']]

  mask |= (ar1 == a)


In [5]:
train_csv.head()

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,0,1JHC,84.8076
1,1,1,2,2JHH,-11.257
2,1,1,3,2JHH,-11.2548
3,1,1,4,2JHH,-11.2543
4,1,2,0,1JHC,84.8074


In [6]:
print('Shape: ', train_csv.shape)
print('Total: ', train_csv.memory_usage().sum())
train_csv.memory_usage()

Shape:  (4658147, 5)
Total:  204958468


Index                       37265176
molecule_index              18632588
atom_index_0                37265176
atom_index_1                37265176
type                        37265176
scalar_coupling_constant    37265176
dtype: int64

In [7]:
submission_csv = pd.read_csv('sample_submission.csv', index_col='id')

In [8]:
test_csv = pd.read_csv('test.csv', index_col='id')
test_csv['molecule_index'] = test_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
test_csv = test_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type']]

In [9]:
test_csv.head()

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4658147,4,2,0,2JHC
4658148,4,2,1,1JHC
4658149,4,2,3,3JHH
4658150,4,3,0,1JHC
4658151,4,3,1,2JHC


In [10]:
# structures_dtypes = {
#     'molecule_name': 'category',
#     'atom_index': 'int8',
#     'atom': 'category',
#     'x': 'float32',
#     'y': 'float32',
#     'z': 'float32'
# }
structures_csv = pd.read_csv('structures.csv')
structures_csv['molecule_index'] = structures_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
structures_csv = structures_csv[['molecule_index', 'atom_index', 'atom', 'x', 'y', 'z']]

In [11]:
structures_csv['atom'] = structures_csv['atom'].replace(ATOMIC_NUMBERS).astype('int8')
structures_csv.head(10)

Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.00215,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397
5,2,0,7,-0.040426,1.024108,0.062564
6,2,1,1,0.017257,0.012545,-0.027377
7,2,2,1,0.915789,1.358745,-0.028758
8,2,3,1,-0.520278,1.343532,-0.775543
9,3,0,8,-0.03436,0.97754,0.007602


In [12]:
print('Shape: ', structures_csv.shape)
print('Total: ', structures_csv.memory_usage().sum())
structures_csv.memory_usage()

Shape:  (2358657, 6)
Total:  87270389


Index                   80
molecule_index     9434628
atom_index        18869256
atom               2358657
x                 18869256
y                 18869256
z                 18869256
dtype: int64

In [13]:
def build_type_dataframes(base, structures, coupling_type):
    base = base[base['type'] == coupling_type].drop('type', axis=1).copy()
    base = base.reset_index()
    base['id'] = base['id'].astype('int32')
    structures = structures[structures['molecule_index'].isin(base['molecule_index'])]
    return base, structures

In [14]:
def add_coordinates(base, structures, index):
    df = pd.merge(base, structures, how='inner',
                  left_on=['molecule_index', f'atom_index_{index}'],
                  right_on=['molecule_index', 'atom_index']).drop(['atom_index'], axis=1)
    df = df.rename(columns={
        'atom': f'atom_{index}',
        'x': f'x_{index}',
        'y': f'y_{index}',
        'z': f'z_{index}'
    })
    return df

In [15]:
def add_atoms(base, atoms):
    df = pd.merge(base, atoms, how='inner',
                  on=['molecule_index', 'atom_index_0', 'atom_index_1'])
    return df

In [16]:
def merge_all_atoms(base, structures):
    df = pd.merge(base, structures, how='left',
                  left_on=['molecule_index'],
                  right_on=['molecule_index'])
    df = df[(df.atom_index_0 != df.atom_index) & (df.atom_index_1 != df.atom_index)]
    return df

In [17]:
def add_center(df):
    df['x_c'] = ((df['x_1'] + df['x_0']) * np.float32(0.5))
    df['y_c'] = ((df['y_1'] + df['y_0']) * np.float32(0.5))
    df['z_c'] = ((df['z_1'] + df['z_0']) * np.float32(0.5))

def add_distance_to_center(df):
    df['d_c'] = ((
        (df['x_c'] - df['x'])**np.float32(2) +
        (df['y_c'] - df['y'])**np.float32(2) + 
        (df['z_c'] - df['z'])**np.float32(2)
    )**np.float32(0.5))

def add_distance_between(df, suffix1, suffix2):
    df[f'd_{suffix1}_{suffix2}'] = ((
        (df[f'x_{suffix1}'] - df[f'x_{suffix2}'])**np.float32(2) +
        (df[f'y_{suffix1}'] - df[f'y_{suffix2}'])**np.float32(2) + 
        (df[f'z_{suffix1}'] - df[f'z_{suffix2}'])**np.float32(2)
    )**np.float32(0.5))

In [18]:
def add_distances(df):
    n_atoms = 1 + max([int(c.split('_')[1]) for c in df.columns if c.startswith('x_')])
    
    for i in range(1, n_atoms):
        for vi in range(min(4, i)):
            add_distance_between(df, i, vi)

In [19]:
def add_n_atoms(base, structures):
    dfs = structures['molecule_index'].value_counts().rename('n_atoms').to_frame()
    return pd.merge(base, dfs, left_on='molecule_index', right_index=True)

In [20]:
def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    base = add_coordinates(base, structures, 0)
    base = add_coordinates(base, structures, 1)
    
    base = base.drop(['atom_0', 'atom_1'], axis=1)
    atoms = base.drop('id', axis=1).copy()
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(['scalar_coupling_constant'], axis=1)
        
    add_center(atoms)
    atoms = atoms.drop(['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1)

    atoms = merge_all_atoms(atoms, structures)
    
    add_distance_to_center(atoms)
    
    atoms = atoms.drop(['x_c', 'y_c', 'z_c', 'atom_index'], axis=1)
    atoms.sort_values(['molecule_index', 'atom_index_0', 'atom_index_1', 'd_c'], inplace=True)
    atom_groups = atoms.groupby(['molecule_index', 'atom_index_0', 'atom_index_1'])
    atoms['num'] = atom_groups.cumcount() + 2
    atoms = atoms.drop(['d_c'], axis=1)
    atoms = atoms[atoms['num'] < n_atoms]

    atoms = atoms.set_index(['molecule_index', 'atom_index_0', 'atom_index_1', 'num']).unstack()
    atoms.columns = [f'{col[0]}_{col[1]}' for col in atoms.columns]
    atoms = atoms.reset_index()
    
    # downcast back to int8
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')
            
    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')
    
    full = add_atoms(base, atoms)
    add_distances(full)
    
    full.sort_values('id', inplace=True)
    
    return full

In [21]:
def take_n_atoms(df, n_atoms, four_start=4):
    labels = []
    for i in range(2, n_atoms):
        label = f'atom_{i}'
        labels.append(label)

    for i in range(n_atoms):
        num = min(i, 4) if i < four_start else 4
        for j in range(num):
            labels.append(f'd_{i}_{j}')
    if 'scalar_coupling_constant' in df:
        labels.append('scalar_coupling_constant')
    return df[labels]

In [22]:
%%time
full = build_couple_dataframe(train_csv, structures_csv, '1JHN', n_atoms=10)
print(full.shape)

(43363, 73)
Wall time: 3.18 s


In [23]:
full.columns

Index(['id', 'molecule_index', 'atom_index_0', 'atom_index_1',
       'scalar_coupling_constant', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1',
       'atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'y_2',
       'y_3', 'y_4', 'y_5', 'y_6', 'y_7', 'y_8', 'y_9', 'z_2', 'z_3', 'z_4',
       'z_5', 'z_6', 'z_7', 'z_8', 'z_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0',
       'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1',
       'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1',
       'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1',
       'd_9_2', 'd_9_3'],
      dtype='object')

In [24]:
df = take_n_atoms(full, 7)
# LightGBM performs better with 0-s then with NaN-s
df = df.fillna(0)
df.columns

Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant'],
      dtype='object')

In [25]:
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
import itertools
from scipy import interp
from bayes_opt import BayesianOptimization
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve, mean_absolute_error

warnings.filterwarnings("ignore")

In [26]:
# X_data = df.drop(['scalar_coupling_constant'], axis=1)
# y_data = df['scalar_coupling_constant']
# X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=128)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [27]:
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'num_leaves': 128,
    'min_child_samples': 79,
    'max_depth': 9,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0
}

In [28]:
def build_x_y_data(some_csv, coupling_type, n_atoms):
    full = build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=n_atoms)
    
    df = take_n_atoms(full, n_atoms)
    df = df.fillna(0)
    print(df.columns)
    
    if 'scalar_coupling_constant' in df:
        X_data = df.drop(['scalar_coupling_constant'], axis=1)
        y_data = df['scalar_coupling_constant']
    else:
        X_data = df
        y_data = None
    
    return X_data, y_data

In [34]:
X_data, y_data = build_x_y_data(train_csv, '3JHN', 10)

Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3',
       'scalar_coupling_constant'],
      dtype='object')


In [35]:
def LGB_bayesian(
    learning_rate,
    max_depth,
    num_leaves, 
    bagging_fraction,
    feature_fraction,
    min_data_in_leaf,
    reg_alpha,
    reg_lambda
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    
    columns = X_data.columns
    train_index = range(int(len(X_data)*0.7))
    valid_index = range(int(len(X_data)*0.7), len(X_data))
    
    X_train, X_valid = X_data[columns].iloc[train_index], X_data[columns].iloc[valid_index]
    y_train, y_valid = y_data.iloc[train_index], y_data.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
              'learning_rate' : learning_rate,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'objective': 'regression',
              'save_binary': True,
              'seed': 1337,
              'feature_fraction_seed': 1337,
              'bagging_seed': 1337,
              'drop_seed': 1337,
              'data_random_seed': 1337,
              'boosting_type': 'gbdt',
              'verbose': 1,
              'is_unbalance': False,
              'boost_from_average': True,
              'metric':'mae'}    
    
    
    model = LGBMRegressor(**param, n_estimators=10000, n_jobs = -1)
    model.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
        verbose=1000, early_stopping_rounds=100)    
    
    y_pred = model.predict(X_valid)
    score = np.log(mean_absolute_error(y_valid, y_pred))

    return score

In [36]:
bounds_LGB = {
    'num_leaves': (31, 300), 
    'min_data_in_leaf': (20, 200),
    'max_depth': (5, 50),
    'bagging_fraction' : (0.1, 0.9),
    'feature_fraction' : (0.1, 0.9),
    'learning_rate': (0.01, 0.2),  
    'reg_alpha': (0, 1), 
    'reg_lambda': (0, 1)
}

In [37]:
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [38]:
init_points = 10
n_iter = 15

In [39]:
%%time

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)
    

# |  6        | -0.8103   |  0.3519   |  0.4793   |  0.01981  |  19.78    |  31.09    |  88.22    
# |  0.7319   |  0.02046  |

# |  4        | -0.398    |  0.2817   |  0.8268   |  0.0313   |  37.53    |  58.55    |  215.7    
# |  0.6199   |  0.06773  |

# |  1        | -1.96     |  0.3144   |  0.6463   |  0.0338   |  29.72    |  34.21    |  275.5    
# |  0.738    |  0.4629   |

# |  2        | -1.893    |  0.351    |  0.3879   |  0.04198  |  12.06    |  105.4    |  200.7    
# |  0.8291   |  0.4697   |

# |  10       | -1.372    |  0.1386   |  0.7393   |  0.1277   |  10.49    |  25.28    |  148.2    
# |  0.4687   |  0.1326   |

# |  9        | -1.973    |  0.1856   |  0.6099   |  0.06415  |  28.39    |  163.8    |  115.7    
# |  0.8427   |  0.3105   |

# |  6        | -1.308    |  0.2464   |  0.5808   |  0.1378   |  17.85    |  155.7    |  240.5    
# |  0.07945  |  0.439    |

# |  4        | -2.159    |  0.4323   |  0.5897   |  0.01962  |  34.39    |  114.0    |  184.0    
# |  0.2624   |  0.3179   |

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.0838097	valid_1's l1: 0.153671
[2000]	training's l1: 0.056495	valid_1's l1: 0.142359
[3000]	training's l1: 0.0434112	valid_1's l1: 0.137992
[4000]	training's l1: 0.0359163	valid_1's l1: 0.13581
[5000]	training's l1: 0.031184	valid_1's l1: 0.134506
[6000]	training's l1: 0.0279322	valid_1's l1: 0.133651
[7000]	training's l1: 0.0256865	valid_1's l1: 0.133075
[8000]	training's l1: 0.0240906	valid_1's l1: 0.132715
Early stopping, best iteration is:
[8123]	training's l1: 0.0239607	valid_1's l1: 0.132678
| [0m 1       [0m | [0m-2.02    [0m | [0m 0.4357  [0m | [0m 0.4188  [0m | [0m 0.1482  [0m | [0m 7.572   [0m | [0m 32.75   [0m | [0m 90.34   [0m | [0m

[10000]	training's l1: 0.0271852	valid_1's l1: 0.120041
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.0271852	valid_1's l1: 0.120041
| [0m 10      [0m | [0m-2.12    [0m | [0m 0.58    [0m | [0m 0.8788  [0m | [0m 0.05806 [0m | [0m 33.03   [0m | [0m 153.8   [0m | [0m 91.89   [0m | [0m 0.7389  [0m | [0m 0.1649  [0m |
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.0260152	valid_1's l1: 0.135876
[2000]	training's l1: 0.0125312	valid_1's l1: 0.133288
[3000]	training's l1: 0.00744442	valid_1's l1: 0.132612
[4000]	training's l1: 0.00484919	valid_1's l1: 0.132314
[5000]	training's l1: 0.00337789	valid_1's l1: 0.132183
[6000]	training's l1: 0.00247676	valid_1's l1: 0.132116
[7000]	training's l1: 0.00187435	valid_1's l1: 0.132068
[8000]	training's l1: 0.00145231	valid_1's l1: 0.132038
[9000]	training's l1: 0.00115476	valid_1's l1: 0.13202
Early stopping, best iteration is:
[9628]	training's l1: 0.00100845	valid_

KeyboardInterrupt: 

In [38]:
LGB_BO.max['target']

AttributeError: 'BayesianOptimization' object has no attribute 'min'

In [37]:
LGB_BO.max['params']

{'bagging_fraction': 0.1,
 'feature_fraction': 0.1,
 'learning_rate': 0.2,
 'max_depth': 50.0,
 'min_data_in_leaf': 20.0,
 'num_leaves': 300.0,
 'reg_alpha': 1.0,
 'reg_lambda': 1.0}

In [28]:
def train_and_predict_for_one_coupling_type(coupling_type, submission, n_atoms, n_folds=5, n_splits=5, random_state=128):
    print(f'*** Training Model for {coupling_type} ***')
    
    X_data, y_data = build_x_y_data(train_csv, coupling_type, n_atoms)
    X_test, _ = build_x_y_data(test_csv, coupling_type, n_atoms)
    y_pred = np.zeros(X_test.shape[0], dtype='float32')

    cv_score = 0
    
    if n_folds > n_splits:
        n_splits = n_folds
    
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, val_index) in enumerate(kfold.split(X_data, y_data)):
        if fold >= n_folds:
            break

        X_train, X_val = X_data[train_index], X_data[val_index]
        y_train, y_val = y_data[train_index], y_data[val_index]

        model = LGBMRegressor(**LGB_PARAMS, n_estimators=5000, n_jobs = -1)
        model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
            verbose=500, early_stopping_rounds=200)

        y_val_pred = model.predict(X_val)
        val_score = np.log(mean_absolute_error(y_val, y_val_pred))
        print(f'{coupling_type} Fold {fold}, logMAE: {val_score}')
        
        cv_score += val_score / n_folds
        y_pred += model.predict(X_test) / n_folds
        
        
    submission.loc[test_csv['type'] == coupling_type, 'scalar_coupling_constant'] = y_pred
    return cv_score

In [29]:
%%time

model_params = {
    '1JHN': 7,
    '1JHC': 10,
    '2JHH': 9,
    '2JHN': 9,
    '2JHC': 9,
    '3JHH': 9,
    '3JHC': 10,
    '3JHN': 10
}
N_FOLDS = 5
submission = submission_csv.copy()

cv_scores = {}
for coupling_type in model_params.keys():
    cv_score = train_and_predict_for_one_coupling_type(
        coupling_type, submission, n_atoms=model_params[coupling_type], n_folds=N_FOLDS)
    cv_scores[coupling_type] = cv_score

*** Training Model for 1JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3'],
      dtype='object')
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 0.432629	valid_1's l1: 0.538931
[200]	training's l1: 0.356809	valid_1's l1: 0.483761
[300]	training's l1: 0.309424	valid_1's l1: 0.455831
[400]	training's l1: 0.274702	valid_1's l1: 0.438134
[500]	training's l1: 0.248383	valid_1's l1: 0.428061
[600]	training's l1: 0.226356	valid_1's l1: 0.420704
[700]	training's l1: 0.208145	valid_1's 

[3300]	training's l1: 0.0506805	valid_1's l1: 0.370419
[3400]	training's l1: 0.0486396	valid_1's l1: 0.370093
[3500]	training's l1: 0.0467844	valid_1's l1: 0.369851
[3600]	training's l1: 0.0450308	valid_1's l1: 0.369605
[3700]	training's l1: 0.0433698	valid_1's l1: 0.369429
[3800]	training's l1: 0.0416962	valid_1's l1: 0.369184
[3900]	training's l1: 0.040164	valid_1's l1: 0.369051
[4000]	training's l1: 0.0386366	valid_1's l1: 0.368953
[4100]	training's l1: 0.0372103	valid_1's l1: 0.368725
[4200]	training's l1: 0.0358809	valid_1's l1: 0.368624
[4300]	training's l1: 0.0346416	valid_1's l1: 0.368522
[4400]	training's l1: 0.0335301	valid_1's l1: 0.368417
[4500]	training's l1: 0.0323609	valid_1's l1: 0.36832
[4600]	training's l1: 0.0312225	valid_1's l1: 0.368207
[4700]	training's l1: 0.0302087	valid_1's l1: 0.368097
[4800]	training's l1: 0.0291945	valid_1's l1: 0.368047
[4900]	training's l1: 0.0282399	valid_1's l1: 0.36796
[5000]	training's l1: 0.0273016	valid_1's l1: 0.367883
Did not meet 

[700]	training's l1: 0.690522	valid_1's l1: 0.870537
[800]	training's l1: 0.660323	valid_1's l1: 0.853856
[900]	training's l1: 0.632369	valid_1's l1: 0.839482
[1000]	training's l1: 0.607584	valid_1's l1: 0.82667
[1100]	training's l1: 0.584782	valid_1's l1: 0.815541
[1200]	training's l1: 0.564551	valid_1's l1: 0.805956
[1300]	training's l1: 0.545842	valid_1's l1: 0.797774
[1400]	training's l1: 0.528954	valid_1's l1: 0.790408
[1500]	training's l1: 0.512675	valid_1's l1: 0.783406
[1600]	training's l1: 0.497838	valid_1's l1: 0.777273
[1700]	training's l1: 0.483477	valid_1's l1: 0.771245
[1800]	training's l1: 0.470035	valid_1's l1: 0.766076
[1900]	training's l1: 0.457547	valid_1's l1: 0.761017
[2000]	training's l1: 0.445504	valid_1's l1: 0.756398
[2100]	training's l1: 0.434241	valid_1's l1: 0.75237
[2200]	training's l1: 0.423785	valid_1's l1: 0.748566
[2300]	training's l1: 0.413536	valid_1's l1: 0.745096
[2400]	training's l1: 0.404064	valid_1's l1: 0.741848
[2500]	training's l1: 0.394627	va

1JHC Fold 2, logMAE: -0.3723167211137031
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 1.16226	valid_1's l1: 1.21588
[200]	training's l1: 0.986705	valid_1's l1: 1.06847
[300]	training's l1: 0.889198	valid_1's l1: 0.994239
[400]	training's l1: 0.819731	valid_1's l1: 0.944746
[500]	training's l1: 0.767442	valid_1's l1: 0.909807
[600]	training's l1: 0.724909	valid_1's l1: 0.883306
[700]	training's l1: 0.688822	valid_1's l1: 0.862378
[800]	training's l1: 0.65822	valid_1's l1: 0.845859
[900]	training's l1: 0.630896	valid_1's l1: 0.831494
[1000]	training's l1: 0.60608	valid_1's l1: 0.819381
[1100]	training's l1: 0.583188	valid_1's l1: 0.807271
[1200]	training's l1: 0.563212	valid_1's l1: 0.797898
[1300]	training's l1: 0.544408	valid_1's l1: 0.789176
[1400]	training's l1: 0.526866	valid_1's l1: 0.781146
[1500]	training's l1: 0.511231	valid_1's l1: 0.774655
[1600]	training's l1: 0.496281	valid_1's l1: 0.768115
[1700]	training's l1: 0.482257	valid_1's l1: 0

[3000]	training's l1: 0.0557626	valid_1's l1: 0.169209
[3100]	training's l1: 0.0542167	valid_1's l1: 0.168819
[3200]	training's l1: 0.0527653	valid_1's l1: 0.168466
[3300]	training's l1: 0.0513104	valid_1's l1: 0.168096
[3400]	training's l1: 0.0499498	valid_1's l1: 0.167742
[3500]	training's l1: 0.0486077	valid_1's l1: 0.167416
[3600]	training's l1: 0.047389	valid_1's l1: 0.167148
[3700]	training's l1: 0.0462475	valid_1's l1: 0.166904
[3800]	training's l1: 0.0450676	valid_1's l1: 0.166627
[3900]	training's l1: 0.0439345	valid_1's l1: 0.166366
[4000]	training's l1: 0.0428825	valid_1's l1: 0.16617
[4100]	training's l1: 0.0418689	valid_1's l1: 0.165989
[4200]	training's l1: 0.0408746	valid_1's l1: 0.165793
[4300]	training's l1: 0.0399512	valid_1's l1: 0.165596
[4400]	training's l1: 0.0390402	valid_1's l1: 0.165415
[4500]	training's l1: 0.0381374	valid_1's l1: 0.165255
[4600]	training's l1: 0.0373097	valid_1's l1: 0.165095
[4700]	training's l1: 0.0364835	valid_1's l1: 0.164942
[4800]	train

[2000]	training's l1: 0.076755	valid_1's l1: 0.174791
[2100]	training's l1: 0.0742586	valid_1's l1: 0.173994
[2200]	training's l1: 0.071765	valid_1's l1: 0.173293
[2300]	training's l1: 0.0696039	valid_1's l1: 0.1727
[2400]	training's l1: 0.0672701	valid_1's l1: 0.171974
[2500]	training's l1: 0.065189	valid_1's l1: 0.171335
[2600]	training's l1: 0.0631958	valid_1's l1: 0.170858
[2700]	training's l1: 0.0613191	valid_1's l1: 0.170342
[2800]	training's l1: 0.0595724	valid_1's l1: 0.169866
[2900]	training's l1: 0.0577665	valid_1's l1: 0.169395
[3000]	training's l1: 0.0562001	valid_1's l1: 0.169022
[3100]	training's l1: 0.0546483	valid_1's l1: 0.168624
[3200]	training's l1: 0.0531678	valid_1's l1: 0.168286
[3300]	training's l1: 0.0517576	valid_1's l1: 0.168018
[3400]	training's l1: 0.0503614	valid_1's l1: 0.167658
[3500]	training's l1: 0.0490393	valid_1's l1: 0.167353
[3600]	training's l1: 0.0477945	valid_1's l1: 0.167041
[3700]	training's l1: 0.0465998	valid_1's l1: 0.166754
[3800]	training

[4900]	training's l1: 0.0147121	valid_1's l1: 0.136084
[5000]	training's l1: 0.0143241	valid_1's l1: 0.136031
Did not meet early stopping. Best iteration is:
[5000]	training's l1: 0.0143241	valid_1's l1: 0.136031
2JHN Fold 0, logMAE: -1.9948713361554775
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 0.186716	valid_1's l1: 0.217876
[200]	training's l1: 0.148258	valid_1's l1: 0.191736
[300]	training's l1: 0.129454	valid_1's l1: 0.179715
[400]	training's l1: 0.115217	valid_1's l1: 0.171346
[500]	training's l1: 0.103975	valid_1's l1: 0.165695
[600]	training's l1: 0.0954156	valid_1's l1: 0.161839
[700]	training's l1: 0.0880363	valid_1's l1: 0.158628
[800]	training's l1: 0.0816495	valid_1's l1: 0.156021
[900]	training's l1: 0.0764297	valid_1's l1: 0.153958
[1000]	training's l1: 0.0716494	valid_1's l1: 0.152142
[1100]	training's l1: 0.0673582	valid_1's l1: 0.15068
[1200]	training's l1: 0.0636636	valid_1's l1: 0.149373
[1300]	training's l1: 0.0601437	valid_

[3800]	training's l1: 0.0206016	valid_1's l1: 0.133504
[3900]	training's l1: 0.0199282	valid_1's l1: 0.133364
[4000]	training's l1: 0.0193016	valid_1's l1: 0.133263
[4100]	training's l1: 0.01871	valid_1's l1: 0.133139
[4200]	training's l1: 0.0181352	valid_1's l1: 0.133026
[4300]	training's l1: 0.0175781	valid_1's l1: 0.132919
[4400]	training's l1: 0.0170585	valid_1's l1: 0.132826
[4500]	training's l1: 0.0165641	valid_1's l1: 0.132751
[4600]	training's l1: 0.0160881	valid_1's l1: 0.132659
[4700]	training's l1: 0.0156412	valid_1's l1: 0.132586
[4800]	training's l1: 0.0152074	valid_1's l1: 0.132501
[4900]	training's l1: 0.0147776	valid_1's l1: 0.132427
[5000]	training's l1: 0.0143843	valid_1's l1: 0.132366
Did not meet early stopping. Best iteration is:
[5000]	training's l1: 0.0143843	valid_1's l1: 0.132366
2JHN Fold 3, logMAE: -2.0221873761656917
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 0.186339	valid_1's l1: 0.217049
[200]	training's l1: 0.1510

[1400]	training's l1: 0.240027	valid_1's l1: 0.311649
[1500]	training's l1: 0.233843	valid_1's l1: 0.30813
[1600]	training's l1: 0.227974	valid_1's l1: 0.304944
[1700]	training's l1: 0.222545	valid_1's l1: 0.30212
[1800]	training's l1: 0.217465	valid_1's l1: 0.299255
[1900]	training's l1: 0.212733	valid_1's l1: 0.296735
[2000]	training's l1: 0.208321	valid_1's l1: 0.294551
[2100]	training's l1: 0.204062	valid_1's l1: 0.292455
[2200]	training's l1: 0.199967	valid_1's l1: 0.290466
[2300]	training's l1: 0.196147	valid_1's l1: 0.288597
[2400]	training's l1: 0.192331	valid_1's l1: 0.286733
[2500]	training's l1: 0.188784	valid_1's l1: 0.285055
[2600]	training's l1: 0.185497	valid_1's l1: 0.28364
[2700]	training's l1: 0.182182	valid_1's l1: 0.28217
[2800]	training's l1: 0.179014	valid_1's l1: 0.280762
[2900]	training's l1: 0.17601	valid_1's l1: 0.27943
[3000]	training's l1: 0.173047	valid_1's l1: 0.278114
[3100]	training's l1: 0.170174	valid_1's l1: 0.276812
[3200]	training's l1: 0.167489	val

[600]	training's l1: 0.31577	valid_1's l1: 0.361687
[700]	training's l1: 0.30147	valid_1's l1: 0.351665
[800]	training's l1: 0.289183	valid_1's l1: 0.343184
[900]	training's l1: 0.278539	valid_1's l1: 0.336136
[1000]	training's l1: 0.269016	valid_1's l1: 0.32997
[1100]	training's l1: 0.260507	valid_1's l1: 0.32465
[1200]	training's l1: 0.252981	valid_1's l1: 0.320063
[1300]	training's l1: 0.245853	valid_1's l1: 0.315685
[1400]	training's l1: 0.239107	valid_1's l1: 0.311555
[1500]	training's l1: 0.232968	valid_1's l1: 0.307997
[1600]	training's l1: 0.22717	valid_1's l1: 0.304812
[1700]	training's l1: 0.221829	valid_1's l1: 0.301951
[1800]	training's l1: 0.216784	valid_1's l1: 0.29935
[1900]	training's l1: 0.212089	valid_1's l1: 0.296826
[2000]	training's l1: 0.207605	valid_1's l1: 0.294576
[2100]	training's l1: 0.203305	valid_1's l1: 0.292501
[2200]	training's l1: 0.199157	valid_1's l1: 0.290484
[2300]	training's l1: 0.195155	valid_1's l1: 0.288568
[2400]	training's l1: 0.191485	valid_1

[3600]	training's l1: 0.0669734	valid_1's l1: 0.15933
[3700]	training's l1: 0.065684	valid_1's l1: 0.15894
[3800]	training's l1: 0.0644081	valid_1's l1: 0.158558
[3900]	training's l1: 0.0631791	valid_1's l1: 0.158181
[4000]	training's l1: 0.0619467	valid_1's l1: 0.15784
[4100]	training's l1: 0.0607867	valid_1's l1: 0.157502
[4200]	training's l1: 0.0596747	valid_1's l1: 0.157212
[4300]	training's l1: 0.0585797	valid_1's l1: 0.156884
[4400]	training's l1: 0.0575518	valid_1's l1: 0.156616
[4500]	training's l1: 0.0565339	valid_1's l1: 0.156361
[4600]	training's l1: 0.0555336	valid_1's l1: 0.156106
[4700]	training's l1: 0.0545812	valid_1's l1: 0.15585
[4800]	training's l1: 0.053671	valid_1's l1: 0.155598
[4900]	training's l1: 0.0527622	valid_1's l1: 0.155372
[5000]	training's l1: 0.0518901	valid_1's l1: 0.155138
Did not meet early stopping. Best iteration is:
[5000]	training's l1: 0.0518901	valid_1's l1: 0.155138
3JHH Fold 1, logMAE: -1.863437326160381
Training until validation scores don't

[2600]	training's l1: 0.0833286	valid_1's l1: 0.164549
[2700]	training's l1: 0.0813919	valid_1's l1: 0.163799
[2800]	training's l1: 0.0795413	valid_1's l1: 0.163127
[2900]	training's l1: 0.0777317	valid_1's l1: 0.162527
[3000]	training's l1: 0.0760048	valid_1's l1: 0.161961
[3100]	training's l1: 0.074356	valid_1's l1: 0.161381
[3200]	training's l1: 0.0727546	valid_1's l1: 0.160855
[3300]	training's l1: 0.0712093	valid_1's l1: 0.160368
[3400]	training's l1: 0.0697751	valid_1's l1: 0.159918
[3500]	training's l1: 0.0683382	valid_1's l1: 0.159456
[3600]	training's l1: 0.0669548	valid_1's l1: 0.158996
[3700]	training's l1: 0.0656514	valid_1's l1: 0.158616
[3800]	training's l1: 0.0643608	valid_1's l1: 0.158217
[3900]	training's l1: 0.0631362	valid_1's l1: 0.15785
[4000]	training's l1: 0.0619564	valid_1's l1: 0.157528
[4100]	training's l1: 0.0607871	valid_1's l1: 0.157213
[4200]	training's l1: 0.0596836	valid_1's l1: 0.156857
[4300]	training's l1: 0.0585967	valid_1's l1: 0.156512
[4400]	train

[100]	training's l1: 0.553213	valid_1's l1: 0.563603
[200]	training's l1: 0.472596	valid_1's l1: 0.490007
[300]	training's l1: 0.429747	valid_1's l1: 0.452807
[400]	training's l1: 0.400523	valid_1's l1: 0.428762
[500]	training's l1: 0.377658	valid_1's l1: 0.410359
[600]	training's l1: 0.360318	valid_1's l1: 0.3971
[700]	training's l1: 0.344906	valid_1's l1: 0.385462
[800]	training's l1: 0.332548	valid_1's l1: 0.376633
[900]	training's l1: 0.321365	valid_1's l1: 0.369049
[1000]	training's l1: 0.311727	valid_1's l1: 0.362512
[1100]	training's l1: 0.302845	valid_1's l1: 0.356708
[1200]	training's l1: 0.294799	valid_1's l1: 0.351472
[1300]	training's l1: 0.287153	valid_1's l1: 0.346696
[1400]	training's l1: 0.280236	valid_1's l1: 0.342514
[1500]	training's l1: 0.273792	valid_1's l1: 0.338801
[1600]	training's l1: 0.267702	valid_1's l1: 0.335302
[1700]	training's l1: 0.262029	valid_1's l1: 0.332144
[1800]	training's l1: 0.256733	valid_1's l1: 0.329092
[1900]	training's l1: 0.251794	valid_1'

[4700]	training's l1: 0.16733	valid_1's l1: 0.287373
[4800]	training's l1: 0.165408	valid_1's l1: 0.286639
[4900]	training's l1: 0.163514	valid_1's l1: 0.285949
[5000]	training's l1: 0.161663	valid_1's l1: 0.285285
Did not meet early stopping. Best iteration is:
[5000]	training's l1: 0.161663	valid_1's l1: 0.285285
3JHC Fold 4, logMAE: -1.2542663573183155
*** Training Model for 3JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3',
       'scalar_coupling_constant'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_

[2100]	training's l1: 0.0315993	valid_1's l1: 0.112737
[2200]	training's l1: 0.0302043	valid_1's l1: 0.112379
[2300]	training's l1: 0.0289224	valid_1's l1: 0.11199
[2400]	training's l1: 0.0277592	valid_1's l1: 0.111709
[2500]	training's l1: 0.0266605	valid_1's l1: 0.111479
[2600]	training's l1: 0.0256072	valid_1's l1: 0.111183
[2700]	training's l1: 0.0246408	valid_1's l1: 0.110963
[2800]	training's l1: 0.0236961	valid_1's l1: 0.110715
[2900]	training's l1: 0.0228299	valid_1's l1: 0.110498
[3000]	training's l1: 0.0220133	valid_1's l1: 0.110276
[3100]	training's l1: 0.0212234	valid_1's l1: 0.110104
[3200]	training's l1: 0.0204943	valid_1's l1: 0.109938
[3300]	training's l1: 0.0198098	valid_1's l1: 0.109783
[3400]	training's l1: 0.019158	valid_1's l1: 0.10963
[3500]	training's l1: 0.018543	valid_1's l1: 0.109511
[3600]	training's l1: 0.0179399	valid_1's l1: 0.109391
[3700]	training's l1: 0.017377	valid_1's l1: 0.109257
[3800]	training's l1: 0.0168456	valid_1's l1: 0.109162
[3900]	training

In [30]:
pd.DataFrame({'type': list(cv_scores.keys()), 'cv_score': list(cv_scores.values())})

Unnamed: 0,type,cv_score
0,1JHN,-0.996985
1,1JHC,-0.37361
2,2JHH,-1.805959
3,2JHN,-1.999981
4,2JHC,-1.343571
5,3JHH,-1.866846
6,3JHC,-1.250684
7,3JHN,-2.23136


In [31]:
np.mean(list(cv_scores.values()))

-1.4836246534105204

In [32]:
submission[submission['scalar_coupling_constant'] == 0].shape

(0, 1)

In [34]:
submission.to_csv('lgb_5fold_5k.csv')