In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os

In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import gc
import seaborn as sns
import warnings
import scipy.signal as sg
warnings.filterwarnings("ignore")

from scipy import stats

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float32 when their range fits, otherwise float64; a column is
    only kept as float16 when it already is float16, so no column is narrowed
    to half precision. Note that float64 -> float32 discards precision.

    :param df: DataFrame to shrink; it is modified in place.
    :param verbose: if True, print the resulting memory usage and the
        percentage saved.
    :return: the same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # First integer width whose inclusive range holds the column.
            # An empty column yields NaN min/max, so no candidate matches and
            # the column is left untouched.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Decimal precision is a property of the dtype, so read it once
            # from the dtype instead of calling np.finfo on every element.
            c_prec = np.finfo(col_type).precision
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if c_prec == f16.precision and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns end up here.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Prepare Data

This notebook merges all of the good datasets and then trains the models. The datasets whose names end with `fc_sd_pso_dso_SCC_1` contain only molecule-level features, without any type-specific features.

In [6]:
# Load the train and test feature tables from local CSV exports.
# NOTE(review): absolute Windows paths — the notebook only runs on a machine
# that has this exact directory layout.
train_mole = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/train_20190820_fc_sd_pso_dso_SCC_1.csv')
test_mole = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/test_20190820_fc_sd_pso_dso_1.csv')

In [7]:
# Remove the 'Unnamed: 0' column (presumably the index saved by an earlier export).
# errors='ignore' makes the cell idempotent: running it a second time is a no-op
# instead of a KeyError, which the in-place drop raised on re-execution.
train_mole = train_mole.drop(columns='Unnamed: 0', errors='ignore')
test_mole = test_mole.drop(columns='Unnamed: 0', errors='ignore')

In [60]:
# NOTE(review): this cell has execution count 60 but sits above the cell that
# defines `scalar_coupling`; on a fresh kernel with "Run All" it raises NameError.
scalar_coupling.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


In [8]:
# Read the per-pair coupling contributions and keep one target Series per
# contribution column (fc, sd, pso, dso).
scalar_coupling = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/scalar_coupling_contributions.csv')
y_fc, y_sd, y_pso, y_dso = (scalar_coupling[name] for name in ('fc', 'sd', 'pso', 'dso'))

Load `df_molecules`, which is used later to build the groups for GroupKFold validation.

In [9]:
# Molecule table; train_secondary reads its 'type' and 'molecule_name' columns
# to build the per-type groups passed to the fold splitter.
df_molecules = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/molecules.csv')

# Basic functions

In [10]:
#need to implement a way to calculate group mae
def train_model_regression(X, X_test, y, params, folds, molecules, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    Train a regression model fold by fold and collect its predictions.
    Returns dictionary with oof predictions, test predictions, scores and, if requested, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - hyper-parameters forwarded to the lgb / xgb / cat model
    :params: folds - splitter object; must provide split(X, groups=...) and n_splits
    :params: molecules - group labels handed to folds.split
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - 'mae', 'group_mae' or 'mse'; 'group_mae' needs X to be a
             DataFrame with a 'type' column
    :params: columns - columns to use. If None - use all columns (ignored for np.ndarray input)
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period forwarded to lgb / xgb
    :params: early_stopping_rounds - early-stopping patience forwarded to lgb and xgb
    :params: n_estimators - maximum number of boosting rounds for lgb and xgb

    :raises ValueError: if model_type is not one of the supported values
    """
    supported_models = ('lgb', 'xgb', 'sklearn', 'cat')
    if model_type not in supported_models:
        # Fail before any training instead of hitting an undefined prediction later.
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {supported_models}')

    # Column selection only makes sense for DataFrame input; arrays are used as given.
    is_array = isinstance(X, np.ndarray)
    if not is_array:
        columns = X.columns if columns is None else columns
        X_test = X_test[columns]
    feature_names = None if columns is None else list(columns)

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    def _fold_score(y_true, y_hat, X_fold):
        # Single place that knows 'group_mae' takes the extra 'type' argument.
        if eval_metric == 'group_mae':
            return metrics_dict[eval_metric]['scoring_function'](y_true, y_hat, X_fold['type'])
        return metrics_dict[eval_metric]['sklearn_scoring_function'](y_true, y_hat)

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups = molecules)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if is_array:
            # Arrays are row-indexed directly; there are no column labels to select.
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            # Use the selected feature names (not X.columns) so a column subset works.
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=feature_names)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # Honour the early_stopping_rounds argument (previously fixed at 200 here).
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=feature_names), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=feature_names), ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # The caller-supplied estimator is refit on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            y_pred = model.predict(X_test).reshape(-1,)

        else:  # 'cat'
            # NOTE(review): iterations is fixed at 20000 here and does not follow
            # n_estimators; left unchanged to keep CatBoost run time as before.
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1,)

        fold_score = _fold_score(y_valid, y_pred_valid, X_valid)
        scores.append(fold_score)
        if model_type == 'sklearn':
            # sklearn estimators do not log by themselves; report the fold score here.
            print(f'Fold {fold_n + 1}. {eval_metric}: {fold_score:.4f}.')
            print('')

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = (feature_names if feature_names is not None
                                          else list(range(len(model.feature_importances_))))
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            # NOTE(review): importances are divided by n_splits and then averaged
            # per feature below, so stored/plotted values are scaled by 1/n_splits.
            # The ranking is unaffected; kept as-is so returned values do not change.
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');

            result_dict['feature_importance'] = feature_importance

    return result_dict

In [11]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """Mean over groups of the log of each group's mean absolute error.

    :param y_true: true values as a pandas Series.
    :param y_pred: predictions, combined element-wise with ``y_true``.
    :param types: group key for each row, passed to ``groupby``.
    :param floor: lower bound applied to every group MAE before the log.
    :return: average of ``log(max(MAE_group, floor))`` across groups.
    """
    abs_err = (y_true - y_pred).abs()
    mae_per_type = abs_err.groupby(types).mean()
    # Floor each group's MAE so a perfect group does not produce log(0).
    floored = mae_per_type.clip(lower=floor)
    return np.log(floored).mean()

In [12]:
# Fold splitter used by train_secondary: 8 group-wise folds, with the groups
# supplied at split time.
n_splits = 8
gkf = GroupKFold(n_splits=n_splits)

In [13]:
def Data_merge(df, df_merge, both_on):
    """Left-join ``df_merge`` onto ``df`` using the same key column(s) on both sides.

    :param df: left frame; all of its rows are kept.
    :param df_merge: right frame providing the extra columns.
    :param both_on: key column name(s) present in both frames.
    :return: the merged DataFrame.
    """
    merged = df.merge(df_merge, how='left', left_on=both_on, right_on=both_on)
    return merged

In [14]:
def getDuplicateColumns(df):
    """Find columns whose contents are identical to an earlier column.

    Columns are compared pairwise with ``Series.equals``. For each set of
    identical columns the left-most one is reported as the column to keep and
    every later one as a duplicate, so no name appears in both lists.

    :param df: DataFrame to inspect.
    :return: ``(duplicate_names, kept_names)`` — two lists in order of first
        appearance in ``df``.
    """
    labels = df.columns.values
    n_cols = df.shape[1]

    flagged = set()      # positions already known to duplicate an earlier column
    duplicates = []
    kept = []

    for x in range(n_cols):
        if x in flagged:
            # Already matched to an earlier column; anything equal to it was
            # flagged in that same pass, so comparing again finds nothing new.
            continue
        col = df.iloc[:, x]
        has_duplicate = False
        for y in range(x + 1, n_cols):
            if y in flagged:
                continue
            if col.equals(df.iloc[:, y]):
                flagged.add(y)
                duplicates.append(labels[y])
                has_duplicate = True
        if has_duplicate:
            kept.append(labels[x])

    # dict.fromkeys removes repeated labels while preserving order.
    return list(dict.fromkeys(duplicates)), list(dict.fromkeys(kept))

# Stacking features by type

In [15]:
# Holders for the stacked features: one out-of-fold column per contribution for
# train rows, one prediction column per contribution for test rows.
# The columns start as float zeros (not int) because train_secondary later
# writes float predictions into row subsets through .loc; assigning floats into
# an int column relies on silent upcasting, which pandas has deprecated.
X_short = pd.DataFrame({'ind': list(train_mole.index), 'type': train_mole['type'].values,
                        'oof_fc': np.zeros(len(train_mole)), 'oof_sd': np.zeros(len(train_mole)),
                        'oof_pso': np.zeros(len(train_mole)), 'oof_dso': np.zeros(len(train_mole)),
                        'target_fc': y_fc.values, 'target_sd': y_sd.values, 'target_pso': y_pso.values, 'target_dso': y_dso.values})
X_short_test = pd.DataFrame({'ind': list(test_mole.index), 'type': test_mole['type'].values,
                             'prediction_fc': np.zeros(len(test_mole)), 'prediction_sd': np.zeros(len(test_mole)),
                             'prediction_pso': np.zeros(len(test_mole)), 'prediction_dso': np.zeros(len(test_mole))})
# Distinct coupling-type codes; train_secondary picks one by position.
types_iterate = train_mole['type'].unique()
len(types_iterate)

8

In [16]:
def train_secondary(X_short_secondary, X_short_test_secondary, params, features_toget, types_iterate_index, model_type = 'lgb'):
    """Train stacking models for one coupling type and store their predictions.

    For the type at ``types_iterate[types_iterate_index]`` this assembles the
    feature matrix (selected columns of the global ``train_mole`` / ``test_mole``
    plus two per-type CSV feature sets), then trains one model per entry of
    ``features_toget`` with ``train_model_regression`` and the global ``gkf``.

    :param X_short_secondary: train holder frame with a 'type' column and, for
        each name in ``features_toget``, 'target_<name>' and 'oof_<name>' columns.
        The 'oof_<name>' rows of this type are overwritten in place.
    :param X_short_test_secondary: test holder frame with a 'type' column and
        'prediction_<name>' columns; rows of this type are overwritten in place.
    :param params: model hyper-parameters forwarded to train_model_regression.
    :param features_toget: names of the targets to model, e.g. ['fc', 'pso'].
    :param types_iterate_index: position of the type in ``types_iterate``; also
        selects the good-features CSV to read.
    :param model_type: model family forwarded to train_model_regression.
    :return: None.
    """
    key_cols = ['molecule_name', 'atom_index_0', 'atom_index_1']

    # Good molecular level features
    df_features = pd.read_csv(f'E:/kaggle/Molecular_properties/good_features/good_ft_type{types_iterate_index}_bonds_QM9_inver_neighbor_huber.csv')
    good_features = list(df_features['feature'].values) + key_cols
    type_index = types_iterate[types_iterate_index]

    train_rows = train_mole['type'] == type_index
    test_rows = test_mole['type'] == type_index

    # String name of the type, taken from the first matching train row; it
    # selects the per-type CSV files below.
    Type = train_mole.loc[train_rows, 'type_string'].iloc[0]
    X_t = train_mole.loc[train_rows, good_features]
    X_test_t = test_mole.loc[test_rows, good_features]
    old_index = X_t.index
    old_index_test = X_test_t.index

    per_type_sources = [
        # Dataset with base neighbor features for a type
        (f'E:/kaggle/Molecular_properties/train_more_neighbor_20190805/train_{Type}.csv',
         f'E:/kaggle/Molecular_properties/test_more_neighbor_20190809/test_{Type}.csv'),
        # Dataset with inverse distance related neighbor features for a type
        (f'E:/kaggle/Molecular_properties/train_20190822/train_{Type}.csv',
         f'E:/kaggle/Molecular_properties/test_20190822/test_{Type}.csv'),
    ]
    for train_path, test_path in per_type_sources:
        X_t_merge = pd.read_csv(train_path).drop('Unnamed: 0', axis = 1)
        X_test_t_merge = pd.read_csv(test_path).drop('Unnamed: 0', axis = 1)

        X_t = Data_merge(X_t, X_t_merge, key_cols)
        X_test_t = Data_merge(X_test_t, X_test_t_merge, key_cols)

    # The merges reset the index; restore the original row labels so results
    # line up with the holder frames. This raises if a merge changed the row count.
    X_t.index = old_index
    X_test_t.index = old_index_test

    # Duplicate detection only looks at the first 40000 rows.
    bad_features, _ = getDuplicateColumns(X_t[:40000])
    bad_features = bad_features + ['molecule_name', 'atom_index_0', 'atom_index_1', 'cosinus', 'dihedral',
                                   'type_string', 'scalar_coupling_constant']
    good_features = [col for col in X_t.columns.values if col not in bad_features]

    print(f'Training of type {Type}')

    for feature in features_toget:

        print('Training of feature ' + feature)
        label = 'target_' + feature
        train_secondary_feature = 'oof_' + feature
        test_secondary_feature = 'prediction_' + feature

        # Mask with the frame that was passed in rather than the global X_short,
        # so the function also works when called with a different holder frame.
        y_t = X_short_secondary.loc[X_short_secondary['type'] == type_index, label]
        molecules_t = df_molecules.loc[df_molecules['type'] == type_index, 'molecule_name']

        result_dict_lgb_oof = train_model_regression(X=X_t[good_features], X_test=X_test_t[good_features], y=y_t, params=params, molecules = molecules_t,
                                                                  folds=gkf, model_type=model_type, eval_metric='group_mae', plot_feature_importance=False,
                                                                  verbose=1000, early_stopping_rounds=500, n_estimators=20000)

        X_short_secondary.loc[X_short_secondary['type'] == type_index, train_secondary_feature] = result_dict_lgb_oof['oof']
        X_short_test_secondary.loc[X_short_test_secondary['type'] == type_index, test_secondary_feature] = result_dict_lgb_oof['prediction']
        

In [17]:
# oof: -0.5362
# LightGBM settings for type index 0; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best 300 with max_depth 25
    min_child_samples=60,    # initial 79, best 60
    objective='huber',
    # 'max_depth': 25 is disabled (initial: none)
    colsample_bytree=0.5,    # initial 0.9, best 0.45
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # initial 0.25, best 0.08
    metric='mae',
    reg_alpha=2,             # initial 0.1, best 2
    reg_lambda=0.3,          # initial 0.3
)

In [None]:
# Stack fc, pso and dso predictions for type index 0.
train_secondary(X_short, X_short_test, params, features_toget=['fc', 'pso', 'dso'], types_iterate_index=0)

In [19]:
# oof: -2.1969
# LightGBM settings for type index 1; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best 300
    min_child_samples=60,    # initial 79, best 60
    objective='huber',
    # 'max_depth': 30 is disabled (initial none, best 30)
    colsample_bytree=0.5,    # 0.9, best 0.3
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best 0.05
    metric='mae',
    reg_alpha=0.1,           # initial 0.1, best 0.3
    reg_lambda=0.3,          # initial 0.3
)

In [None]:
# Stack fc, sd and pso predictions for type index 1.
train_secondary(X_short, X_short_test, params, features_toget=['fc', 'sd', 'pso'], types_iterate_index=1)

In [21]:
# oof: -1.2694
# LightGBM settings for type index 2; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=60,    # initial 79, best: 60
    objective='huber',
    # 'max_depth': 60 is disabled
    colsample_bytree=0.5,    # 0.9, best 0.18
    subsample=0.8,           # initial 0.8
    learning_rate=0.1,       # 0.25, best: 0.04
    metric='mae',
    reg_alpha=0.5,           # initial 0.1, best: 0.5
    reg_lambda=0.5,          # initial 0.3, best: 0.5
)

In [None]:
# Stack fc, pso and dso predictions for type index 2.
train_secondary(X_short, X_short_test, params, features_toget=['fc', 'pso', 'dso'], types_iterate_index=2)

In [23]:
# oof: -1.9993
# LightGBM settings for type index 3; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=79,    # initial 79
    objective='huber',
    # 'max_depth': -1 is disabled
    colsample_bytree=0.5,    # initial 0.9, best: 0.5
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best: 0.05
    metric='mae',
    reg_alpha=0.1,           # initial 0.1
    reg_lambda=0.3,          # initial 0.3
)

In [None]:
# Stack fc, sd and pso predictions for type index 3.
train_secondary(X_short, X_short_test, params, features_toget=['fc', 'sd', 'pso'], types_iterate_index=3)

In [25]:
# -1.5059
# LightGBM settings for type index 4; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=50,    # initial 79, best: 50
    objective='huber',
    # 'max_depth': -1 is disabled
    colsample_bytree=0.5,    # 0.9, best: 0.45
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best: 0.05
    metric='mae',
    reg_alpha=0.75,          # initial 0.1, best: 0.75
    reg_lambda=0.2,          # initial 0.3, best: 0.2
)

In [None]:
# Stack fc predictions only for type index 4.
train_secondary(X_short, X_short_test, params, features_toget=['fc'], types_iterate_index=4)

In [27]:
# LightGBM settings for type index 5; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=60,    # initial 79, best: 60
    objective='huber',
    # 'max_depth': -1 is disabled
    colsample_bytree=0.5,    # 0.9, best: 0.45
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best: 0.05
    metric='mae',
    reg_alpha=0.3,           # initial 0.1, best: 0.3
    reg_lambda=1,            # initial 0.3, best: 1
)

In [None]:
# Stack fc predictions only for type index 5.
train_secondary(X_short, X_short_test, params, features_toget=['fc'], types_iterate_index=5)

In [29]:
# -1.5768
# LightGBM settings for type index 6; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=60,    # initial 79, best: 60
    objective='huber',
    # 'max_depth': -1 is disabled
    colsample_bytree=0.5,    # 0.9, best: 0.5
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best: 0.05
    metric='mae',
    reg_alpha=0.1,           # initial 0.1, best: 0.05
    reg_lambda=0.3,          # initial 0.3
)

In [None]:
# Stack fc predictions only for type index 6.
train_secondary(X_short, X_short_test, params, features_toget=['fc'], types_iterate_index=6)

In [31]:
# -2.4587
# LightGBM settings for type index 7; trailing notes record the tuning history.
params = dict(
    num_leaves=300,          # initial 200, best: 300
    min_child_samples=79,    # initial 79, best: 30
    objective='huber',
    # 'max_depth': -1 is disabled
    colsample_bytree=0.6,    # 0.9, best: 0.6
    subsample=0.8,           # initial 0.8
    learning_rate=0.15,      # 0.25, best: 0.03
    metric='mae',
    reg_alpha=0.1,           # initial 0.1, best: 0.1
    reg_lambda=0.3,          # initial 0.3
)

In [None]:
# Stack fc, sd and pso predictions for type index 7.
train_secondary(X_short, X_short_test, params, features_toget=['fc', 'sd', 'pso'], types_iterate_index=7)

In [33]:
# Attach the stacked 'fc' feature: out-of-fold predictions for train rows and
# fold-averaged predictions for test rows. Both frames use the column name
# 'oof_fc' so train and test expose the same feature name.
# Column assignment aligns on index labels.
train_mole['oof_fc'] = X_short['oof_fc']
test_mole['oof_fc'] = X_short_test['prediction_fc']

In [34]:
# Attach the stacked 'sd' feature (same column name in both frames).
# In the train_secondary calls shown above, 'sd' is requested only for type
# indices 1, 3 and 7; rows of the other types keep their initial 0.
train_mole['oof_sd'] = X_short['oof_sd']
test_mole['oof_sd'] = X_short_test['prediction_sd']

In [35]:
# Attach the stacked 'pso' feature (same column name in both frames).
# In the train_secondary calls shown above, 'pso' is requested for type indices
# 0, 1, 2, 3 and 7; rows of the other types keep their initial 0.
train_mole['oof_pso'] = X_short['oof_pso']
test_mole['oof_pso'] = X_short_test['prediction_pso']

In [36]:
# Attach the stacked 'dso' feature (same column name in both frames).
# In the train_secondary calls shown above, 'dso' is requested only for type
# indices 0 and 2; rows of the other types keep their initial 0.
train_mole['oof_dso'] = X_short['oof_dso']
test_mole['oof_dso'] = X_short_test['prediction_dso']

In [37]:
# Persist the frames with the stacked columns added.
# The DataFrame index is written too (to_csv default), so a later read_csv of
# these files yields an extra 'Unnamed: 0' column.
train_mole.to_csv('E:/kaggle/Molecular_properties/Data_use/train_20190826_fc_sd_pso_dso_SCC_1.csv')
test_mole.to_csv('E:/kaggle/Molecular_properties/Data_use/test_20190826_fc_sd_pso_dso_1.csv')

# train by type

In [38]:
# Target for this stage: the 'scalar_coupling_constant' column of the train frame.
y_tr = train_mole['scalar_coupling_constant']

In [39]:
# NOTE(review): repeats the earlier n_splits / gkf definition with the same values.
n_splits = 8
gkf = GroupKFold(n_splits=n_splits)

In [40]:
# Holders for the per-type models: out-of-fold columns per model family for
# train rows and prediction columns for test rows.
# The columns start as float zeros (not int) so that float predictions can
# later be written into row subsets without relying on silent int -> float
# upcasting, which pandas has deprecated.
# NOTE(review): this rebinds X_short / X_short_test, replacing the holder frames
# of the stacking section above.
X_short = pd.DataFrame({'ind': list(train_mole.index), 'type': train_mole['type'].values,
                        'oof_lgb': np.zeros(len(train_mole)), 'oof_xgb': np.zeros(len(train_mole)), 'oof_ridge': np.zeros(len(train_mole)),
                        'target': y_tr.values
                       })
X_short_test = pd.DataFrame({'ind': list(test_mole.index), 'type': test_mole['type'].values,
                             'prediction_lgb': np.zeros(len(test_mole)), 'prediction_xgb': np.zeros(len(test_mole)), 'prediction_ridge': np.zeros(len(test_mole)),
                            })
types_iterate = train_mole['type'].unique()
len(types_iterate)

8

In [41]:
def train_byType(X_short, X_short_test, params, features_secondary, types_iterate_index, model_type = 'lgb'):
    """Train one model for a single coupling type and store its predictions.

    Relies on notebook globals: train_mole, test_mole, types_iterate,
    df_molecules, gkf, Data_merge and train_model_regression.

    Parameters
    ----------
    X_short : DataFrame
        Train bookkeeping frame with 'type', 'target' and 'oof_<model_type>'
        columns. The rows of the selected type get their 'oof_<model_type>'
        value overwritten in place.
    X_short_test : DataFrame
        Test bookkeeping frame with 'type' and 'prediction_<model_type>'
        columns. The rows of the selected type are overwritten in place.
    params : dict
        Model hyper-parameters, forwarded unchanged to train_model_regression.
    features_secondary : list of str
        Extra column names (e.g. 'oof_fc') selected from train_mole / test_mole
        in addition to the features listed in the per-type feature file.
    types_iterate_index : int
        Position of the type in the global types_iterate array; also used in
        the name of the per-type feature-list CSV.
    model_type : str, default 'lgb'
        Forwarded to train_model_regression and used as the suffix of the
        result columns that are written.

    Returns
    -------
    None
    """
    merge_keys = ['molecule_name', 'atom_index_0', 'atom_index_1']

    # Feature names listed for this type, plus the merge keys and the secondary columns.
    df_features = pd.read_csv(f'E:/kaggle/Molecular_properties/good_features/good_ft_type{types_iterate_index}_bonds_QM9_inver_neighbor_huber.csv')
    good_features = list(df_features['feature'].values) + merge_keys + features_secondary
    type_index = types_iterate[types_iterate_index]

    train_mask = train_mole['type'] == type_index
    test_mask = test_mole['type'] == type_index

    # String label of the type (first matching row); used in file names and the log line.
    Type = train_mole.loc[train_mask, 'type_string'].iloc[0]

    X_t = train_mole.loc[train_mask, good_features]
    X_test_t = test_mole.loc[test_mask, good_features]
    old_index = X_t.index
    old_index_test = X_test_t.index

    # Per-type feature files, merged in this order:
    #   1. base neighbor features
    #   2. inverse-distance related neighbor features
    feature_dirs = (
        ('train_more_neighbor_20190805', 'test_more_neighbor_20190809'),
        ('train_20190822', 'test_20190822'),
    )
    for train_dir, test_dir in feature_dirs:
        X_t_merge = pd.read_csv(f'E:/kaggle/Molecular_properties/{train_dir}/train_{Type}.csv')
        X_test_t_merge = pd.read_csv(f'E:/kaggle/Molecular_properties/{test_dir}/test_{Type}.csv')

        # Drop the index column that was written when the CSVs were saved.
        X_t_merge = X_t_merge.drop('Unnamed: 0', axis = 1)
        X_test_t_merge = X_test_t_merge.drop('Unnamed: 0', axis = 1)

        X_t = Data_merge(X_t, X_t_merge, merge_keys)
        X_test_t = Data_merge(X_test_t, X_test_t_merge, merge_keys)

    # Restore the original row labels after merging.
    # NOTE(review): assumes Data_merge keeps row count and order - confirm.
    X_t.index = old_index
    X_test_t.index = old_index_test

    # Columns never fed to the model. The previous getDuplicateColumns(X_t[:40000])
    # scan was removed: its result was overwritten by this list before being used.
    bad_features = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type_string', 'scalar_coupling_constant']
    good_features = [col for col in X_t.columns.values if col not in bad_features]

    print(f'Training of type {Type}')

    y_t = X_short.loc[X_short['type'] == type_index, 'target']
    molecules_t = df_molecules.loc[df_molecules['type'] == type_index, 'molecule_name']

    result_dict = train_model_regression(X=X_t[good_features], X_test=X_test_t[good_features], y=y_t, params=params, molecules = molecules_t,
                                         folds=gkf, model_type=model_type, eval_metric='group_mae', plot_feature_importance=False,
                                         verbose=1000, early_stopping_rounds=500, n_estimators=20000)

    # Write the out-of-fold and test predictions back into the bookkeeping frames.
    X_short.loc[X_short['type'] == type_index, 'oof_' + model_type] = result_dict['oof']
    X_short_test.loc[X_short_test['type'] == type_index, 'prediction_' + model_type] = result_dict['prediction']


In [42]:
# Recorded results for the type-index-0 model (numbers kept as logged):
#   'huber' w/o fc: -0.4507  -0.5219  -0.4878  -0.4583  -0.4657  -0.5196
#   Directly adding up: -0.4923
#   -0.1191   fc: -0.7834  fc+sd: -0.7934  fc+sd+pso: -0.8027  fc+sd+pso+dso: -0.8020
#   fc+sd+dso: -0.8016  fc+pso+dso: -0.8049  fc+pso: -0.7920  fc+dso: -0.7907
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300 with max_depth 25
    min_child_samples=60,      # initial 79; best 60
    objective='huber',
    # max_depth=25,            # initial none
    colsample_bytree=0.5,      # initial 0.9; best 0.45
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # initial 0.25; best 0.08
    metric='mae',
    reg_alpha=2,               # initial 0.1; best 2
    reg_lambda=0.3,            # initial 0.3
)

In [None]:
# Per-type training for type index 0 with the fc, pso and dso columns as
# secondary features; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc', 'oof_pso', 'oof_dso'],
    types_iterate_index=0,
)

In [44]:
# Recorded results for the type-index-1 model (numbers kept as logged):
#   'huber': w 'huber' fc: -2.1603  -2.2533  -2.2741  -2.2167  -2.1698  -2.1626  -2.2412
#   Directly adding up: -2.1452
#   -1.7509   fc: -2.1915  fc+sd: -2.1785  fc+pso: -2.1775  fc+dso: -2.1770
#   fc+sd+pso: -2.2183  fc+sd+dso: -2.2142  fc+pso+dso: -2.2122  fc+sd+pso+dso: -2.1922
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=60,      # initial 79; best 60
    objective='huber',
    # max_depth=30,            # initial none; best 30
    colsample_bytree=0.5,      # 0.9; best 0.3
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.05
    metric='mae',
    reg_alpha=0.1,             # initial 0.1; best 0.3
    reg_lambda=0.3,            # initial 0.3
)

In [None]:
# Per-type training for type index 1 with the fc, sd and pso columns as
# secondary features; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc', 'oof_sd', 'oof_pso'],
    types_iterate_index=1,
)

In [46]:
# Recorded results for the type-index-2 model (numbers kept as logged):
#   'huber': w 'huber' fc: -1.1385  -1.2825  -1.3014  -1.1448  -1.1761  -1.2116  -1.2695
#   Directly adding up: -1.205
#   -1.1427   fc: -1.1232  fc+sd: -1.1252  fc+pso: -1.1319  fc+dso: -1.1390
#   fc+sd+pso: -1.1284  fc+sd+dso: -1.1416  fc+pso+dso: -1.1490  fc+sd+pso+dso: -1.1452
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=60,      # initial 79; best 60
    objective='huber',
    # max_depth=60,
    colsample_bytree=0.5,      # 0.9; best 0.18
    subsample=0.8,             # initial 0.8
    learning_rate=0.1,         # 0.25; best 0.04
    metric='mae',
    reg_alpha=0.5,             # initial 0.1; best 0.5
    reg_lambda=0.5,            # initial 0.3; best 0.5
)

In [None]:
# Per-type training for type index 2 with the fc, pso and dso columns as
# secondary features; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc', 'oof_pso', 'oof_dso'],
    types_iterate_index=2,
)

In [48]:
# Recorded results for the type-index-3 model (numbers kept as logged):
#   'huber': w 'huber' fc: -1.9772  -2.0570  -2.0614  -2.0227  -1.9779  -1.9795  -2.0444
#   Directly adding up: -1.9754
#   -1.7188   fc: -2.0483  fc+sd: -2.0388  fc+pso: -2.0458  fc+dso: -2.0463
#   fc+sd+pso: -2.0831  fc+sd+dso: -2.0766  fc+pso+dso: -2.0805  fc+sd+pso+dso: -2.0585
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=79,      # initial 79
    objective='huber',
    # max_depth=-1,
    colsample_bytree=0.5,      # initial 0.9; best 0.5
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.05
    metric='mae',
    reg_alpha=0.1,             # initial 0.1
    reg_lambda=0.3,            # initial 0.3
)

In [None]:
# Per-type training for type index 3 with the fc, sd and pso columns as
# secondary features; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc', 'oof_sd', 'oof_pso'],
    types_iterate_index=3,
)

In [50]:
# Recorded results for the type-index-4 model (numbers kept as logged):
#   'huber': w 'huber' fc: -1.4041  -1.5036  -1.5201  -1.4618  -1.4238  -1.4485  -1.5344
#   Directly adding up: -1.4583
#   -0.9357   fc: -1.6998  fc+sd: -1.6735  fc+pso: -1.675  fc+dso: -1.6717
#   fc+sd+pso: -1.6402  fc+pso+dso: -1.6360  fc+sd+dso: -1.6448  fc+sd+pso+dso: -1.6644
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=50,      # initial 79; best 50
    objective='huber',
    # max_depth=-1,
    colsample_bytree=0.5,      # 0.9; best 0.45
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.05
    metric='mae',
    reg_alpha=0.75,            # initial 0.1; best 0.75
    reg_lambda=0.2,            # initial 0.3; best 0.2
)

In [None]:
# Per-type training for type index 4 with only the fc column as secondary
# feature; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc'],
    types_iterate_index=4,
)

In [52]:
# Recorded results for the type-index-5 model (numbers kept as logged):
#   'huber': w/o fc: -2.0471  -2.1918  -2.1648  -2.0844  -2.1487  -2.1846
#   Directly adding up: -2.0848
#   -1.6215   fc: -2.2635  fc+sd: -2.2167  fc+pso: -2.2094  fc+dso: -2.2155
#   fc+sd+pso: -2.1836  fc+sd+dso: -2.1854  fc+pso+dso: -2.1777  fc+sd+pso+dso: -2.2149
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=60,      # initial 79; best 60
    objective='huber',
    # max_depth=-1,
    colsample_bytree=0.5,      # 0.9; best 0.45
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.05
    metric='mae',
    reg_alpha=0.3,             # initial 0.1; best 0.3
    reg_lambda=1,              # initial 0.3; best 1
)

In [None]:
# Per-type training for type index 5 with only the fc column as secondary
# feature; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc'],
    types_iterate_index=5,
)

In [54]:
# Recorded results for the type-index-6 model (numbers kept as logged):
#   'huber': w 'huber' fc: -1.4907  -1.5805  -1.5873  -1.5469  -1.5208  -1.5772  -1.6105
#   Directly adding up: -1.5225
#   -0.9887   fc: -1.8958  fc+sd: -1.8101  fc+pso: -1.8033  fc+dso: -1.8063
#   fc+sd+pso: -1.7794  fc+sd+dso: -1.7717  fc+pso+dso: -1.7790  fc+sd+pso+dso: -1.8105
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=60,      # initial 79; best 60
    objective='huber',
    # max_depth=-1,
    colsample_bytree=0.5,      # 0.9; best 0.5
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.05
    metric='mae',
    reg_alpha=0.1,             # initial 0.1; best 0.05
    reg_lambda=0.3,            # initial 0.3
)

In [None]:
# Per-type training for type index 6 with only the fc column as secondary
# feature; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc'],
    types_iterate_index=6,
)

In [56]:
# Recorded results for the type-index-7 model (numbers kept as logged):
#   'huber': w 'huber' fc: -2.3953  -2.5190  -2.5509  -2.4635  -2.4176  -2.4609  -2.4917
#   Directly adding up: -2.4219
#   -2.1084   fc: -2.5922  fc+sd: -2.5682  fc+pso: -2.5739  fc+dso: -2.5705
#   fc+sd+pso: -2.5953  fc+sd+dso: -2.5870  fc+pso+dso: -2.5931  fc+sd+pso+dso: -2.5855
# Hyper-parameters handed to the train_byType call in the next cell.
params = dict(
    num_leaves=300,            # initial 200; best 300
    min_child_samples=79,      # initial 79; best 30
    objective='huber',
    # max_depth=-1,
    colsample_bytree=0.6,      # 0.9; best 0.6
    subsample=0.8,             # initial 0.8
    learning_rate=0.15,        # 0.25; best 0.03
    metric='mae',
    reg_alpha=0.1,             # initial 0.1; best 0.1
    reg_lambda=0.3,            # initial 0.3
)

In [None]:
# Per-type training for type index 7 with the fc, sd and pso columns as
# secondary features; model_type stays at its 'lgb' default.
train_byType(
    X_short,
    X_short_test,
    params,
    features_secondary=['oof_fc', 'oof_sd', 'oof_pso'],
    types_iterate_index=7,
)

In [182]:
# Checkpoint the bookkeeping frames (out-of-fold values and test predictions)
# to CSV; the index is written too, since to_csv is called with its defaults.
# NOTE(review): this cell's execution count (182) is far out of sequence with
# its neighbours and the file names carry an older date (20190815) - check that
# the saved files match the current run.
X_short_test.to_csv('E:/kaggle/Molecular_properties/X_short_test_20190815.csv')
X_short.to_csv('E:/kaggle/Molecular_properties/X_short_20190815.csv')

# Prepare for submission

In [58]:
# Read the sample submission keyed by 'id', then derive `benchmark` as a
# separate frame in which 'id' is an ordinary column and the index is positional.
sample_submission = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/sample_submission.csv', index_col='id')
benchmark = sample_submission.reset_index()

In [59]:
# Fill the submission with the 'prediction_lgb' column and write it without the index.
# NOTE(review): the assignment aligns on index labels - it is only correct if
# X_short_test rows are in the same order as the sample submission; confirm.
# NOTE(review): the file is written to the working directory, while the blend
# cell reads the same file name from .../Molecular_properties/results/ -
# presumably it is moved by hand or the notebook runs from that folder.
benchmark['scalar_coupling_constant'] = X_short_test['prediction_lgb']
benchmark.to_csv('train_type_feature_20190828.csv', index=False)
benchmark.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,18.6605775739572
1,4658148,196.56697760247133
2,4658149,9.952218486467974
3,4658150,196.5656091613627
4,4658151,18.5089496592511


In [60]:
# Show the last rows of the submission as a quick visual check.
benchmark.tail()

Unnamed: 0,id,scalar_coupling_constant
2505537,7163684,2.649353092376293
2505538,7163685,4.151215380486076
2505539,7163686,2.356377834792806
2505540,7163687,2.608921942259303
2505541,7163688,122.051770281176


# Blend results

In [75]:
# Load the two saved submissions to blend, both indexed by 'id' so that the
# arithmetic in the blend cell aligns rows by id.
submission_1 = pd.read_csv('E:/kaggle/Molecular_properties/results/train_type_feature_20190828.csv', index_col='id')
submission_2 = pd.read_csv('E:/kaggle/Molecular_properties/results/train_type_feature_20190825_2.csv', index_col='id')

In [76]:
# Start the blended submission as an independent copy of submission_1.
submission_3 = submission_1.copy()

In [77]:
# Weighted blend of the two submissions: `ratio` is the weight of submission_1,
# the remainder goes to submission_2. Rows are matched by the 'id' index.
ratio = 0.77
blended = (submission_1['scalar_coupling_constant'] * ratio + submission_2['scalar_coupling_constant'] * (1 - ratio))

# Rebuild submission_3 from submission_1 on every run instead of mutating it in
# place: after an in-place reset_index a re-run of this cell would blend against
# a positional index and silently produce misaligned / NaN values.
submission_3 = submission_1.assign(scalar_coupling_constant=blended).reset_index()

# An id missing from submission_2 would surface here as NaN.
assert submission_3['scalar_coupling_constant'].notna().all(), 'blend produced NaN - submissions do not cover the same ids'

In [78]:
# Write the blended submission (index omitted) to the working directory and
# preview its first rows.
submission_3.to_csv('train_type_feature_20190828_blending_3.csv', index=False)
submission_3.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,18.62161063200206
1,4658148,196.61284691704384
2,4658149,9.831053684298508
3,4658150,196.58696722937177
4,4658151,18.40240477781641
