# Introduction
- nb29では、typeごとにパラメータを決めなかった。
- このノートブックでは、パラメータをtypeごとに決める
- nb29 で精度の良い`oof_fc`を計算するためである

# Import everything I need :)

In [36]:
import time
import multiprocessing
import glob
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from functools import partial
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
# from fastprogress import progress_bar

# Preparation

In [37]:
nb = 30  # notebook number (presumably; the introduction refers to the previous notebook as nb29)
isSmallSet = False  # when True, train/test are cut to the first `length` rows for quick experiments
length = 10000  # number of rows kept from train/test when isSmallSet is True

In [38]:
pd.set_option('display.max_columns', 100)

In [39]:
# Directory holding the competition CSV files; the glob lists what is available in it.
file_path = '../input/champs-scalar-coupling/'
glob.glob(file_path + '*')

['../input/champs-scalar-coupling/scalar_coupling_contributions.csv',
 '../input/champs-scalar-coupling/magnetic_shielding_tensors.csv',
 '../input/champs-scalar-coupling/structures.csv',
 '../input/champs-scalar-coupling/test.csv',
 '../input/champs-scalar-coupling/dipole_moments.csv',
 '../input/champs-scalar-coupling/potential_energy.csv',
 '../input/champs-scalar-coupling/sample_submission.csv',
 '../input/champs-scalar-coupling/train.csv',
 '../input/champs-scalar-coupling/mulliken_charges.csv']

In [40]:
# train
# In small-set mode only the first `length` rows are parsed (nrows=length)
# instead of loading the complete CSV and slicing it afterwards.
path = file_path + 'train.csv'
train = pd.read_csv(path, nrows=length if isSmallSet else None)

In [41]:
# test
# In small-set mode only the first `length` rows are parsed (nrows=length)
# instead of loading the complete CSV and slicing it afterwards.
path = file_path + 'test.csv'
test = pd.read_csv(path, nrows=length if isSmallSet else None)

In [42]:
# structure
# Always loaded in full (not truncated in small-set mode); it is later joined
# onto train/test by molecule_name and atom index.
path = file_path + 'structures.csv'
structures = pd.read_csv(path)

In [43]:
# Print a short summary of the loaded data, flagging truncated (small-set) runs.
if isSmallSet:
    print('using SmallSet !!')
    print('-------------------')

print(f'There are {train.shape[0]} rows in train data.')
print(f'There are {test.shape[0]} rows in test data.')

print(f"There are {train['molecule_name'].nunique()} distinct molecules in train data.")
print(f"There are {test['molecule_name'].nunique()} distinct molecules in test data.")
# NOTE(review): this counts distinct values of atom_index_0 (positions inside a
# molecule), not distinct atoms or elements.
print(f"There are {train['atom_index_0'].nunique()} unique atoms.")
print(f"There are {train['type'].nunique()} unique types.")

There are 4658147 rows in train data.
There are 2505542 rows in test data.
There are 85003 distinct molecules in train data.
There are 45772 distinct molecules in test data.
There are 29 unique atoms.
There are 8 unique types.


---
## myFunc
**metrics**

In [44]:
def kaggle_metric(df, preds, floor=1e-9):
    """Return the mean over coupling types of log(MAE) of ``preds``.

    Args:
        df: DataFrame with a ``type`` column and a ``scalar_coupling_constant``
            column holding the true values.  NOTE: as a side effect a
            ``prediction`` column is written to ``df`` (kept for backward
            compatibility); pass a copy if the frame must stay untouched.
        preds: Predictions aligned row-by-row with ``df``.
        floor: Lower bound applied to every per-type MAE before the log is
            taken, so a perfect fit scores ``log(floor)`` instead of ``-inf``.

    Returns:
        The average of the per-type log-MAE values.  NaN values in the inputs
        propagate to the result.
    """
    df["prediction"] = preds
    log_maes = []
    for t in df["type"].unique():
        # Filter once per type and reuse the rows for both targets and predictions.
        rows = df[df["type"] == t]
        abs_err = np.abs(rows["scalar_coupling_constant"].values - rows["prediction"].values)
        log_maes.append(np.log(np.maximum(abs_err.mean(), floor)))
    return np.mean(log_maes)

---
**memory**

In [45]:
def reduce_mem_usage(df, verbose=True):
    """Narrow the integer columns of ``df`` in place to save memory.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose range strictly contains the column's minimum
    and maximum.  Float columns are never narrowed to a less precise type: a
    float16/float32 column is kept as is when its values lie inside that
    type's range, and every other float column ends up as float64.

    The float precision is taken from the column dtype.  It is a property of
    the dtype, not of the individual values, so no per-element pass is needed.

    Args:
        df: DataFrame to shrink; its columns are replaced in place.
        verbose: When True, print the resulting size and the reduction.

    Returns:
        The same DataFrame object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First (smallest) integer type that strictly contains the range.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(int_type)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            c_prec = np.finfo(col_type).precision
            for float_type in (np.float16, np.float32):
                bounds = np.finfo(float_type)
                if c_min > bounds.min and c_max < bounds.max and c_prec == bounds.precision:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # No narrower type matches both the range and the precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Feature Engineering

**basic**

In [46]:
def map_atom_info(df_1, df_2, atom_idx):
    """Left-join per-atom rows of ``df_2`` onto the pairs in ``df_1``.

    Rows are matched on ``molecule_name`` and on ``atom_index_{atom_idx}`` of
    ``df_1`` against ``atom_index`` of ``df_2``.  The helper ``atom_index``
    column brought in by the join is removed from the result.

    Args:
        df_1: Pair table with ``molecule_name`` and ``atom_index_{atom_idx}``.
        df_2: Atom table with ``molecule_name`` and ``atom_index``.
        atom_idx: Which atom of the pair to look up (used in the column name).

    Returns:
        A new DataFrame with the columns of ``df_1`` followed by the remaining
        columns of ``df_2``.
    """
    pair_keys = ['molecule_name', f'atom_index_{atom_idx}']
    atom_keys = ['molecule_name', 'atom_index']
    joined = df_1.merge(df_2, how='left', left_on=pair_keys, right_on=atom_keys)
    return joined.drop(columns='atom_index')


# Attach element symbol and coordinates of both atoms of every pair; the joined
# columns get the atom position (0 or 1) as suffix.
for atom_idx in [0, 1]:
    suffixed = {col: f'{col}_{atom_idx}' for col in ('atom', 'x', 'y', 'z')}
    train = map_atom_info(train, structures, atom_idx).rename(columns=suffixed)
    test = map_atom_info(test, structures, atom_idx).rename(columns=suffixed)

`type` の特徴量から、数字を抽出  
例) 2JHC ---> 2

In [47]:
# First character of the coupling type, e.g. '2JHC' -> '2'.
for frame in (train, test):
    frame['type_0'] = frame['type'].map(lambda coupling: coupling[0])

add angle features
- angle feature を追加するためには、dist が必要

In [48]:
# dist
coords_0 = ['x_0', 'y_0', 'z_0']
coords_1 = ['x_1', 'y_1', 'z_1']
train_p_0, train_p_1 = train[coords_0].values, train[coords_1].values
test_p_0, test_p_1 = test[coords_0].values, test[coords_1].values

# Euclidean distance between the two atoms of each pair.
train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

# Squared coordinate difference along each axis.
for axis in ('x', 'y', 'z'):
    for frame in (train, test):
        frame[f'dist_{axis}'] = (frame[f'{axis}_0'] - frame[f'{axis}_1']) ** 2

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 630.81 Mb (11.2% reduction)
Mem. usage decreased to 320.19 Mb (11.8% reduction)


In [49]:
def add_cos_features(df):
    """Add closest-partner geometry and three cosine features to ``df``.

    For every atom the closest partner is searched among the pairs present in
    ``df`` (each pair is considered in both directions), not among all atoms
    of the molecule.  For k in (0, 1) the columns ``atom_index_closest_k``,
    ``dist_closest_k``, ``x/y/z_closest_k`` and ``dist_k`` describe the closest
    partner of atom k.  With ``vec_k`` the unit vector pointing from that
    partner to atom k and ``vec`` the unit vector from atom 0 to atom 1:

    * ``cos_0_1`` = vec_0 . vec_1
    * ``cos_0``   = vec_0 . vec
    * ``cos_1``   = vec_1 . vec

    Args:
        df: Pair table with ``molecule_name``, ``atom_index_0``,
            ``atom_index_1``, ``dist`` and the coordinates ``x_0`` ... ``z_1``.

    Returns:
        A new DataFrame with the columns above appended, passed through
        ``reduce_mem_usage``.
    """
    pair_cols = ["molecule_name", "atom_index_0", "atom_index_1", "dist",
                 "x_0", "y_0", "z_0", "x_1", "y_1", "z_1"]
    pairs = df.loc[:, pair_cols].copy()

    # Mirror every pair so that each atom also appears in the "atom 0" role.
    mirrored = pairs.rename(columns={'atom_index_0': 'atom_index_1',
                                     'atom_index_1': 'atom_index_0',
                                     'x_0': 'x_1',
                                     'y_0': 'y_1',
                                     'z_0': 'z_1',
                                     'x_1': 'x_0',
                                     'y_1': 'y_0',
                                     'z_1': 'z_0'})
    # sort=False: the two frames list their columns in a different order, which
    # otherwise triggers pandas' "non-concatenation axis is not aligned"
    # FutureWarning; columns are only accessed by name below.
    both = pd.concat((pairs, mirrored), axis=0, sort=False)

    # Keep, per atom, the row(s) holding its shortest distance.
    both["min_dist"] = both.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    closest = both[both["min_dist"] == both["dist"]]
    closest = closest.drop(['x_0', 'y_0', 'z_0', 'min_dist'], axis=1)
    closest = closest.rename(columns={'atom_index_0': 'atom_index',
                                      'atom_index_1': 'atom_index_closest',
                                      'dist': 'dist_closest',
                                      'x_1': 'x_closest',
                                      'y_1': 'y_closest',
                                      'z_1': 'z_closest'})

    # Some atoms have several partners at exactly the same distance; keep one
    # row per atom so the joins below do not multiply rows.
    closest = closest.drop_duplicates(subset=['molecule_name', 'atom_index'])

    for atom_idx in [0, 1]:
        df = map_atom_info(df, closest, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                'dist_closest': f'dist_closest_{atom_idx}',
                                'x_closest': f'x_closest_{atom_idx}',
                                'y_closest': f'y_closest_{atom_idx}',
                                'z_closest': f'z_closest_{atom_idx}'})

    axes = ('x', 'y', 'z')
    # Offsets: closest partner -> atom (for both atoms) and atom 0 -> atom 1.
    # They are kept as local Series instead of temporary columns of the frame.
    off_0 = [df[f'{a}_0'] - df[f'{a}_closest_0'] for a in axes]
    off_1 = [df[f'{a}_1'] - df[f'{a}_closest_1'] for a in axes]
    off_01 = [df[f'{a}_1'] - df[f'{a}_0'] for a in axes]

    df["dist_0"] = (off_0[0]**2 + off_0[1]**2 + off_0[2]**2)**(1/2)
    df["dist_1"] = (off_1[0]**2 + off_1[1]**2 + off_1[2]**2)**(1/2)

    unit_0 = [o / df["dist_0"] for o in off_0]
    unit_1 = [o / df["dist_1"] for o in off_1]
    unit_01 = [o / df["dist"] for o in off_01]

    def _dot(u, v):
        # Dot product of two 3-component vectors given as lists of Series.
        return u[0]*v[0] + u[1]*v[1] + u[2]*v[2]

    df["cos_0_1"] = _dot(unit_0, unit_1)
    df["cos_0"] = _dot(unit_0, unit_01)
    df["cos_1"] = _dot(unit_1, unit_01)

    df = reduce_mem_usage(df)

    return df

# Add the closest-partner and cosine features to both data sets.
train = add_cos_features(train)
test  = add_cos_features(test)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Mem. usage decreased to 1101.70 Mb (0.0% reduction)
Mem. usage decreased to 573.47 Mb (0.0% reduction)


続いて、**分子単位**で統計量を計算

In [50]:
def create_features(df):
    """Add per-group aggregate features to ``df``.

    Statistics (count/mean/min/max/std) of ``dist`` and of the atom-1
    coordinates are computed per molecule, per (molecule, atom_index_0),
    per (molecule, atom_index_1), per (molecule, atom_1), per (molecule,
    type_0) and per (molecule, type) and broadcast back to every row.  For
    many of them a ``_diff`` (statistic minus the row's own value) and a
    ``_div`` (statistic divided by the row's own value) column is added too.

    Each grouping is built once and reused for all statistics derived from
    it.  Column names, column order and values are the same as when every
    feature is computed with its own ``groupby``.

    Args:
        df: Pair table containing ``id``, ``molecule_name``, ``atom_index_0``,
            ``atom_index_1``, ``atom_1``, ``type``, ``type_0``, ``dist``,
            ``x_1``, ``y_1`` and ``z_1``.  New columns are added in place.

    Returns:
        The frame after ``reduce_mem_usage``.
    """
    dist = df['dist']
    y_1 = df['y_1']

    # Select every grouped column up front, before any column is added.
    by_molecule = df.groupby('molecule_name')
    by_atom_0 = df.groupby(['molecule_name', 'atom_index_0'])
    by_atom_1 = df.groupby(['molecule_name', 'atom_index_1'])
    mol_id, mol_dist = by_molecule['id'], by_molecule['dist']
    a0_id, a0_dist = by_atom_0['id'], by_atom_0['dist']
    a0_x, a0_y, a0_z = by_atom_0['x_1'], by_atom_0['y_1'], by_atom_0['z_1']
    a1_id, a1_dist = by_atom_1['id'], by_atom_1['dist']
    elem_dist = df.groupby(['molecule_name', 'atom_1'])['dist']
    type_0_dist = df.groupby(['molecule_name', 'type_0'])['dist']
    type_dist = df.groupby(['molecule_name', 'type'])['dist']

    def add_stat(name, grouped, stat, base=None, diff=False, div=False):
        # Broadcast one group statistic and, on request, its difference and
        # ratio with respect to the row's own value (`base`).
        df[name] = grouped.transform(stat)
        if diff:
            df[f'{name}_diff'] = df[name] - base
        if div:
            df[f'{name}_div'] = df[name] / base

    df['molecule_couples'] = mol_id.transform('count')
    df['molecule_dist_mean'] = mol_dist.transform('mean')
    df['molecule_dist_min'] = mol_dist.transform('min')
    df['molecule_dist_max'] = mol_dist.transform('max')
    df['atom_0_couples_count'] = a0_id.transform('count')
    df['atom_1_couples_count'] = a1_id.transform('count')

    add_stat('molecule_atom_index_0_x_1_std', a0_x, 'std')
    add_stat('molecule_atom_index_0_y_1_mean', a0_y, 'mean', base=y_1, diff=True, div=True)
    add_stat('molecule_atom_index_0_y_1_max', a0_y, 'max', base=y_1, diff=True)
    add_stat('molecule_atom_index_0_y_1_std', a0_y, 'std')
    add_stat('molecule_atom_index_0_z_1_std', a0_z, 'std')

    for prefix, grouped in (('molecule_atom_index_0_dist', a0_dist),
                            ('molecule_atom_index_1_dist', a1_dist)):
        for stat in ('mean', 'max', 'min', 'std'):
            add_stat(f'{prefix}_{stat}', grouped, stat, base=dist, diff=True, div=True)

    add_stat('molecule_atom_1_dist_mean', elem_dist, 'mean')
    add_stat('molecule_atom_1_dist_min', elem_dist, 'min', base=dist, diff=True, div=True)
    add_stat('molecule_atom_1_dist_std', elem_dist, 'std', base=dist, diff=True)
    add_stat('molecule_type_0_dist_std', type_0_dist, 'std', base=dist, diff=True)
    add_stat('molecule_type_dist_mean', type_dist, 'mean', base=dist, diff=True, div=True)
    add_stat('molecule_type_dist_max', type_dist, 'max')
    add_stat('molecule_type_dist_min', type_dist, 'min')
    add_stat('molecule_type_dist_std', type_dist, 'std', base=dist, diff=True)

    df = reduce_mem_usage(df)
    return df

In [51]:
%%time
# Add the per-group aggregate features to the training pairs (timed cell).
train = create_features(train)

Mem. usage decreased to 2896.42 Mb (3.0% reduction)
CPU times: user 13min 9s, sys: 39.8 s, total: 13min 49s
Wall time: 13min 49s


In [52]:
%%time
# Add the per-group aggregate features to the test pairs (timed cell).
test = create_features(test)

Mem. usage decreased to 1538.82 Mb (3.0% reduction)
CPU times: user 6min 58s, sys: 19.4 s, total: 7min 18s
Wall time: 7min 18s


---
LabelEncode
- `atom_1` = {H, C, N}
- `type_0` = {1, 2, 3}
- `type`   = {2JHC, ...}

In [53]:
# Integer-encode the categorical columns; each encoder is fitted on the values
# of train and test together so both share one mapping.
for f in ['atom_1', 'type_0', 'type']:
    if f not in train.columns:
        continue
    train_values = list(train[f].values)
    test_values = list(test[f].values)
    lbl = LabelEncoder()
    lbl.fit(train_values + test_values)
    train[f] = lbl.transform(train_values)
    test[f] = lbl.transform(test_values)

---
**show features**

In [54]:
train.head(2)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,type_0,dist,dist_x,dist_y,dist_z,atom_index_closest_0,dist_closest_0,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist_closest_1,x_closest_1,y_closest_1,z_closest_1,dist_0,dist_1,cos_0_1,cos_0,cos_1,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,0,84.8076,H,0.00215,-0.006031,0.001976,0,-0.012698,1.085804,0.008001,0,1.091953,0.00022,1.192105,3.6e-05,0,1.091953,-0.012698,1.085804,0.008001,3,1.091946,-0.540815,1.447527,-0.876644,1.091953,1.091946,0.333335,-1.0,-0.333335,10,1.506668,1.091946,1.783158,4,4,0.727907,1.358754,0.272949,1.25138,1.463751,0.377947,0.182278,0.727957,1.610344,0.518391,1.474738,1.783157,0.691204,1.632998,1.091953,0.0,1.0,0.345594,-0.746359,0.316492,1.09195,-3e-06,0.999997,1.091953,0.0,1.0,1.091946,-7e-06,0.999994,3e-06,-1.09195,3e-06,1.09195,1.091946,-7e-06,0.999994,3e-06,-1.09195,3e-06,-1.09195,1.09195,-3e-06,0.999997,1.091953,1.091946,3e-06,-1.09195
1,1,dsgdb9nsd_000001,1,2,3,-11.257,H,0.00215,-0.006031,0.001976,1,1.011731,1.463751,0.000277,1,1.78312,1.019253,2.160261,3e-06,0,1.091953,-0.012698,1.085804,0.008001,0,1.091952,-0.012698,1.085804,0.008001,1.091953,1.091952,-0.333287,-0.816483,0.816482,10,1.506668,1.091946,1.783158,4,1,0.727907,1.358754,-0.104998,0.928268,1.463751,0.0,0.182278,0.727957,1.610344,-0.172776,0.903105,1.783157,3.7e-05,1.000021,1.091953,-0.691167,0.612383,0.345594,-1.437526,0.193814,1.78312,0.0,1.0,1.78312,0.0,1.0,1.78312,0.0,1.0,,,,1.783146,1.78312,0.0,1.0,1.4e-05,-1.783106,1.4e-05,-1.783106,1.783146,2.7e-05,1.000015,1.783158,1.78312,1.4e-05,-1.783106


In [55]:
print(train.columns)

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1',
       'x_1', 'y_1', 'z_1', 'type_0', 'dist', 'dist_x', 'dist_y', 'dist_z',
       'atom_index_closest_0', 'dist_closest_0', 'x_closest_0', 'y_closest_0',
       'z_closest_0', 'atom_index_closest_1', 'dist_closest_1', 'x_closest_1',
       'y_closest_1', 'z_closest_1', 'dist_0', 'dist_1', 'cos_0_1', 'cos_0',
       'cos_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min',
       'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std',
       'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean',
       'molecule_atom_index_0_dist_me

# create train, test data

In [56]:
# Split off the target and remove the identifier / non-feature columns.
y = train['scalar_coupling_constant']
train = train.drop(['id', 'molecule_name', 'atom_0', 'scalar_coupling_constant'], axis=1)
test  =  test.drop(['id', 'molecule_name', 'atom_0'], axis=1)

# drop() already returned new frames, so an extra .copy() would only duplicate
# several GB of data; X / X_test simply take over the reduced frames.
X = train
X_test = test

In [57]:
del train, test

In [58]:
gc.collect()

2608

# Hyperopt

In [59]:
# Hold out 30% of the rows (fixed seed) for scoring the hyper-parameter trials.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.30, random_state=0)

In [60]:
# Use all but three cores, but never request fewer than one thread:
# cpu_count() - 3 is zero or negative on machines with three cores or fewer.
N_JOBS = max(1, multiprocessing.cpu_count() - 3)

In [61]:
# Define searched space
hyper_space = {'objective': 'regression',
               'metric':'mae',
               'boosting':'gbdt',
               'max_depth':  hp.choice('max_depth', [5, 8, 10, 12, 15]),
#                'num_leaves': hp.choice('num_leaves', [100, 250, 500, 650, 750, 1000,1300]),
               'num_leaves': hp.choice('num_leaves', [10, 50, 100, 250]),
               'subsample': hp.choice('subsample', [.3, .5, .7, .8, 1]),
               'subsample_freq': 1,
               'colsample_bytree': hp.choice('colsample_bytree', [ .6, .7, .8, .9, 1]),
               'learning_rate': hp.choice('learning_rate', [.1, .2, .3]),
               'reg_alpha': hp.choice('reg_alpha', [.1, .2, .3, .4, .5, .6]),
               'reg_lambda':  hp.choice('reg_lambda', [.1, .2, .3, .4, .5, .6]),               
               'min_child_samples': hp.choice('min_child_samples', [20, 45, 70, 100]),
               'verbosity': -1,
               'bagging_seed': 11,
              }

In [72]:
# Hyper-parameter search, run separately for every coupling type.

# Number of hyperopt evaluations per type.
MAX_EVALS = 15

# Search algorithm: TPE with n_startup_jobs=-1
# (presumably this disables the random warm-up trials; confirm against the hyperopt docs).
algo = partial(tpe.suggest,
               n_startup_jobs=-1)

best_params_list = []
for t in sorted(X_train['type'].unique()):
    print('-'*80)
    print(f'- Training of type {t}')
    train_mask = X_train['type'] == t
    valid_mask = X_valid['type'] == t
    X_t_train = X_train.loc[train_mask]
    X_t_valid = X_valid.loc[valid_mask]
    y_t_train = y_train[train_mask]
    y_t_valid = y_valid[valid_mask]

    # kaggle_metric only reads `type` and the target (and writes a `prediction`
    # column), so a small frame built once per type is enough; the full
    # validation feature frame does not have to be copied on every trial.
    metric_frame = pd.DataFrame({'type': X_t_valid['type'].values,
                                 'scalar_coupling_constant': y_t_valid.values})

    def evaluate_metric(params):
        """Objective for hyperopt: fit LightGBM with ``params`` on this type's
        training rows and return the competition metric on its validation rows."""
        model_lgb = lgb.LGBMRegressor(**params, n_jobs=N_JOBS, n_estimators=500)
        model_lgb.fit(X_t_train, y_t_train,
                      eval_set=[(X_t_train, y_t_train), (X_t_valid, y_t_valid)],
                      verbose=500,
                      early_stopping_rounds=100)

        pred = model_lgb.predict(X_t_valid)
        cv_score = kaggle_metric(metric_frame, pred)
        print(f'cv_score:{cv_score}')

        return {
            'loss': cv_score,
            'status': STATUS_OK,
        }

    # Trials object collecting the history of this type's search.
    trials = Trials()

    # Run the Tree-structured Parzen Estimator search.
    best_vals = fmin(evaluate_metric, space=hyper_space, verbose=1,
                     algo=algo, max_evals=MAX_EVALS, trials=trials)

    # fmin returns indices into the hp.choice lists; map them back to values.
    best_params = space_eval(hyper_space, best_vals)
    best_params_list.append(best_params)
    print("BEST PARAMETERS: " + str(best_params))

--------------------------------------------------------------------------------
- Training of type 0
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 1.78913	valid_1's l1: 2.05966
Did not meet early stopping. Best iteration is:
[500]	training's l1: 1.78913	valid_1's l1: 2.05966
cv_score:0.7225389899051997
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 1.78913	valid_1's l1: 2.05966
Did not meet early stopping. Best iteration is:
[500]	training's l1: 1.78913	valid_1's l1: 2.05966
cv_score:0.7225389899051997
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 1.27756	valid_1's l1: 1.94423
Did not meet early stopping. Best iteration is:
[500]	training's l1: 1.27756	valid_1's l1: 1.94423
cv_score:0.6648638078294542
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 1.14546	valid_1's l1: 1.80978
Did not meet early stopping. Best iteration is:
[500]	trainin

In [73]:
best_params_list

[{'bagging_seed': 11,
  'boosting': 'gbdt',
  'colsample_bytree': 0.7,
  'learning_rate': 0.1,
  'max_depth': 15,
  'metric': 'mae',
  'min_child_samples': 70,
  'num_leaves': 250,
  'objective': 'regression',
  'reg_alpha': 0.6,
  'reg_lambda': 0.3,
  'subsample': 0.8,
  'subsample_freq': 1,
  'verbosity': -1},
 {'bagging_seed': 11,
  'boosting': 'gbdt',
  'colsample_bytree': 0.7,
  'learning_rate': 0.1,
  'max_depth': 10,
  'metric': 'mae',
  'min_child_samples': 20,
  'num_leaves': 100,
  'objective': 'regression',
  'reg_alpha': 0.5,
  'reg_lambda': 0.6,
  'subsample': 0.7,
  'subsample_freq': 1,
  'verbosity': -1},
 {'bagging_seed': 11,
  'boosting': 'gbdt',
  'colsample_bytree': 0.9,
  'learning_rate': 0.1,
  'max_depth': 15,
  'metric': 'mae',
  'min_child_samples': 100,
  'num_leaves': 250,
  'objective': 'regression',
  'reg_alpha': 0.2,
  'reg_lambda': 0.3,
  'subsample': 0.5,
  'subsample_freq': 1,
  'verbosity': -1},
 {'bagging_seed': 11,
  'boosting': 'gbdt',
  'colsample_

In [74]:
# One LightGBM parameter set per coupling type, in the order in which the
# search above appended its results (sorted label-encoded type).
# Values shared by every type:
_shared_lgb_params = {
    'bagging_seed': 11,
    'boosting': 'gbdt',
    'metric': 'mae',
    'objective': 'regression',
    'subsample_freq': 1,
    'verbosity': -1,
}
# Values that differ per type, one row per type:
_tuned_keys = ('colsample_bytree', 'learning_rate', 'max_depth', 'min_child_samples',
               'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample')
_tuned_rows = [
    (0.7, 0.1, 15, 70, 250, 0.6, 0.3, 0.8),
    (0.7, 0.1, 10, 20, 100, 0.5, 0.6, 0.7),
    (0.9, 0.1, 15, 100, 250, 0.2, 0.3, 0.5),
    (0.8, 0.2, 12, 70, 100, 0.6, 0.2, 0.8),
    (1, 0.1, 15, 100, 250, 0.6, 0.4, 0.7),
    (1, 0.2, 15, 100, 250, 0.2, 0.6, 1),
    (0.7, 0.2, 15, 70, 250, 0.6, 0.2, 1),
    (0.7, 0.2, 12, 45, 250, 0.1, 0.4, 0.8),
]
# Keys are kept in alphabetical order within every dict.
lgb_params_list = [
    dict(sorted({**_shared_lgb_params, **dict(zip(_tuned_keys, row))}.items()))
    for row in _tuned_rows
]

In [75]:
len(lgb_params_list)

8