# Introduction
- distance とその統計量  
- cos features  
- dist_interact  
- openbabelcharge 特徴量(nb32)
- fc
- nb36 の特徴量を用いて、NNを使用する

# Import everything I need :)

In [2]:
import time
import multiprocessing
import glob
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import lightgbm as lgb
from fastprogress import progress_bar

# Preparation

In [3]:
nb = 38
isSmallSet = False
length = 2000
is_cuda = torch.cuda.is_available()
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(3))

True
Tesla V100-SXM2-32GB


In [4]:
pd.set_option('display.max_columns', 200)
warnings.filterwarnings('ignore')

In [5]:
file_path = '../input/champs-scalar-coupling/'
glob.glob(file_path + '*')

['../input/champs-scalar-coupling/scalar_coupling_contributions.csv',
 '../input/champs-scalar-coupling/magnetic_shielding_tensors.csv',
 '../input/champs-scalar-coupling/structures.csv',
 '../input/champs-scalar-coupling/test.csv',
 '../input/champs-scalar-coupling/dipole_moments.csv',
 '../input/champs-scalar-coupling/potential_energy.csv',
 '../input/champs-scalar-coupling/sample_submission.csv',
 '../input/champs-scalar-coupling/train.csv',
 '../input/champs-scalar-coupling/nb33_train_dist-interaction.csv',
 '../input/champs-scalar-coupling/test_ob_charges_V7EstimatioofMullikenChargeswithOpenBabel.csv',
 '../input/champs-scalar-coupling/nb29_fc_test_feature.csv',
 '../input/champs-scalar-coupling/train_ob_charges_V7EstimatioofMullikenChargeswithOpenBabel.csv',
 '../input/champs-scalar-coupling/nb33_test_dist-interaction.csv',
 '../input/champs-scalar-coupling/mulliken_charges.csv',
 '../input/champs-scalar-coupling/nb29_fc_train_feature.csv']

In [6]:
# train
path = file_path + 'train.csv'
if isSmallSet:
    train = pd.read_csv(path) [:length]
else:
    train = pd.read_csv(path)

In [7]:
# test
path = file_path + 'test.csv'
if isSmallSet:
    test = pd.read_csv(path)[:length]
else:
    test = pd.read_csv(path)

In [8]:
# structure
path = file_path + 'structures.csv'
structures = pd.read_csv(path)

In [9]:
# fc_train
path = file_path + 'nb29_fc_train_feature.csv'
if isSmallSet:
    fc_train = pd.read_csv(path)[:length]
else:
    fc_train = pd.read_csv(path)

In [10]:
# fc_test
path = file_path + 'nb29_fc_test_feature.csv'
if isSmallSet:
    fc_test = pd.read_csv(path)[:length]
else:
    fc_test = pd.read_csv(path)

In [11]:
# train dist-interact
path = file_path + 'nb33_train_dist-interaction.csv'
if isSmallSet:
    dist_interact_train = pd.read_csv(path)[:length]
else:
    dist_interact_train = pd.read_csv(path)

In [12]:
# test dist-interact
path = file_path + 'nb33_test_dist-interaction.csv'
if isSmallSet:
    dist_interact_test = pd.read_csv(path)[:length]
else:
    dist_interact_test = pd.read_csv(path)

In [13]:
# ob charge train
path = file_path + 'train_ob_charges_V7EstimatioofMullikenChargeswithOpenBabel.csv'
if isSmallSet:
    ob_charge_train = pd.read_csv(path)[:length].drop(['Unnamed: 0', 'error'], axis=1)
else:
    ob_charge_train = pd.read_csv(path).drop(['Unnamed: 0', 'error'], axis=1)

In [14]:
# ob charge test
path = file_path + 'test_ob_charges_V7EstimatioofMullikenChargeswithOpenBabel.csv'
if isSmallSet:
    ob_charge_test = pd.read_csv(path)[:length].drop(['Unnamed: 0', 'error'], axis=1)
else:
    ob_charge_test = pd.read_csv(path).drop(['Unnamed: 0', 'error'], axis=1)

In [15]:
len(test), len(fc_test)

(2505542, 2505542)

In [16]:
len(train), len(fc_train)

(4658147, 4658147)

In [17]:
if isSmallSet:
    print('using SmallSet !!')
    print('-------------------')

print(f'There are {train.shape[0]} rows in train data.')
print(f'There are {test.shape[0]} rows in test data.')

print(f"There are {train['molecule_name'].nunique()} distinct molecules in train data.")
print(f"There are {test['molecule_name'].nunique()} distinct molecules in test data.")
print(f"There are {train['atom_index_0'].nunique()} unique atoms.")
print(f"There are {train['type'].nunique()} unique types.")

There are 4658147 rows in train data.
There are 2505542 rows in test data.
There are 85003 distinct molecules in train data.
There are 45772 distinct molecules in test data.
There are 29 unique atoms.
There are 8 unique types.


---
## myFunc
**metrics**

In [18]:
def kaggle_metric(df, preds):
    df["prediction"] = preds
    maes = []
    for t in df.type.unique():
        y_true = df[df.type==t].scalar_coupling_constant.values
        y_pred = df[df.type==t].prediction.values
        mae = np.log(mean_absolute_error(y_true, y_pred))
        maes.append(mae)
    return np.mean(maes)

---
**momory**

In [19]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max and c_prec == np.finfo(np.float16).precision:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max and c_prec == np.finfo(np.float32).precision:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# Feature Engineering

**dist-interact**

In [20]:
train['dist_interact'] = dist_interact_train.values
test['dist_interact'] = dist_interact_test.values

**basic**

In [21]:
def map_atom_info(df_1,df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    return df


# structure and ob_charges
ob_charge = pd.concat([ob_charge_train, ob_charge_test])
merge = pd.merge(ob_charge, structures, how='left',
                  left_on  = ['molecule_name', 'atom_index'],
                  right_on = ['molecule_name', 'atom_index'])
for atom_idx in [0,1]:
    train = map_atom_info(train, merge, atom_idx)
    test  = map_atom_info(test,  merge, atom_idx)
    
    train = train.rename(columns={'atom': f'atom_{atom_idx}',
                                        'x': f'x_{atom_idx}',
                                        'y': f'y_{atom_idx}',
                                        'z': f'z_{atom_idx}',
                                        'eem': f'eem_{atom_idx}',
                                     'mmff94': f'mmff94_{atom_idx}',
                                  'gasteiger': f'gasteiger_{atom_idx}', 
                                        'qeq': f'qeq_{atom_idx}',
                                      'qtpie': f'qtpie_{atom_idx}', 
                                  'eem2015ha': f'eem2015ha_{atom_idx}', 
                                  'eem2015hm': f'eem2015hm_{atom_idx}', 
                                  'eem2015hn': f'eem2015hn_{atom_idx}', 
                                  'eem2015ba': f'eem2015ba_{atom_idx}', 
                                  'eem2015bm': f'eem2015bm_{atom_idx}', 
                                  'eem2015bn': f'eem2015bn_{atom_idx}',})
    test = test.rename(columns={'atom': f'atom_{atom_idx}',
                                        'x': f'x_{atom_idx}',
                                        'y': f'y_{atom_idx}',
                                        'z': f'z_{atom_idx}',
                                        'eem': f'eem_{atom_idx}',
                                     'mmff94': f'mmff94_{atom_idx}',
                                  'gasteiger': f'gasteiger_{atom_idx}', 
                                        'qeq': f'qeq_{atom_idx}', 
                                      'qtpie': f'qtpie_{atom_idx}', 
                                  'eem2015ha': f'eem2015ha_{atom_idx}', 
                                  'eem2015hm': f'eem2015hm_{atom_idx}', 
                                  'eem2015hn': f'eem2015hn_{atom_idx}', 
                                  'eem2015ba': f'eem2015ba_{atom_idx}', 
                                  'eem2015bm': f'eem2015bm_{atom_idx}', 
                                  'eem2015bn': f'eem2015bn_{atom_idx}'})
#     test  =  test.rename(columns={'atom': f'atom_{atom_idx}',
#                                         'x': f'x_{atom_idx}',
#                                         'y': f'y_{atom_idx}',
#                                         'z': f'z_{atom_idx}'})

# ob_charges
# train = map_atom_info(train, ob_charge_train, 0)
# test  = map_atom_info(test,  ob_charge_test,  0)
# train = map_atom_info(train, ob_charge_train, 1)
# test  = map_atom_info(test,  ob_charge_test,  1)

`type` の特徴量から、数字を抽出  
例) 2JHC ---> 2

In [22]:
def create_type0(df):
    df['type_0'] = df['type'].apply(lambda x : x[0])
    return df

distances

In [23]:
def distances(df):
    df_p_0 = df[['x_0', 'y_0', 'z_0']].values
    df_p_1 = df[['x_1', 'y_1', 'z_1']].values
    
    df['dist'] = np.linalg.norm(df_p_0 - df_p_1, axis=1)
    df['dist_x'] = (df['x_0'] - df['x_1']) ** 2
    df['dist_y'] = (df['y_0'] - df['y_1']) ** 2
    df['dist_z'] = (df['z_0'] - df['z_1']) ** 2
    
    return df

# train = distances(train)
# test  = distances(test)

distance 統計量

In [24]:
def create_features(df):
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']
    return df

angle features

In [25]:
def map_atom_info(df_1,df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df

def create_closest(df):
    df_temp=df.loc[:,["molecule_name","atom_index_0","atom_index_1","dist","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["dist"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance', 'dist'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})

    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
    return df

def add_cos_features(df):
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    df["vec_x"]=(df['x_1']-df['x_0'])/df["dist"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["dist"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["dist"]
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]
    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)
    return df



In [26]:
%%time

print('add fc')
print(len(train), len(test))
train['fc'] = fc_train.values
test['fc']  = fc_test.values

print('type0')
print(len(train), len(test))
train = create_type0(train)
test  = create_type0(test)

print('distances')
print(len(train), len(test))
train = distances(train)
test  = distances(test)

print('create_featueres')
print(len(train), len(test))
train = create_features(train)
test  = create_features(test)

print('create_closest')
print(len(train), len(test))
train = create_closest(train)
test  = create_closest(test)

print('add_cos_features')
print(len(train), len(test))
train = add_cos_features(train)
test  = add_cos_features(test)

add fc
4658147 2505542
type0
4658147 2505542
distances
4658147 2505542
create_featueres
4658147 2505542
create_closest
4658147 2505542
add_cos_features
4658154 2505542
CPU times: user 1min 38s, sys: 2min 17s, total: 3min 56s
Wall time: 3min 56s


---
<br>
<br>
<br>
カテゴリカル特徴量 と 数値特徴量

In [27]:
# inf がある特徴量を削除
df = train
for feat in progress_bar(df.columns):
    logi = (df[feat]==np.inf)
    if sum(logi) >= 1:
        print(feat)

molecule_atom_index_0_y_1_mean_div


In [28]:
train = train.drop(['molecule_atom_index_0_y_1_mean_div'], axis=1)
test  =  test.drop(['molecule_atom_index_0_y_1_mean_div'], axis=1)

In [29]:
# int 型の列挙
for feat in train.columns:
    if train[feat].dtypes == np.dtype('int64'):
        print(feat)

id
atom_index_0
atom_index_1
molecule_couples
atom_0_couples_count
atom_1_couples_count
atom_index_closest_0
atom_index_closest_1


In [30]:
cat_cols = ['atom_1','type_0','type']
num_cols = list(set(train.columns) - set(cat_cols) - set(["scalar_coupling_constant", 'molecule_name', 'id', 'atom_0',
                                                          'atom_0_couples_count', 'atom_1_couples_count', 'atom_index_closest_0', 'atom_index_closest_1']))
print(f'カテゴリカル: {cat_cols}')
print(f'数値:        {num_cols}')

カテゴリカル: ['atom_1', 'type_0', 'type']
数値:        ['molecule_atom_index_0_dist_max', 'molecule_atom_index_1_dist_max', 'molecule_type_dist_mean_div', 'gasteiger_0', 'molecule_type_dist_mean', 'molecule_atom_index_1_dist_std', 'molecule_atom_index_1_dist_std_div', 'molecule_atom_1_dist_mean', 'y_0', 'molecule_atom_1_dist_std_diff', 'molecule_atom_index_0_dist_mean_diff', 'distance_1', 'x_closest_0', 'molecule_atom_index_0_dist_max_div', 'molecule_atom_index_1_dist_min_diff', 'molecule_atom_index_0_dist_mean_div', 'dist_interact', 'molecule_atom_1_dist_std', 'cos_1', 'y_1', 'molecule_atom_index_1_dist_mean_div', 'z_1', 'dist_y', 'mmff94_1', 'molecule_atom_index_1_dist_std_diff', 'molecule_dist_min', 'molecule_atom_index_0_dist_std_diff', 'y_closest_0', 'qeq_0', 'molecule_atom_index_1_dist_max_diff', 'molecule_atom_index_0_x_1_std', 'cos_0', 'molecule_atom_index_0_dist_min', 'x_1', 'molecule_atom_index_0_y_1_mean', 'qtpie_0', 'cos_0_1', 'molecule_type_dist_max', 'x_0', 'eem2015ba_0', 'eem20

<br>
<br>
<br>

LabelEncode
- `atom_1` = {H, C, N}
- `type_0` = {1, 2, 3}
- `type`   = {2JHC, ...}

In [31]:
for f in ['atom_1', 'type_0', 'type']:
    if f in train.columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

<br>
<br>
<br>
標準化

In [32]:
train[num_cols] = train[num_cols].fillna(0)
test[num_cols]  =  test[num_cols].fillna(0)

In [33]:
print('scaling numerical columns')
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

scaling numerical columns


---
**show features**

In [34]:
train.head(2)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,dist_interact,eem_0,mmff94_0,gasteiger_0,qeq_0,qtpie_0,eem2015ha_0,eem2015hm_0,eem2015hn_0,eem2015ba_0,eem2015bm_0,eem2015bn_0,atom_0,x_0,y_0,z_0,eem_1,mmff94_1,gasteiger_1,qeq_1,qtpie_1,eem2015ha_1,eem2015hm_1,eem2015hn_1,eem2015ba_1,eem2015bm_1,eem2015bn_1,atom_1,x_1,y_1,z_1,fc,type_0,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,atom_index_closest_0,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,0,dsgdb9nsd_000001,-3.781511,-1.178221,0,84.8076,-1.7495,0.003098,-0.499733,-0.957081,-1.643399,1.849852,-0.677664,-0.464361,-0.129878,-0.631481,-0.516885,-0.065988,H,-0.058942,0.101017,-0.028857,-2.35326,-0.134372,-0.590919,2.500749,-2.627889,-0.220397,-2.200385,-2.429678,-0.42388,-2.000177,-2.638835,0,-0.074538,0.838297,-0.055478,1.972631,0,-1.722481,-0.842808,-0.418013,-0.830647,-2.643478,-8.777096,0.842388,-17.888431,4,4,-0.556914,1.177065,0.28748,0.297534,-0.954753,-3.02904,-0.462845,-4.628944,0.721346,0.709285,-6.266844,-0.315383,0.067509,-0.129872,1.685213,2.044782,-3.588055,1.114701,-0.35018,-3.47621,-5.328886e-06,-0.257917,-3.776541,-0.987156,-0.717324,-0.767633,1.022706,1.092895,-2.01175,0.571017,-1.277446,-8.01584,-0.502063,1.393794,1.577592,-4.61133,0.730822,-1.871366,1.651783,-1.790897,-1.7e-05,-0.068309,-1.689914,-1.876404,-1.181903,1.647381,0,-0.080289,0.726165,-0.04609,3,-0.37314,0.914106,-0.621102,-0.129872,-0.490765,0.698087,-1.215359,-0.66158
1,1,dsgdb9nsd_000001,-3.781511,-0.777737,3,-11.257,-0.741271,0.003098,-0.499733,-0.957081,-1.643399,1.849852,-0.677664,-0.464361,-0.129878,-0.631481,-0.516885,-0.065988,H,-0.058942,0.101017,-0.028857,0.849271,-0.134372,0.191465,-0.684059,0.717267,-0.266437,0.858001,0.879389,-0.168474,0.86644,1.022241,1,0.617917,1.052165,-0.061629,-0.796477,1,-0.784301,-0.406313,-0.033681,-0.830661,-2.643478,-8.777096,0.842388,-17.888431,4,1,-0.556914,1.177065,-0.110587,0.297534,-1.30519,-3.02904,-0.462845,-4.628944,-0.240419,-0.489393,-6.266844,-1.24198,-0.861893,-0.129872,0.755201,0.340917,-3.588055,0.188345,-1.022697,-1.582825,1.2224099999999998e-19,-0.257909,-2.478166,-0.987156,-0.717324,0.481321,1.022714,1.092918,-2.01176,1.817684,-1.277458,-3.64969,1.655812,1.393802,1.577616,-4.611262,-0.188767,-1.87128,0.553795,-0.815413,0.000132,-0.068049,-0.91164,-0.617287,-1.181825,0.601918,0,-0.080289,0.726165,-0.04609,0,-0.062445,0.733214,-0.033964,-0.129872,-0.490753,-0.404405,-0.282954,1.129537


In [35]:
print(len(test.columns))
print(test.columns)

107
Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'dist_interact', 'eem_0', 'mmff94_0', 'gasteiger_0', 'qeq_0',
       ...
       'z_closest_0', 'atom_index_closest_1', 'x_closest_1', 'y_closest_1',
       'z_closest_1', 'distance_0', 'distance_1', 'cos_0_1', 'cos_0', 'cos_1'],
      dtype='object', length=107)


# create train, test data

In [36]:
train = reduce_mem_usage(train)
test  = reduce_mem_usage(test)
y = train['scalar_coupling_constant']
train = train.drop(['id', 'molecule_name', 'atom_0', 'scalar_coupling_constant'], axis=1)
test  =  test.drop(['id', 'molecule_name', 'atom_0'], axis=1)

X = train.copy()
X_test = test.copy()

assert len(X.columns) == len(X_test.columns), f'X と X_test のサイズが違います X: {len(X.columns)}, X_test: {len(X_test.columns)}'

Mem. usage decreased to 3638.29 Mb (6.1% reduction)
Mem. usage decreased to 1937.86 Mb (6.1% reduction)


In [37]:
del train, test

In [38]:
gc.collect()

35

# Training model

**params**

In [39]:
# ----- set params -----
n_folds = 4
# batch_size = 200
batch_size = int(1024 * 2 * 8)
train_epochs = 150
VERBOSE = 30
n_feats =  X.shape[1]
# -----------------------

In [40]:
folds = KFold(n_splits=n_folds, shuffle=True)

In [41]:
# sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# analysis
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in progress_bar([i * 0.01 for i in range(100)]):
        score = accuracy_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'accuracy_score': best_score}
    return search_result

# Model
class Model(nn.Module):
    def __init__(self, in_features, out_features, p=0.2, bias=True):
        super(Model, self).__init__()
        self.fc0 = nn.Linear(in_features, 100, bias)
        self.bn0 = nn.BatchNorm1d(100)
        
        self.fc1 = nn.Linear(100, 512, bias)
        self.bn1 = nn.BatchNorm1d(512)
        
        self.fc2 = nn.Linear(512, 1024, bias)
        self.bn2 = nn.BatchNorm1d(1024)
        
        self.fc3 = nn.Linear(1024, 1024, bias)
        self.bn3 = nn.BatchNorm1d(1024)
        
        self.fc4 = nn.Linear(1024, 512, bias)
        self.bn4 = nn.BatchNorm1d(512)
        
        self.fc5 = nn.Linear(512, 512, bias)
        self.bn5 = nn.BatchNorm1d(512)
        
        self.fc6 = nn.Linear(512, 256, bias)
        self.bn6 = nn.BatchNorm1d(256)
        
        self.fc7 = nn.Linear(256, 256, bias)
        self.bn7 = nn.BatchNorm1d(256)
        
        self.fc8 = nn.Linear(256, 128, bias)
        self.bn8 = nn.BatchNorm1d(128)
        
        self.fc8 = nn.Linear(256, 128, bias)
        self.bn8 = nn.BatchNorm1d(128)
        
        self.fc9 = nn.Linear(128, 64, bias)
        self.bn9 = nn.BatchNorm1d(64)
        
        self.fc10= nn.Linear(64, out_features)
        
        
        # その他
        self.drop = nn.Dropout(p)
        self.leaky_relu = nn.LeakyReLU(0.01)
        
    def forward(self, x):
        x = self.fc0(x)
        x = self.bn0(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc5(x)
        x = self.bn5(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc6(x)
        x = self.bn6(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc7(x)
        x = self.bn7(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc8(x)
        x = self.bn8(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc9(x)
        x = self.bn9(x)
        x = self.leaky_relu(x)
        x = self.drop(x)
        
        x = self.fc10(x)
        return x
    

In [42]:
%%time
# Kfold のループ部分
train_preds = np.zeros((len(X)))
test_preds  = np.zeros((len(X_test)))
for i, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    # X, y, X_val, y_val をテンソル化(PyTorch で扱える形に変換)し、 .cuda() (GPUで計算するために特徴量を GPU に渡す処理)をする。
    X_train_fold = torch.tensor(X.iloc[train_idx, :].values, dtype=torch.float32)
    X_val_fold   = torch.tensor(X.iloc[valid_idx, :].values, dtype=torch.float32)
    X_test_      = torch.tensor(X_test.iloc[:,:].values, dtype=torch.float32) 
    y_train_fold = torch.tensor(y[train_idx, np.newaxis], dtype=torch.float32)
    y_val_fold   = torch.tensor(y[valid_idx, np.newaxis], dtype=torch.float32)
    
    # model を呼び出し
    model = Model(n_feats, 1)
    
    # gpu 使えるならcudaに渡す
    if is_cuda:
        X_train_fold = X_train_fold.cuda()
        y_train_fold = y_train_fold.cuda()
        X_val_fold   = X_val_fold.cuda()
        y_val_fold   = y_val_fold.cuda()
        model = model.cuda()
        model = nn.DataParallel(model) # make parallel
#         model = nn.DataParallel(model).cuda() # マルチGPU
#         cudnn.benchmark = True
    
    # loss 関数を呼び出す。BCELoss() よりも好まれるらしい。。
#     loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")
    loss_fn = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    # dataloader で扱える形( = Dataset )にする
    train_ = torch.utils.data.TensorDataset(X_train_fold, y_train_fold)
    valid_ = torch.utils.data.TensorDataset(X_val_fold, y_val_fold)
    test_  = torch.utils.data.TensorDataset(X_test_)
    
    # X_train_fold batch_size個, y_train_fold batch_size個ずつを各ループで返す iterater の定義
    train_loader = torch.utils.data.DataLoader(train_, batch_size=batch_size, shuffle=True)
    # X_valid_fold batch_size個, y_valid_fold batch_size個ずつを各ループで返す iterater の定義
    valid_loader = torch.utils.data.DataLoader(valid_, batch_size=batch_size, shuffle=False)
    # X_test batch_size個, y_valid_fold batch_size個ずつを各ループで返す iterater の定義
    test_loader = torch.utils.data.DataLoader(test_, batch_size=batch_size, shuffle=False)
    
    print('-'*70)
    print(f'- Fold {i + 1}/{n_folds}')
    print(f'Fold {i + 1} started at {time.ctime()}')
    
    # epoch 分のループを回す
    for epoch in range(train_epochs):
        start_time = time.time()
        
        # model を train mode にする
        model.train()
        avg_loss = 0.

        # X_train_fold と y_train_fold を batch_size 個ずつ渡すループ
#         for X_batch, y_batch in progress_bar(train_loader):
        for X_batch, y_batch in train_loader:
            # predict
            y_pred = model.forward(X_batch)
            # loss の計算
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        if ((epoch+1)%VERBOSE==0) or (epoch+1==train_epochs):
            model.eval()
            valid_preds_fold = np.zeros((X_val_fold.size(0)))
            test_preds_fold = np.zeros(len(X_test_))
            avg_val_loss = 0.
            for i, (X_batch, y_batch) in enumerate(valid_loader):
                y_pred = model(X_batch).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                valid_preds_fold[i * batch_size:(i+1) * batch_size] = y_pred.cpu().numpy()[:, 0] #sigmoid(y_pred.cpu().numpy())[:, 0]

            elapsed_time = time.time() - start_time 
            print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, train_epochs, avg_loss, avg_val_loss, elapsed_time))

    # X_test_fold を batch_size ずつ渡すループ    
    for i, (X_batch,) in enumerate(test_loader):
        y_pred = model(X_batch).detach()

        # batch_size のリストのリストになっているのを単一階層のリストに変換して、cpuに値を渡し、テンソルから numpy.array()に変換したものを sigmoid 関数に渡す
        test_preds_fold[i * batch_size:(i+1) * batch_size] = y_pred.cpu().numpy()[:, 0] #sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold

    # 予測値の kfold数で割った値を加える
    test_preds += test_preds_fold / n_folds
    
X['scalar_coupling_constant'] = y
cv_score = kaggle_metric(X, train_preds)
X = X.drop(['scalar_coupling_constant', 'prediction'], axis=1)
print('\n CV mean score(group log mae): {0:.4f}\n'.format(cv_score))

----------------------------------------------------------------------
- Fold 1/4
Fold 1 started at Wed Aug  7 05:07:09 2019
Epoch 30/150 	 loss=1.9438 	 val_loss=1.1280 	 time=48.29s
Epoch 60/150 	 loss=1.7287 	 val_loss=1.2109 	 time=48.03s
Epoch 90/150 	 loss=1.6435 	 val_loss=1.3228 	 time=48.27s
Epoch 120/150 	 loss=1.6125 	 val_loss=1.4109 	 time=48.06s
Epoch 150/150 	 loss=1.5932 	 val_loss=1.3817 	 time=48.19s
----------------------------------------------------------------------
- Fold 2/4
Fold 2 started at Wed Aug  7 06:43:27 2019
Epoch 30/150 	 loss=1.9404 	 val_loss=1.1170 	 time=48.35s
Epoch 60/150 	 loss=1.7202 	 val_loss=1.3507 	 time=48.37s


KeyboardInterrupt: 

# Save

**submission**

In [43]:
path_submittion = '../output/' + 'nb{}_submission_nn_{:.4f}.csv'.format(nb, cv_score)
# path_submittion = 'nb{}_submission_lgb_{}.csv'.format(nb, cv_score)
print(f'save pash: {path_submittion}')

NameError: name 'cv_score' is not defined

In [None]:
submittion = pd.read_csv('../input/champs-scalar-coupling/sample_submission.csv')
# submittion = pd.read_csv('./input/champs-scalar-coupling/sample_submission.csv')[:100]
if isSmallSet :
    print('using small set')
else :
    submittion['scalar_coupling_constant'] = test_preds
    submittion.to_csv(path_submittion, index=False)  if not isSmallSet else print('using small set')

---
**result**

In [None]:
path_oof = '../output/' + 'nb{}_oof_nn_{:.4f}.csv'.format(nb, cv_score)
print(f'save pash: {path_oof}')

In [None]:
if isSmallSet:
    print('using small set')
else :
    oof = pd.DataFrame({'oof':train_preds})
    oof.to_csv(path_oof, index=False) if not isSmallSet else print('using small set')

# analysis

In [None]:
plot_data = pd.DataFrame(y)
plot_data.index.name = 'id'
plot_data['yhat'] = train_preds
plot_data['type'] = lbl.inverse_transform(X['type'])

def plot_oof_preds(ctype, llim, ulim):
        plt.figure(figsize=(6,6))
        sns.scatterplot(x='scalar_coupling_constant',y='yhat',
                        data=plot_data.loc[plot_data['type']==ctype,
                        ['scalar_coupling_constant', 'yhat']]);
        plt.xlim((llim, ulim))
        plt.ylim((llim, ulim))
        plt.plot([llim, ulim], [llim, ulim])
        plt.xlabel('scalar_coupling_constant')
        plt.ylabel('predicted')
        plt.title(f'{ctype}', fontsize=18)
        plt.show()

plot_oof_preds(0, 20, 250)
plot_oof_preds(1, 10, 100)
plot_oof_preds(2, -40, 50)
plot_oof_preds(3, -50, 30)
plot_oof_preds(4, -25, 25)
plot_oof_preds(5, -40, 90)
plot_oof_preds(6, -20, 20)
plot_oof_preds(7, -10, 15)