In [0]:
# Colab-only setup: mount Google Drive at /gdrive so the project data is reachable.
# The mount prompts for an authorization code the first time it runs in a session.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········


In [0]:
# Work from the Drive root: the data paths in the config cell are relative ('./analysis/...').
%cd "/gdrive/My Drive"

/gdrive/My Drive


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import math

from tqdm import tqdm
import joblib
import gc

## config

In [0]:
# Raw competition files (train.csv, test.csv, structures.csv, ...), relative to the working directory.
INPUT = './analysis/mole/data/raw/'
TRAIN_PATH = INPUT + 'train.csv'
TEST_PATH = INPUT + 'test.csv'
# Directory holding precomputed feature pickles and the artifacts written by this notebook.
PREPROCESS = './analysis/mole/data/preprocess/'

# Pickle written by CNTR.train (models for one contribution column).
MID_MODEL_PATH = PREPROCESS + 'middle_model.pkl'
# Pickle written by train_single_model.
MODEL_PATH = PREPROCESS + 'model.pkl'
# Fitted Encoder dumped in 'train' mode and reloaded in 'predict' mode by preprocess().
ENCODER_PATH = PREPROCESS + 'le.pkl'

# NOTE(review): presumably passed as `use_preprocess_data` to train_models_each_type — confirm at the call site.
USE_PREPROCESS_DATA = False
# Name of the regression target column.
TARGET = 'scalar_coupling_constant'
# Columns that identify one atom pair; used as the merge key for per-pair tables.
MERGE_KEY = ['molecule_name', 'atom_index_0', 'atom_index_1']
# Contribution columns merged from scalar_coupling_contributions.csv; excluded from the feature set.
CONTR_COLS = ['fc', 'sd', 'pso', 'dso']
# Number of cross-validation splits used by oof_train.
N_FOLDS = 3

# Atomic weight per element symbol; mapped onto the 2J/3J center atoms as a numeric feature.
atom_weight = {'H': 1.008, 'C': 12.01, 'N': 14.01, 'O':16.00}

## logging

In [0]:
import logging
import logging.handlers


def create_logger(log_file_name):
    """Configure the shared 'main' logger with a rotating file handler and a console handler.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers attached by a previous call are removed and closed
    first, so each record is emitted once per destination instead of once
    per call.

    Parameters
    ----------
    log_file_name: str
        Path of the log file. It rotates at 100000 bytes, keeping 8 backups.

    Returns
    -------
    logging.Logger
        The configured 'main' logger (also available through get_logger()).
    """
    logger_ = logging.getLogger('main')
    logger_.setLevel(logging.DEBUG)

    # Drop handlers left over from an earlier call; otherwise every call
    # would add another file/console pair and duplicate all log lines.
    for stale in list(logger_.handlers):
        logger_.removeHandler(stale)
        stale.close()

    fh = logging.handlers.RotatingFileHandler(
        log_file_name, maxBytes=100000, backupCount=8)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger_.addHandler(fh)
    logger_.addHandler(ch)
    return logger_


def get_logger():
    """Return the shared logger registered under the name 'main'."""
    main_logger = logging.getLogger('main')
    return main_logger

In [0]:
# Attach the file ('mole.log') and console handlers to the shared 'main' logger.
create_logger('mole.log')

## util

In [0]:
def onehot(_df):
    """Replace every object-dtype column with its one-hot (dummy) columns.

    Parameters
    ----------
    _df: pd.DataFrame

    Returns
    -------
    pd.DataFrame
        A new frame: the non-object columns of `_df` followed by the dummy
        columns produced by `pd.get_dummies`. `_df` itself is not modified.
    """
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    cat_names = [name for name, col in _df.items() if col.dtype == 'O']
    df_cat = pd.get_dummies(_df[cat_names])
    return pd.concat([_df, df_cat], axis=1).drop(cat_names, axis=1)

def label_encode(df):
    """Label-encode every object-dtype column of `df` in place.

    A fresh LabelEncoder is fitted per column and the column is overwritten
    with its integer codes. The name of each encoded column is printed.

    Parameters
    ----------
    df: pd.DataFrame
        Modified in place.

    Returns
    -------
    pd.DataFrame
        The same `df` object, for chaining.
    """
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    cat_names = [name for name, col in df.items() if col.dtype == 'O']
    for cat_name in cat_names:
        print(cat_name)
        le = LabelEncoder()
        le.fit(df[cat_name].values)
        df[cat_name] = le.transform(df[cat_name].values)
    return df

class Encoder:
    """Bundle of per-column LabelEncoders that can be fitted once and reused.

    Attributes are kept in `encoders`, a dict mapping column name to the
    LabelEncoder fitted on that column.
    """

    def __init__(self):
        self.encoders = dict()

    def fit(self, df, cat_names):
        """Fit one LabelEncoder per column listed in `cat_names`."""
        for column in cat_names:
            fitted = LabelEncoder()
            fitted.fit(df[column].values)
            self.encoders[column] = fitted

    def transform(self, df):
        """Overwrite each fitted column of `df` with its codes and return `df`."""
        for column, fitted in self.encoders.items():
            df[column] = fitted.transform(df[column].values)

        return df


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of `df` in place to the narrowest dtype whose range holds them.

    Integer columns move to the first of int8/int16/int32/int64 whose open
    range (min, max) contains the column's values; float columns move to
    float16, then float32, and otherwise float64. Range is the only
    criterion, so float precision may be reduced.

    Parameters
    ----------
    df: pd.DataFrame
        Modified in place.
    verbose: bool
        When true, print the resulting memory usage and the reduction.

    Returns
    -------
    pd.DataFrame
        The same `df` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        current = df[name].dtypes
        if current not in numerics:
            continue

        lowest = df[name].min()
        highest = df[name].max()
        if str(current)[:3] == 'int':
            # First integer type that fits wins; if none fits the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # float64 is the fallback when neither narrower float type fits.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    chosen = candidate
                    break
            df[name] = df[name].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df

def reduce_mem_usage_v2(df, verbose=True):
    """Downcast numeric columns of `df` in place without giving up float precision.

    Integer columns are handled like reduce_mem_usage: they move to the first
    of int8/int16/int32/int64 whose open range contains their values. A float
    column is cast to float16 or float32 only when its values fit that type's
    range AND that type's decimal precision equals the precision of the
    column's current dtype; otherwise it becomes float64.

    Parameters
    ----------
    df: pd.DataFrame
        Modified in place.
    verbose: bool
        When true, print the resulting memory usage and the reduction.

    Returns
    -------
    pd.DataFrame
        The same `df` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Precision is a property of the dtype, not of individual values,
                # so read it once from the column dtype instead of calling
                # np.finfo on every element through Series.apply.
                c_prec = np.finfo(col_type).precision
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max and c_prec == np.finfo(np.float16).precision:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max and c_prec == np.finfo(np.float32).precision:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Preprocess

In [0]:
def map_atom_info(df, strct, atom_idx):
    """Attach the element and coordinates of one atom of each pair.

    Left-joins `strct` onto `df` using ('molecule_name', atom_index_<atom_idx>)
    against ('molecule_name', 'atom_index'), then renames the joined columns
    to atom_<atom_idx>, x_<atom_idx>, y_<atom_idx> and z_<atom_idx>.

    Parameters
    ----------
    df: pd.DataFrame
        Atom-pair rows.
    strct: pd.DataFrame
        Per-atom table with columns molecule_name, atom_index, atom, x, y, z.
    atom_idx: int
        Which atom of the pair to look up (0 or 1 in this notebook).
    """
    suffix = f'_{atom_idx}'
    renamed = {source: source + suffix for source in ('atom', 'x', 'y', 'z')}

    joined = df.merge(
        strct,
        how='left',
        left_on=['molecule_name', 'atom_index' + suffix],
        right_on=['molecule_name', 'atom_index'],
    )
    # The structure-side key duplicates atom_index_<atom_idx> after the join.
    return joined.drop('atom_index', axis=1).rename(columns=renamed)

def calc_dist(df):
    """Add the pair distance and per-axis squared offsets to `df` in place.

    Adds 'dist' (Euclidean distance between atom 0 and atom 1) and
    'dist_x' / 'dist_y' / 'dist_z', which hold the SQUARED coordinate
    difference along each axis. Returns the same `df` object.
    """
    start = df[['x_0', 'y_0', 'z_0']].values
    end = df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(start - end, axis=1)

    for axis in ('x', 'y', 'z'):
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2

    return df

def divide_type(df):
    """Split the 'type' label into its first character and the remainder.

    Adds 'type_0' (first character, e.g. '1' for '1JHC') and 'type_1'
    (everything after it, e.g. 'JHC') to `df` in place and returns `df`.
    """
    def leading(label):
        return label[0]

    def trailing(label):
        return label[1:]

    df['type_0'] = df['type'].apply(leading)
    df['type_1'] = df['type'].apply(trailing)
    return df

In [0]:
def feature_engineering(df):
    """Add per-molecule / per-atom aggregate features of the pair geometry, in place.

    Every new column is a group-wise statistic (count, mean, min, max, std)
    of 'dist' or of the atom-1 coordinates, optionally combined with the
    row's own value as a difference (`*_diff`) or ratio (`*_div`).

    Parameters
    ----------
    df: pd.DataFrame
        Must contain 'id', 'molecule_name', 'atom_index_0', 'atom_index_1',
        'type', 'atom_1', 'x_1', 'y_1', 'z_1' and 'dist'. Modified in place.

    Returns
    -------
    pd.DataFrame
        The same `df` object with the feature columns appended.
    """
    print("Starting Feature Engineering...")

    # 'type_0' is a grouping key below, so it has to exist before the groupby
    # objects are built; otherwise the function only works when divide_type()
    # happened to run first.
    df['type_0'] = df['type'].apply(lambda x: x[0])

    g = df.groupby('molecule_name')
    g1 = df.groupby(['molecule_name', 'atom_index_0'])
    g2 = df.groupby(['molecule_name', 'atom_index_1'])
    g3 = df.groupby(['molecule_name', 'atom_1'])
    g4 = df.groupby(['molecule_name', 'type_0'])
    g5 = df.groupby(['molecule_name', 'type'])

    # --- whole-molecule statistics ---
    df['molecule_couples'] = g['id'].transform('count')
    df['molecule_dist_mean'] = g['dist'].transform('mean')
    df['molecule_dist_min'] = g['dist'].transform('min')
    df['molecule_dist_max'] = g['dist'].transform('max')

    # --- number of pairs each atom takes part in ---
    df['atom_0_couples_count'] = g1['id'].transform('count')
    df['atom_1_couples_count'] = g2['id'].transform('count')

    # --- grouped by (molecule, atom_index_0): coordinates of the partner atom ---
    df['molecule_atom_index_0_x_1_std'] = g1['x_1'].transform('std')
    df['molecule_atom_index_0_y_1_mean'] = g1['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df['molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_mean_div'] = df['molecule_atom_index_0_y_1_mean'] / df['y_1']
    df['molecule_atom_index_0_y_1_max'] = g1['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df['molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_y_1_std'] = g1['y_1'].transform('std')
    df['molecule_atom_index_0_z_1_std'] = g1['z_1'].transform('std')

    # --- grouped by (molecule, atom_index_0): distances ---
    df['molecule_atom_index_0_dist_mean'] = g1['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df['molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df['molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = g1['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df['molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df['molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = g1['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df['molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df['molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_0_dist_std'] = g1['dist'].transform('std')
    df['molecule_atom_index_0_dist_std_diff'] = df['molecule_atom_index_0_dist_std'] - df['dist']
    df['molecule_atom_index_0_dist_std_div'] = df['molecule_atom_index_0_dist_std'] / df['dist']

    # --- grouped by (molecule, atom_index_1): distances ---
    df['molecule_atom_index_1_dist_mean'] = g2['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df['molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df['molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = g2['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df['molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df['molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = g2['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df['molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df['molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_std'] = g2['dist'].transform('std')
    df['molecule_atom_index_1_dist_std_diff'] = df['molecule_atom_index_1_dist_std'] - df['dist']
    df['molecule_atom_index_1_dist_std_div'] = df['molecule_atom_index_1_dist_std'] / df['dist']

    # --- grouped by (molecule, element of atom 1) ---
    df['molecule_atom_1_dist_mean'] = g3['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = g3['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df['molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df['molecule_atom_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_std'] = g3['dist'].transform('std')
    df['molecule_atom_1_dist_std_diff'] = df['molecule_atom_1_dist_std'] - df['dist']

    # --- grouped by (molecule, first character of the coupling type) ---
    df['molecule_type_0_dist_std'] = g4['dist'].transform('std')
    df['molecule_type_0_dist_std_diff'] = df['molecule_type_0_dist_std'] - df['dist']

    # --- grouped by (molecule, full coupling type) ---
    df['molecule_type_dist_mean'] = g5['dist'].transform('mean')
    df['molecule_type_dist_mean_diff'] = df['molecule_type_dist_mean'] - df['dist']
    df['molecule_type_dist_mean_div'] = df['molecule_type_dist_mean'] / df['dist']
    df['molecule_type_dist_max'] = g5['dist'].transform('max')
    df['molecule_type_dist_min'] = g5['dist'].transform('min')
    df['molecule_type_dist_std'] = g5['dist'].transform('std')
    df['molecule_type_dist_std_diff'] = df['molecule_type_dist_std'] - df['dist']

    # TODO: back
    # df = reduce_mem_usage(df)

    return df

In [0]:
def add_1j(df):
    """Left-join the precomputed 1J feature table (df_1j.pkl) onto `df`.

    The table is loaded from the PREPROCESS directory and matched on
    molecule_name / atom_index_0 / atom_index_1. Returns the merged frame.
    """
    get_logger().info('load df_1j')

    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    features_1j = joblib.load(PREPROCESS + 'df_1j.pkl')

    return pd.merge(df, features_1j, on=pair_key, how='left')


def add_2j_center_atom(df):
    """Left-join the precomputed 2J feature table (df_2jsim.pkl) onto `df`.

    Before merging, two derived columns are added to the loaded table:
    '2j_atom_center_weight' (center atom symbol mapped through `atom_weight`)
    and '2j_sum_norm_vec' (sum of the two norm-vector columns). After the
    merge, pairs without a center atom get the string 'nan' so the column can
    be label-encoded.
    """
    get_logger().info('load df_2jsim')

    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    features_2j = joblib.load(PREPROCESS + 'df_2jsim.pkl')

    # Numeric stand-in for the center atom: its atomic weight.
    features_2j['2j_atom_center_weight'] = features_2j['2j_atom_center'].replace(atom_weight)

    # Combined length of the two norm vectors.
    features_2j['2j_sum_norm_vec'] = features_2j['2j_norm_vec_02'] + features_2j['2j_norm_vec_12']

    merged = df.merge(features_2j, on=pair_key, how='left')

    # LabelEncoder needs a real label, so missing center atoms become 'nan'.
    no_center = merged['2j_atom_center'].isnull()
    merged.loc[no_center, '2j_atom_center'] = 'nan'

    return merged

def str_sort(s):
    """Put the first two characters of `s` in ascending order.

    Used to make atom-pair labels order-independent ('NC' -> 'CN').

    Parameters
    ----------
    s: str or any
        Label to normalize. Non-string values (e.g. NaN for a missing
        label) and strings shorter than two characters are returned
        unchanged.

    Returns
    -------
    str or any
        `s` with its first two characters swapped when they are out of
        order; any remaining characters are kept as they are.
    """
    if not isinstance(s, str) or len(s) < 2:
        # Nothing to compare: missing value, empty string or single character.
        return s
    if s[0] > s[1]:
        # Keep the tail so longer strings are not silently truncated.
        return s[1] + s[0] + s[2:]
    return s

def add_3j_center_atom(df):
    """Left-join the precomputed 3J feature table (df_3jsim.pkl) onto `df`.

    Before merging, the loaded table is reshaped:
    - '3j_atom_center_weight': summed `atom_weight` of the two center atoms;
    - '3j_atom_center': the two center-atom symbols concatenated and passed
      through str_sort, replacing the two separate symbol columns;
    - '3j_sum_norm_vec': sum of the three norm-vector columns.
    After the merge, pairs without center atoms get the string 'nan' so the
    column can be label-encoded.
    """
    get_logger().info('load df_3jsim')

    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    center_cols = ['3j_atom_center_0', '3j_atom_center_1']
    features_3j = joblib.load(PREPROCESS + 'df_3jsim.pkl')

    # Numeric stand-in for the two center atoms: the sum of their atomic weights.
    weight_first = features_3j[center_cols[0]].replace(atom_weight)
    weight_second = features_3j[center_cols[1]].replace(atom_weight)
    features_3j['3j_atom_center_weight'] = weight_first + weight_second

    # 'C' + 'N' -> 'CN'; str_sort makes the label independent of atom order.
    joined_symbols = features_3j[center_cols[0]] + features_3j[center_cols[1]]
    features_3j['3j_atom_center'] = joined_symbols.transform(str_sort)
    features_3j = features_3j.drop(center_cols, axis=1)

    # Combined length of the three norm vectors.
    features_3j['3j_sum_norm_vec'] = (
        features_3j['3j_norm_vec_02']
        + features_3j['3j_norm_vec_13']
        + features_3j['3j_norm_vec_23']
    )

    merged = df.merge(features_3j, on=pair_key, how='left')

    # LabelEncoder needs a real label, so missing center atoms become 'nan'.
    no_center = merged['3j_atom_center'].isnull()
    merged.loc[no_center, '3j_atom_center'] = 'nan'

    return merged

In [0]:
def drop_col(df_org):
    """Return a copy of `df_org` without identifier, raw-coordinate and atom-symbol columns.

    The input frame is left untouched. All listed columns must be present.
    """
    identifier_cols = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1']
    # The per-axis squared offsets (dist_x / dist_y / dist_z) are deliberately kept.
    coordinate_cols = ['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1']
    atom_cols = ['atom_0', 'atom_1']

    trimmed = df_org.copy()
    return trimmed.drop(identifier_cols + coordinate_cols + atom_cols, axis=1)

In [0]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """Competition metric: mean over coupling types of log(MAE within the type).

    Each per-type MAE is raised to at least `floor` before the logarithm so a
    perfect prediction does not produce -inf.

    Metric for https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    abs_error = (y_true - y_pred).abs()
    mae_per_type = abs_error.groupby(types).mean()
    floored = mae_per_type.map(lambda mae: max(mae, floor))
    return np.log(floored).mean()

def oof_train(X_org, y_org, _types):
    """Train one model per cross-validation fold and collect out-of-fold predictions.

    Parameters
    ----------
    X_org: pd.DataFrame, shape [n_samples, n_features]
        Feature matrix. It is copied and re-indexed 0..n-1; the caller's
        frame is not modified.
    y_org: pd.Series, shape [n_samples]
        Target values, in the same row order as `X_org`.
    _types: pd.Series, shape [n_samples]
        Coupling `type` of every row (e.g. 2JHC, 1JHC, 3JHH, etc.), in the
        same row order as `X_org`. Used to stratify the folds and to group
        the validation metric. It is indexed by position.

    Returns
    -------
    models: list
        The fitted model of each fold.
    df_scores: pd.DataFrame
        One row per fold with the column 'valid_score'.
    df_pred: pd.DataFrame
        Out-of-fold prediction ('proba') and target ('y_true') for every row,
        indexed 0..n-1.
    """
    # TODO: divide data to training and validation about molecular

    models = []
    fold_scores = []
    _X = X_org.copy().reset_index(drop=True)
    _y = y_org.copy().reset_index(drop=True)
    df_pred = pd.DataFrame(index=_X.index).reset_index(drop=True)

    fold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
    for n_fold, (train_idx, valid_idx) in enumerate(fold.split(_X, _types)):
        # prepare data
        X_train, y_train = _X.iloc[train_idx], _y.iloc[train_idx]
        X_valid, y_valid = _X.iloc[valid_idx], _y.iloc[valid_idx]
        print('mean of target. train:{}, valid:{}'.format(y_train.mean(), y_valid.mean()))

        # generate model
        model = gen_model(_X)

        # train
        # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were
        # removed in LightGBM 4.0 in favour of callbacks — confirm the installed
        # version before upgrading.
        model.fit(X_train, y_train, eval_metric='mae',
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=100,
                  early_stopping_rounds=100
                  )

        # validate
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration_)

        types_valid = _types.iloc[valid_idx]
        valid_score = group_mean_log_mae(y_valid, y_pred, types_valid)
        get_logger().info('fold %d valid %f' % (n_fold+1, valid_score))

        # Scores are gathered in a list and turned into a frame once at the
        # end; DataFrame.append no longer exists in pandas 2.0.
        fold_scores.append(valid_score)
        df_pred.loc[valid_idx, 'proba'] = y_pred
        df_pred.loc[valid_idx, 'y_true'] = y_valid
        models.append(model)

    df_scores = pd.DataFrame({'valid_score': fold_scores})
    get_logger().info('CV score: %f' % df_scores['valid_score'].mean())

    return models, df_scores, df_pred

def oof_predict(_models, _X):
    """Average the predictions of all fold models for the rows of `_X`.

    Each model contributes `predict(_X) / len(_models)`; the contributions
    are accumulated in model order and returned as one array with one value
    per row.
    """
    log = get_logger()
    log.info('Start oof_predict')

    n_models = len(_models)
    averaged = np.zeros(_X.shape[0])
    for position, fitted in enumerate(_models):
        log.info('prediction: %d' % position)
        averaged += fitted.predict(_X) / n_models

    log.info('Finish oof_predict')
    return averaged


def gen_model(_X):
    """Build an unfitted LightGBM regressor with the notebook's fixed hyper-parameters.

    Parameters
    ----------
    _X: pd.DataFrame
        Training features. Currently not used to size the model; the
        parameter is kept so existing callers keep working and so a
        feature-count-dependent `colsample_bytree` can be reinstated.

    Returns
    -------
    lgb.LGBMRegressor
    """
    # A column-sampling rate derived from the feature count,
    # max(0.1, sqrt(n_features) / n_features), used to be computed here but
    # was never passed to the model (and divided by zero for a frame without
    # columns). Every tree sees all features.
    _model = lgb.LGBMRegressor(
        learning_rate=0.2,
        n_estimators=2000,
        num_leaves=128,
        # min_child_weight=15, # good value: 0, 5, 15, 300
        min_child_samples=80,
        subsample=0.7,
        colsample_bytree=1,
        objective='regression',
        reg_lambda=0.1,
        reg_alpha=0.1,
        seed=2019
        )
    return _model


In [0]:
def preprocess(df, strct, mode, s_type=None):
    """Build the full feature frame for training or prediction.

    Joins the precomputed 1J/2J/3J tables and the atom structures, derives
    distance and aggregate features, label-encodes the categorical columns
    and downcasts the feature columns.

    Parameters
    ----------
    df: pd.DataFrame
        dataframe of train.csv or test.csv
    strct: pd.DataFrame
        dataframe of structures.csv
    mode: str
        'train' or 'predict'. In 'train' mode a new Encoder is fitted and
        dumped to ENCODER_PATH; in 'predict' mode that file is loaded.
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        Only consumed by the currently disabled add_scc_feature step.

    Returns
    -------
    pd.DataFrame
        The feature frame; identifier columns are still present.

    Raises
    ------
    ValueError
        If `mode` is neither 'train' nor 'predict'.
    """
    # Fail before the expensive feature work rather than with a NameError on
    # the undefined encoder afterwards.
    if mode not in ('train', 'predict'):
        raise ValueError("mode must be 'train' or 'predict', got %r" % (mode,))

    get_logger().info('Start preprocess()')
    df = add_1j(df)
    df = add_2j_center_atom(df)
    df = add_3j_center_atom(df)
    df = map_atom_info(df, strct, 0)
    df = map_atom_info(df, strct, 1)
    df = calc_dist(df)
    df = divide_type(df)
    df = feature_engineering(df)

    display(df.head(10))
    display(df.tail(10))

    # encode
    if mode == 'train':
        enc = Encoder()
        enc.fit(df, ['type', 'type_0', 'type_1',
                     '2j_atom_center', '3j_atom_center'])
        joblib.dump(enc, ENCODER_PATH)
    else:
        get_logger().info('loading encoder from %s' % ENCODER_PATH)
        enc = joblib.load(ENCODER_PATH)
    df = enc.transform(df)

    # The target and the contribution columns keep their original dtype.
    use_features = [col for col in df.columns if col not in [TARGET, *CONTR_COLS]]
    get_logger().info(use_features)
    df[use_features] = reduce_mem_usage(df[use_features])
    # TODO: back
    # df = add_scc_feature(df, 'fc', mode=mode, s_type=s_type)

    get_logger().info('Finish preprocess()')
    return df

In [0]:
def drop_uneffect_feature(df):
    """Remove, in place, every column of `df` that holds a single distinct value.

    Such columns carry no information for a model. Returns the same `df`.
    """
    constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
    if constant_cols:
        df.drop(constant_cols, axis=1, inplace=True)
    return df

### Fermi contact (fc) and other contribution models

In [0]:
class CNTR:
    """Model to predict fc/sd/pso/dso columns

    One instance handles one contribution column (`y_col`). After train() or
    load_model() the fold models are available as `models_`; train() also
    sets `scores_` and `y_pred_` (the per-fold scores and the out-of-fold
    prediction frame returned by oof_train).
    """

    def __init__(self, y_col):
        self.y_col = y_col

    def train(self, df_org, scc, s_type):
        """Fit fold models that predict `y_col` and save them to MID_MODEL_PATH.

        Parameters
        ----------
        df_org: pd.DataFrame
            Preprocessed training frame; it is copied, not modified.
        scc: pd.DataFrame
            Contribution table holding the key columns and `y_col`.
        s_type: pd.Series
            'type' column (e.g. 1JHC, 2JHH)
        """
        df = df_org.copy()
        # Merge
        key_cols = ['molecule_name', 'atom_index_0', 'atom_index_1']
        df = df.merge(scc[key_cols + [self.y_col]], how='left', on=key_cols)

        # drop unnecessary cols
        df = drop_col(df)

        y = df[self.y_col].copy()
        # The final target must not leak into the contribution model's features.
        df.drop([TARGET, self.y_col], axis=1, inplace=True)
        X = df

        display(X.head())
        display(y.head())
        models, scores, y_pred = oof_train(X, y, s_type)

        # save model
        joblib.dump(models, MID_MODEL_PATH, compress=3)

        self.models_ = models
        self.scores_ = scores
        self.y_pred_ = y_pred

    def predict(self, df_org):
        """Return the average prediction of the fold models for `df_org`.

        Requires `models_` to be set (train() or load_model()).
        """
        y_pred = np.zeros(df_org.shape[0])

        X = df_org.copy()
        X = drop_col(X)

        display(X.head())
        # Average over this instance's own models; the previous code divided
        # by the length of an undefined name `models`.
        n_models = len(self.models_)
        for model in self.models_:
            y_pred += model.predict(X) / n_models

        return y_pred

    def load_model(self):
        """Load the fold models previously saved by train() from MID_MODEL_PATH."""
        # load pkl by joblib
        self.models_ = joblib.load(MID_MODEL_PATH)

In [0]:
def add_scc_feature(df, cntr_name, mode, s_type=None):
    """Add the predicted contribution `<cntr_name>_pred` to `df` as a feature.

    In 'train' mode a CNTR model is trained on the contribution table and its
    out-of-fold predictions become the feature; in 'predict' mode the saved
    models are loaded and applied to `df`.

    Parameters
    ----------
    df: pd.DataFrame
        Preprocessed frame; the new column is added in place.
    cntr_name: str
        'fc', 'sd', 'pso' or 'dso'
    mode: str
        'train' or 'predict'
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        If mode is 'train', the s_type must be specified.

    Returns
    -------
    pd.DataFrame
        The same `df` object.

    Raises
    ------
    ValueError
        If `mode` is neither 'train' nor 'predict'.
    """
    if mode not in ('train', 'predict'):
        # Previously an unknown mode returned `df` without the feature.
        raise ValueError("mode must be 'train' or 'predict', got %r" % (mode,))

    add_feature = '%s_pred' % cntr_name
    cntr = CNTR(cntr_name)
    if mode == 'train':
        assert s_type is not None, 's_type must be specified.'

        get_logger().info('start loading scalar_coupling_contributions')
        scc = pd.read_csv(INPUT + 'scalar_coupling_contributions.csv')
        get_logger().info('finished loading scalar_coupling_contributions')

        # train contribution(fc/sd/pso/dso)
        cntr.train(df, scc, s_type)

        display(cntr.y_pred_.head())
        # y_pred_ is the frame returned by oof_train ('proba' and 'y_true');
        # only the out-of-fold prediction is the feature. It is assigned by
        # position because oof_train re-indexes its rows 0..n-1.
        # NOTE(review): assumes the merge inside CNTR.train keeps one row per
        # input row — confirm the contribution table has unique keys.
        df[add_feature] = cntr.y_pred_['proba'].values
    else:
        cntr.load_model()
        df[add_feature] = cntr.predict(df)

    return df

## Train

In [0]:
# Raw inputs: the training atom pairs and the per-atom structure table they are joined with.
df_train = pd.read_csv(TRAIN_PATH)
df_strct = pd.read_csv(INPUT + 'structures.csv')

In [0]:
# TODO: remove
# df_train = df_train[(df_train['type']=='1JHC') | (df_train['type']=='1JHN')]

In [0]:
def train_single_model(df, strct, n_rows=10000):
    """Train one set of fold models on all coupling types together.

    Parameters
    ----------
    df: pd.DataFrame
        dataframe of train.csv
    strct: pd.DataFrame
        dataframe of structures.csv
    n_rows: int or None
        Number of leading rows of `df` to train on. The default keeps the
        previous hard-coded quick-run size; pass None to use every row.

    Returns
    -------
    models, df_scores, df_pred
        As returned by oof_train. The models are also dumped to MODEL_PATH.
    """
    if n_rows is not None:
        df = df.head(n_rows)

    s_type = df['type'].copy()

    df = preprocess(df, strct, mode='train', s_type=s_type)
    df = drop_col(df)

    y = df[TARGET].copy()
    X = df.drop([TARGET], axis=1)

    display(X.head())
    display(y.head())
    models, df_scores, df_pred = oof_train(X, y, s_type)

    joblib.dump(models, MODEL_PATH, compress=3)

    return models, df_scores, df_pred

In [0]:
class LGBM:
    """One set of fold models per coupling type for a single target column.

    After train(), the dictionaries below are keyed by coupling type:
    `model_dict` (fold models), `score_dict` (fold scores), `pred_dict`
    (out-of-fold predictions) and `feature_dict` (the feature columns the
    models of that type were trained on).
    """

    def __init__(self, target_col):
        self.target_col = target_col
        self.model_dict = {}
        self.score_dict = {}
        self.pred_dict = {}
        self.feature_dict = {}

    def train(self, df, s_type):
        """Fit fold models separately for every coupling type in `s_type`.

        Parameters
        ----------
        df: pd.DataFrame
            Preprocessed training frame containing `target_col`, TARGET and
            the CONTR_COLS columns. It is not modified.
        s_type: pd.Series
            Coupling type of every row of `df`, in the same row order.
        """
        self.cols = df.columns.tolist()

        # TODO: back
        coupling_types = s_type.unique()
        # coupling_types = ['1JHC']
        for coup_type in coupling_types:
            get_logger().info('Starting train model(%s %s)' % (self.target_col, coup_type))
            is_the_type = (s_type == coup_type)
            # Explicit copy: the rows are reshaped below and must not be a
            # view on the caller's frame.
            df_type = df[is_the_type.values].copy()

            y = df_type[self.target_col]
            # Neither the final target nor any contribution may be a feature.
            X = df_type.drop(CONTR_COLS + [TARGET], axis=1)
            X = drop_uneffect_feature(X)

            # Remember exactly which columns survived so predict() can feed
            # the models the same features.
            self.feature_dict[coup_type] = X.columns.tolist()

            get_logger().info('features(%s): %s' % (coup_type, str(X.columns.tolist())))
            display(X.head())
            display(y.head())
            models, df_scores, df_pred = oof_train(X, y, _types=s_type[is_the_type].reset_index(drop=True))

            self.model_dict[coup_type] = models
            self.score_dict[coup_type] = df_scores
            self.pred_dict[coup_type] = df_pred

    def predict(self, df, s_type, df_submit):
        """Fill `df_submit[target_col]` with the averaged fold prediction per type.

        Parameters
        ----------
        df: pd.DataFrame
            Preprocessed frame to predict on.
        s_type: pd.Series
            Coupling type of every row of `df`; every type must have been
            seen by train().
        df_submit: pd.DataFrame
            Frame receiving the predictions; modified in place and returned.
        """
        # Instances pickled before `feature_dict` existed do not have it.
        feature_dict = getattr(self, 'feature_dict', {})

        # coupling_types = ['1JHC']
        coupling_types = s_type.unique()
        print(coupling_types)
        for coup_type in coupling_types:

            models = self.model_dict[coup_type]

            get_logger().info('Starting predict target(%s %s)' % (self.target_col, coup_type))
            is_the_type = (s_type == coup_type)
            df_type = df[is_the_type]

            features = feature_dict.get(coup_type)
            if features is not None:
                # Use the training-time feature list. Which columns are
                # constant can differ between the training rows and these
                # rows, so re-deriving the list here could hand the models a
                # different set of columns.
                X = df_type[features]
            else:
                # Fallback for models trained without a recorded feature list.
                X = drop_uneffect_feature(df_type.copy())

            display(X.head())
            y_pred = oof_predict(models, X)

            df_submit.loc[is_the_type, self.target_col] = y_pred

        return df_submit

In [0]:
def train_models_each_type(df, strct, use_preprocess_data):
    """Train one ``LGBM`` wrapper per target and log a per-type validation score.

    Args:
        df: Raw training frame. Must contain a ``type`` column; when the
            preprocessing is recomputed it must also contain the ``MERGE_KEY``
            columns.
        strct: Structures frame forwarded to ``preprocess``.
        use_preprocess_data: When true, load the cached preprocessed frame
            from ``PREPROCESS`` instead of recomputing it. The cache is
            assumed to have been built from the same ``df``.

    Returns:
        dict mapping each target column name to its trained ``LGBM`` instance.

    Raises:
        ValueError: If the cached frame does not have as many rows as ``df``.
    """
    # TODO:back
    # df = df.head(100000)

    get_logger().info('Data size: %s' % str(df.shape))

    cache_path = PREPROCESS + 'df_preprocessed.pkl'
    if use_preprocess_data:
        # Coupling-type labels come from the raw input here: previously
        # ``s_type`` was only bound in the else-branch, so this path died with
        # a NameError as soon as it was used below.
        s_type = df['type'].reset_index(drop=True)
        # NOTE: joblib.load unpickles the file; only use a cache you created.
        df = joblib.load(cache_path)
        if len(df) != len(s_type):
            raise ValueError(
                'cached frame has %d rows but the input has %d; rebuild the cache'
                % (len(df), len(s_type)))
    else:
        df_scc = pd.read_csv(INPUT + 'scalar_coupling_contributions.csv')
        df = df.merge(df_scc[MERGE_KEY + CONTR_COLS], on=MERGE_KEY, how='left')

        s_type = df['type'].copy()

        df = preprocess(df, strct, mode='train', s_type=s_type)
        df = drop_col(df)

        joblib.dump(df, cache_path, compress=3)

    # Single source of truth for the modelled targets (swap in CONTR_COLS to
    # model the four contributions separately and sum them below).
    targets = [TARGET]

    models = {}
    for target in targets:
        model = LGBM(target)
        model.train(df, s_type)
        models[target] = model

        model_file = 'model_%s.pkl' % target
        joblib.dump(model, model_file, compress=3)

    get_logger().info('validate sum of fc sd pso dso')
    # coupling_types = ['1JHC']
    coupling_types = s_type.unique()
    for coup_type in coupling_types:
        is_the_type = (s_type == coup_type)
        # Positional mask, matching how LGBM.train selected the rows whose
        # predictions are stored in pred_dict.
        y_true = df.loc[is_the_type.values, TARGET].values

        # Sum the stored predictions of every modelled target.
        y_pred = np.zeros(len(y_true))
        for target in targets:
            df_pred = models[target].pred_dict[coup_type]
            y_pred += df_pred['proba'].values

        print(y_true[0:10])
        print(y_pred[0:10])

        y_true = pd.Series(y_true)
        y_pred = pd.Series(y_pred)
        # Pass only this type's labels, re-indexed from 0 like y_true/y_pred.
        # The full-length s_type used before did not line up with the
        # per-type series it was scored against.
        types_of_rows = s_type[is_the_type].reset_index(drop=True)
        valid_score = group_mean_log_mae(y_true, y_pred, types_of_rows)
        get_logger().info('valid score(fc+sd+pso+dso %s): %f' % (coup_type, valid_score))
    return models
    

In [0]:
# models, df_scores, df_pred = train_single_model(df_train, df_strct)
# Train the per-coupling-type models; the result is a dict keyed by target
# column name (see train_models_each_type).
models = train_models_each_type(df_train, df_strct, USE_PREPROCESS_DATA)

[INFO]2019-07-01 15:13:35,860:main:Data size: (4658147, 6)
[INFO]2019-07-01 15:13:51,454:main:Start preprocess()
[INFO]2019-07-01 15:13:51,456:main:load df_1j
[INFO]2019-07-01 15:13:56,935:main:load df_2jsim
[INFO]2019-07-01 15:14:14,334:main:load df_3jsim


Starting Feature Engineering...


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,1j_nbonds,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist,dist_x,dist_y,...,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201,4.0,,,,,,,,,,,,,,,,,,,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.091953,0.00022,1.192105,...,0.727957,1.610344,0.518391,1.474738,1.783157,0.691204,1.632998,1.091953,0.0,1.0,0.345594,-0.746359,0.316492,1.09195,-3e-06,0.999997,1.091953,0.0,1.0,1.091946,-6.680479e-06,0.999994,3e-06,-1.09195,3e-06,1.09195,1.091946,-7e-06,0.999994,3e-06,-1.09195,3e-06,-1.09195,1.09195,-3e-06,0.999997,1.091953,1.091946,3e-06,-1.09195
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336,,C,1.124187,1.091953,1.091952,-0.333287,12.01,2.183905,,,,,,,,,,,,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.78312,1.019253,2.160261,...,0.727957,1.610344,-0.172776,0.903105,1.783157,3.7e-05,1.000021,1.091953,-0.691167,0.612383,0.345594,-1.437526,0.193814,1.78312,0.0,1.0,1.78312,0.0,1.0,1.78312,0.0,1.0,,,,1.783146,1.78312,0.0,1.0,1.4e-05,-1.783106,1.4e-05,-1.783106,1.783146,2.7e-05,1.000015,1.783158,1.78312,1.4e-05,-1.783106
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387,,C,1.124162,1.091953,1.091946,-0.333335,12.01,2.183899,,,,,,,,,,,,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1.783147,0.294812,2.112831,...,0.727957,1.610344,-0.172803,0.903091,1.783157,9e-06,1.000005,1.091953,-0.691194,0.612374,0.345594,-1.437553,0.193811,1.783153,5e-06,1.000003,1.783158,1e-05,1.000006,1.783147,0.0,1.0,7e-06,-1.78314,4e-06,1.783146,1.78312,-2.8e-05,0.999984,1.4e-05,-1.783134,1.4e-05,-1.783134,1.783146,-1e-06,0.999999,1.783158,1.78312,1.4e-05,-1.783134
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393,,C,1.124158,1.091953,1.091948,-0.333347,12.01,2.183901,,,,,,,,,,,,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1.783157,0.276638,2.085032,...,0.727957,1.610344,-0.172812,0.903086,1.783157,0.0,1.0,1.091953,-0.691204,0.612371,0.345594,-1.437563,0.19381,1.783151,-6e-06,0.999997,1.783157,0.0,1.0,1.783148,-8.8131e-06,0.999995,5e-06,-1.783152,3e-06,1.783146,1.78312,-3.7e-05,0.999979,1.4e-05,-1.783143,1.4e-05,-1.783143,1.783146,-1e-05,0.999994,1.783158,1.78312,1.4e-05,-1.783143
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013,4.0,,,,,,,,,,,,,,,,,,,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.091952,1.049455,0.142844,...,0.891529,1.552753,0.460801,1.421998,1.783158,0.691206,1.633001,1.091952,0.0,1.0,0.399065,-0.692886,0.365461,1.09195,-2e-06,0.999998,1.091953,1e-06,1.000001,1.091946,-5.239448e-06,0.999995,3e-06,-1.091948,3e-06,1.09195,1.091946,-5e-06,0.999995,3e-06,-1.091948,3e-06,-1.091948,1.09195,-2e-06,0.999998,1.091953,1.091946,3e-06,-1.091948
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,-11.0317,0.352932,2.85856,-3.43395,,C,1.124153,1.091952,1.091946,-0.333352,12.01,2.183898,,,,,,,,,,,,H,1.011731,1.463751,0.000277,H,-0.540815,1.447527,-0.876644,1.783158,2.410399,0.000263,...,0.891529,1.552753,-0.230405,0.870788,1.783158,0.0,1.0,1.091952,-0.691206,0.61237,0.399065,-1.384092,0.223797,1.783153,-5e-06,0.999997,1.783158,0.0,1.0,1.783147,-1.016344e-05,0.999994,7e-06,-1.78315,4e-06,1.783146,1.78312,-3.8e-05,0.999979,1.4e-05,-1.783144,1.4e-05,-1.783144,1.783146,-1.1e-05,0.999994,1.783158,1.78312,1.4e-05,-1.783144
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,-11.0324,0.352943,2.85853,-3.43387,,C,1.124161,1.091952,1.091948,-0.333337,12.01,2.183899,,,,,,,,,,,,H,1.011731,1.463751,0.000277,H,-0.523814,1.437933,0.906397,1.783148,2.357897,0.000667,...,0.891529,1.552753,-0.230396,0.870793,1.783158,9e-06,1.000005,1.091952,-0.691197,0.612373,0.399065,-1.384083,0.223798,1.783151,3e-06,1.000001,1.783157,8e-06,1.000005,1.783148,-5.221498e-07,1.0,5e-06,-1.783143,3e-06,1.783146,1.78312,-2.9e-05,0.999984,1.4e-05,-1.783135,1.4e-05,-1.783135,1.783146,-2e-06,0.999999,1.783158,1.78312,1.4e-05,-1.783135
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,83.0241,0.254634,1.25856,0.272012,4.0,,,,,,,,,,,,,,,,,,,H,-0.540815,1.447527,-0.876644,C,-0.012698,1.085804,0.008001,1.091946,0.278907,0.130843,...,0.635262,1.437547,0.345601,1.3165,1.783148,0.691201,1.632999,1.091946,0.0,1.0,0.488753,-0.603193,0.447598,1.09195,3e-06,1.000003,1.091953,7e-06,1.000006,1.091946,0.0,1.0,3e-06,-1.091943,3e-06,1.09195,1.091946,0.0,1.0,3e-06,-1.091943,3e-06,-1.091943,1.09195,3e-06,1.000003,1.091953,1.091946,3e-06,-1.091943
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,-11.0319,0.352943,2.85856,-3.43393,,C,1.124153,1.091946,1.091948,-0.333342,12.01,2.183894,,,,,,,,,,,,H,-0.540815,1.447527,-0.876644,H,-0.523814,1.437933,0.906397,1.783148,0.000289,9.2e-05,...,0.635262,1.437547,-0.345601,0.806185,1.783148,0.0,1.0,1.091946,-0.691201,0.61237,0.488753,-1.294395,0.274096,1.783151,3e-06,1.000002,1.783157,9e-06,1.000005,1.783148,0.0,1.0,5e-06,-1.783143,3e-06,1.783146,1.78312,-2.8e-05,0.999984,1.4e-05,-1.783134,1.4e-05,-1.783134,1.783146,-2e-06,0.999999,1.783158,1.78312,1.4e-05,-1.783134
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,83.0243,0.254628,1.25856,0.272012,4.0,,,,,,,,,,,,,,,,,,,H,-0.523814,1.437933,0.906397,C,-0.012698,1.085804,0.008001,1.091948,0.261239,0.123994,...,,1.091948,0.0,1.0,1.091948,0.0,1.0,1.091948,0.0,1.0,,,,1.09195,2e-06,1.000002,1.091953,6e-06,1.000005,1.091946,-1.161979e-06,0.999999,3e-06,-1.091944,3e-06,1.09195,1.091946,-1e-06,0.999999,3e-06,-1.091944,3e-06,-1.091944,1.09195,2e-06,1.000002,1.091953,1.091946,3e-06,-1.091944


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,1j_nbonds,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist,dist_x,dist_y,...,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
4658137,4658137,dsgdb9nsd_133884,16,8,2JHC,9.11973,9.15695,0.116162,-0.095455,-0.057927,,C,1.419871,1.09048,1.50993,-0.506344,12.01,2.60041,,,,,,,,,,,,H,-0.084531,1.110807,-1.796741,C,0.787756,-0.840138,-1.042152,2.266379,0.760884,3.806185,...,0.916944,2.600236,0.333857,1.147309,3.358747,1.092368,1.481988,1.09048,-1.175899,0.481155,0.679559,-1.58682,0.299844,2.536801,0.270422,1.119319,3.448811,1.182432,1.521728,1.080997,-1.185382,0.476971,0.823964,-1.442415,0.36356,2.625259,1.080997,-1.185382,0.476971,0.794133,-1.472246,0.135944,-2.130435,2.274912,0.008533,1.003765,2.312206,2.209489,0.037188,-2.229191
4658138,4658138,dsgdb9nsd_133884,16,17,3JHH,0.789559,0.801055,0.07114,0.400232,-0.482868,,,,,,,,,1.09048,1.080997,1.50993,-0.506344,-0.581363,1.419871,1.328055,0.823388,24.02,CC,3.681407,H,-0.084531,1.110807,-1.796741,H,1.12655,-1.348733,-1.933838,2.744968,1.466716,6.049335,...,0.916944,2.600236,-0.144732,0.947274,3.358747,0.61378,1.223602,1.09048,-1.654488,0.397265,0.679559,-2.065408,0.247565,2.705595,-0.039372,0.985657,2.744968,0.0,1.0,2.662406,-0.082561,0.969923,0.041413,-2.703555,0.015087,2.503545,1.784424,-0.9605434,0.650071,0.403132,-2.341835,0.344874,-2.400093,2.706152,-0.038816,0.985859,2.744969,2.6624,0.033847,-2.711121
4658139,4658139,dsgdb9nsd_133884,17,1,3JHN,-0.006537,0.022653,-0.013253,0.052203,-0.06814,,,,,,,,,1.080997,1.560647,1.50993,-0.581363,-0.411704,1.328055,2.14749,-0.310911,24.02,CC,4.151574,H,1.12655,-1.348733,-1.933838,N,-1.5711,0.047932,-0.491726,3.362689,7.277316,1.950674,...,0.755962,2.692607,-0.670082,0.80073,3.495226,0.132537,1.039414,1.080997,-2.281692,0.321468,0.845189,-2.5175,0.251343,2.609557,-0.753132,0.776033,3.362702,1.3e-05,1.000004,2.220388,-1.142301,0.660301,0.583487,-2.779202,0.173518,2.609557,2.220388,-1.142301,0.660301,0.583487,-2.779202,0.344874,-3.017815,3.362695,6e-06,1.000002,3.362702,3.362689,9e-06,-3.36268
4658140,4658140,dsgdb9nsd_133884,17,2,3JHC,1.94438,2.09344,-0.032818,0.29732,-0.413566,,,,,,,,,1.080997,1.509931,1.509711,-0.554904,-0.171603,1.357679,2.245745,-0.825532,24.02,CC,4.10064,H,1.12655,-1.348733,-1.933838,C,-0.75085,-0.602182,0.665932,3.292533,3.524629,0.557339,...,0.755962,2.692607,-0.599926,0.817792,3.495226,0.202693,1.061561,1.080997,-2.211536,0.328318,0.845189,-2.447344,0.256699,2.794773,-0.497761,0.848821,3.470578,0.178044,1.054075,1.09048,-2.202054,0.331198,0.752773,-2.53976,0.22863,2.625259,1.080997,-2.211536,0.328318,0.794133,-2.4984,0.344874,-2.947659,3.210469,-0.082064,0.975076,3.601194,2.469623,0.321635,-2.970899
4658141,4658141,dsgdb9nsd_133884,17,3,2JHC,0.861412,0.789842,0.113557,0.176618,-0.218606,,C,1.357679,1.080997,1.509711,-0.554904,12.01,2.590709,,,,,,,,,,,,H,1.12655,-1.348733,-1.933838,C,0.326978,-1.554195,0.205661,2.293248,0.639315,0.042215,...,0.755962,2.692607,0.399359,1.174146,3.495226,1.201978,1.524138,1.080997,-1.21225,0.471383,0.845189,-1.448058,0.368556,2.536799,0.243551,1.106204,3.448809,1.155561,1.503897,1.080997,-1.212251,0.471383,0.823962,-1.469286,0.359299,2.625259,1.080997,-1.212251,0.471383,0.794133,-1.499115,0.135944,-2.157304,2.274912,-0.018336,0.992004,2.312206,2.209489,0.037188,-2.25606
4658142,4658142,dsgdb9nsd_133884,17,4,2JHC,3.54345,3.58644,0.019741,0.150477,-0.213205,,C,1.390609,1.080997,1.532828,-0.543759,12.01,2.613826,,,,,,,,,,,,H,1.12655,-1.348733,-1.933838,C,1.629865,-0.747236,0.235262,2.306538,0.253326,0.361798,...,0.755962,2.692607,0.386069,1.16738,3.495226,1.188688,1.515356,1.080997,-1.225541,0.468667,0.845189,-1.461349,0.366432,2.371793,0.065255,1.028291,3.243547,0.937008,1.40624,1.083421,-1.223117,0.469717,0.734959,-1.571579,0.318642,2.625259,1.080997,-1.225541,0.468666,0.794133,-1.512405,0.135944,-2.170594,2.274912,-0.031626,0.986288,2.312206,2.209489,0.037188,-2.269351
4658143,4658143,dsgdb9nsd_133884,17,5,3JHC,0.568997,0.674583,-0.007276,0.305078,-0.403388,,,,,,,,,1.080997,1.553264,1.532828,-0.543759,-0.355358,1.390609,2.225488,-0.813418,24.02,CC,4.16709,H,1.12655,-1.348733,-1.933838,C,1.415947,0.620773,0.939122,3.495226,0.08375,3.878955,...,0.755962,2.692607,-0.802619,0.770367,3.495226,0.0,1.0,1.080997,-2.414229,0.309278,0.845189,-2.650037,0.241813,2.778225,-0.717001,0.794863,3.601194,0.105968,1.030318,1.094292,-2.400934,0.313082,1.034817,-2.46041,0.296066,2.625259,1.080997,-2.414229,0.309278,0.794133,-2.701093,0.344874,-3.150352,3.210469,-0.284757,0.91853,3.601194,2.469623,0.321635,-3.173592
4658144,4658144,dsgdb9nsd_133884,17,6,3JHC,1.17337,1.33747,-0.028423,0.31224,-0.44792,,,,,,,,,1.080997,1.542518,1.50993,-0.581363,-0.179514,1.328055,2.291259,-0.986229,24.02,CC,4.133445,H,1.12655,-1.348733,-1.933838,C,-0.027076,0.747033,0.478506,3.397424,1.330852,4.392233,...,0.755962,2.692607,-0.704817,0.792544,3.495226,0.097802,1.028787,1.080997,-2.316427,0.318181,0.845189,-2.552235,0.248774,2.631319,-0.766106,0.774504,3.397424,0.0,1.0,2.21175,-1.185674,0.651008,0.551357,-2.846067,0.162287,2.625259,1.080997,-2.316427,0.318181,0.794133,-2.603291,0.344874,-3.05255,3.210469,-0.186955,0.944972,3.601194,2.469623,0.321635,-3.07579
4658145,4658145,dsgdb9nsd_133884,17,7,2JHC,4.76201,4.80062,0.139202,-0.053102,-0.12471,,C,1.328055,1.080997,1.50993,-0.581363,12.01,2.590927,,,,,,,,,,,,H,1.12655,-1.348733,-1.933838,C,-0.131901,0.356983,-1.010196,2.312202,1.583699,2.909466,...,0.755962,2.692607,0.380405,1.164521,3.495226,1.183024,1.511644,1.080997,-1.231205,0.467518,0.845189,-1.467013,0.365534,2.794769,0.482567,1.208704,3.47058,1.158378,1.500985,1.09048,-1.221722,0.47162,0.752773,-1.559429,0.325565,2.625259,1.080997,-1.231205,0.467518,0.794133,-1.518069,0.135944,-2.176258,2.274912,-0.03729,0.983872,2.312206,2.209489,0.037188,-2.275015
4658146,4658146,dsgdb9nsd_133884,17,8,1JHC,117.934,115.975,0.378277,0.450213,1.1306,4.0,,,,,,,,,,,,,,,,,,,H,1.12655,-1.348733,-1.933838,C,0.787756,-0.840138,-1.042152,1.080997,0.114781,0.258669,...,0.755962,2.692607,1.61161,2.490855,3.495226,2.414229,3.233335,1.080997,0.0,1.0,0.845189,-0.235808,0.781861,2.536801,1.455803,2.346722,3.448811,2.367814,3.190398,1.080997,0.0,1.0,0.823964,-0.257034,0.762225,2.625259,1.080997,-1.288042e-07,1.0,0.794133,-0.286864,0.005663,-1.075335,1.089062,0.008064,1.00746,1.094293,1.080997,0.005663,-1.075335


[INFO]2019-07-01 15:15:27,008:main:['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', '1j_nbonds', '2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', '3j_norm_vec_02', '3j_norm_vec_13', '3j_norm_vec_23', '3j_cos_023', '3j_cos_231', '3j_area_023', '3j_area_231', '3j_dihedral', '3j_atom_center_weight', '3j_atom_center', '3j_sum_norm_vec', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z', 'type_0', 'type_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean', 'molecul

Mem. usage decreased to 919.57 Mb (71.9% reduction)


[INFO]2019-07-01 15:17:24,494:main:Finish preprocess()
[INFO]2019-07-01 15:18:39,307:main:Starting train model(scalar_coupling_constant 1JHC)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
[INFO]2019-07-01 15:18:44,636:main:features(1JHC): ['1j_nbonds', 'dist', 'dist_x', 'dist_y', 'dist_z', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean', 'molecule_atom_index_0_dist_mean_diff', 'molecule_atom_index_0_dist_mean_div', 'molecule_atom_index_0

Unnamed: 0,1j_nbonds,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,4.0,1.091797,0.000221,1.192383,3.6e-05,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
4,4.0,1.091797,1.049805,0.142822,6e-05,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
7,4.0,1.091797,0.278809,0.130859,0.782715,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
9,4.0,1.091797,0.26123,0.124023,0.807129,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
17,2.0,1.066406,0.00021,1.137695,3.5e-05,2,1.642578,1.066406,2.21875,2,1,0.011055,0.556641,-0.575684,0.491541,1.132812,0.0,0.814453,0.00449,1.642578,0.575684,1.540039,2.21875,1.151367,2.080078,1.066406,0.814453,-0.252197,0.763672,1.066406,0.0,1.0,1.066406,0.0,1.0,1.066406,0.0,1.0,,,,1.066406,1.066406,0.0,1.0,,,,,1.066406,0.0,1.0,1.066406,1.066406,,


0      84.8076
4      84.8074
7      84.8093
9      84.8095
17    171.2200
Name: scalar_coupling_constant, dtype: float64

mean of target. train:94.98392857568842, valid:94.96060144118587
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 2.06996	training's l2: 8.64586	valid_1's l1: 2.20558	valid_1's l2: 10.1169
[200]	training's l1: 1.82214	training's l2: 6.66291	valid_1's l1: 2.04884	valid_1's l2: 8.85262
[300]	training's l1: 1.66793	training's l2: 5.57882	valid_1's l1: 1.97048	valid_1's l2: 8.26804
[400]	training's l1: 1.54835	training's l2: 4.79719	valid_1's l1: 1.91715	valid_1's l2: 7.88005
[500]	training's l1: 1.44887	training's l2: 4.18129	valid_1's l1: 1.87764	valid_1's l2: 7.58983
[600]	training's l1: 1.36255	training's l2: 3.68757	valid_1's l1: 1.84668	valid_1's l2: 7.3739
[700]	training's l1: 1.29105	training's l2: 3.30048	valid_1's l1: 1.82553	valid_1's l2: 7.22403
[800]	training's l1: 1.22654	training's l2: 2.97473	valid_1's l1: 1.807	valid_1's l2: 7.10147
[900]	training's l1: 1.16437	training's l2: 2.67072	valid_1's l1: 1.78886	valid_1's l2: 6.97871
[1000]	trai

[INFO]2019-07-01 15:23:14,942:main:fold 1 valid 0.532847


mean of target. train:94.9623837111789, valid:95.00369117020259
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 2.07833	training's l2: 8.73739	valid_1's l1: 2.20753	valid_1's l2: 10.1335
[200]	training's l1: 1.82639	training's l2: 6.71965	valid_1's l1: 2.04647	valid_1's l2: 8.82007
[300]	training's l1: 1.67116	training's l2: 5.58767	valid_1's l1: 1.96889	valid_1's l2: 8.21795
[400]	training's l1: 1.55008	training's l2: 4.78852	valid_1's l1: 1.91588	valid_1's l2: 7.83144
[500]	training's l1: 1.45032	training's l2: 4.1834	valid_1's l1: 1.87643	valid_1's l2: 7.55731
[600]	training's l1: 1.36425	training's l2: 3.68658	valid_1's l1: 1.84628	valid_1's l2: 7.34933
[700]	training's l1: 1.2891	training's l2: 3.28128	valid_1's l1: 1.82075	valid_1's l2: 7.17751
[800]	training's l1: 1.22101	training's l2: 2.93319	valid_1's l1: 1.80027	valid_1's l2: 7.04353
[900]	training's l1: 1.16066	training's l2: 2.64768	valid_1's l1: 1.78282	valid_1's l2: 6.93773
[1000]	trai

[INFO]2019-07-01 15:27:41,126:main:fold 2 valid 0.530910


mean of target. train:94.98214630569106, valid:94.9641659811711
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 2.07381	training's l2: 8.65151	valid_1's l1: 2.20701	valid_1's l2: 10.1176
[200]	training's l1: 1.82392	training's l2: 6.67842	valid_1's l1: 2.04953	valid_1's l2: 8.87421
[300]	training's l1: 1.66286	training's l2: 5.52762	valid_1's l1: 1.96659	valid_1's l2: 8.24223
[400]	training's l1: 1.5404	training's l2: 4.71393	valid_1's l1: 1.91362	valid_1's l2: 7.84862
[500]	training's l1: 1.44228	training's l2: 4.12793	valid_1's l1: 1.87305	valid_1's l2: 7.57308
[600]	training's l1: 1.36064	training's l2: 3.67143	valid_1's l1: 1.84492	valid_1's l2: 7.38203
[700]	training's l1: 1.28942	training's l2: 3.29205	valid_1's l1: 1.82364	valid_1's l2: 7.23623
[800]	training's l1: 1.22408	training's l2: 2.95925	valid_1's l1: 1.80552	valid_1's l2: 7.11732
[900]	training's l1: 1.16375	training's l2: 2.66535	valid_1's l1: 1.78949	valid_1's l2: 7.00677
[1000]	tra

[INFO]2019-07-01 15:32:09,689:main:fold 3 valid 0.535477
[INFO]2019-07-01 15:32:09,738:main:CV score: 0.533078
[INFO]2019-07-01 15:32:09,746:main:Starting train model(scalar_coupling_constant 2JHH)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
[INFO]2019-07-01 15:32:12,571:main:features(2JHH): ['2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', 'dist', 'dist_x', 'dist_y', 'dist_z', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std', 'mo

Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
1,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,1.019531,2.160156,3e-06,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
2,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,0.294922,2.113281,0.771973,10,1.506836,1.091797,1.783203,4,2,0.728027,1.358398,-0.088745,0.938673,1.463867,0.01622,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
3,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,0.276611,2.085938,0.817871,10,1.506836,1.091797,1.783203,4,3,0.728027,1.358398,-0.079163,0.944936,1.463867,0.025818,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-8.821487e-06,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
5,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,2.410156,0.000263,0.769043,10,1.506836,1.091797,1.783203,3,2,0.300049,1.324219,-0.123779,0.914494,1.447266,0.0,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,-5e-06,1.0,1.783203,0.0,1.0,1.783203,-1.019239e-05,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1.1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
6,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,2.357422,0.000667,0.821289,10,1.506836,1.091797,1.783203,3,3,0.300049,1.324219,-0.114197,0.920596,1.447266,0.009598,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,3e-06,1.0,1.783203,8e-06,1.0,1.783203,-5.364418e-07,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-2e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203


1   -11.2570
2   -11.2548
3   -11.2543
5   -11.2541
6   -11.2548
Name: scalar_coupling_constant, dtype: float64

mean of target. train:-10.288322401219846, valid:-10.283170689505688
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 0.382348	training's l2: 0.347483	valid_1's l1: 0.432153	valid_1's l2: 0.480955
[200]	training's l1: 0.319879	training's l2: 0.235878	valid_1's l1: 0.39859	valid_1's l2: 0.41419
[300]	training's l1: 0.280552	training's l2: 0.177513	valid_1's l1: 0.382425	valid_1's l2: 0.3853
[400]	training's l1: 0.250854	training's l2: 0.138903	valid_1's l1: 0.372365	valid_1's l2: 0.367819
[500]	training's l1: 0.228201	training's l2: 0.11317	valid_1's l1: 0.365746	valid_1's l2: 0.356678
[600]	training's l1: 0.209147	training's l2: 0.0938466	valid_1's l1: 0.360858	valid_1's l2: 0.348877
[700]	training's l1: 0.192648	training's l2: 0.0787019	valid_1's l1: 0.356932	valid_1's l2: 0.342991
[800]	training's l1: 0.17829	training's l2: 0.0666205	valid_1's l1: 0.353764	valid_1's l2: 0.337699
[900]	training's l1: 0.165268	training's l2: 0.0566925	valid_1's l1: 0.

[INFO]2019-07-01 15:35:29,219:main:fold 1 valid -1.076871


mean of target. train:-10.285182848189041, valid:-10.289449795567212
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 0.379484	training's l2: 0.337708	valid_1's l1: 0.428151	valid_1's l2: 0.473102
[200]	training's l1: 0.317744	training's l2: 0.230728	valid_1's l1: 0.395575	valid_1's l2: 0.413602
[300]	training's l1: 0.27991	training's l2: 0.174774	valid_1's l1: 0.381175	valid_1's l2: 0.388422
[400]	training's l1: 0.250461	training's l2: 0.136602	valid_1's l1: 0.37143	valid_1's l2: 0.371907
[500]	training's l1: 0.227326	training's l2: 0.110451	valid_1's l1: 0.364991	valid_1's l2: 0.361356
[600]	training's l1: 0.207921	training's l2: 0.0911649	valid_1's l1: 0.359751	valid_1's l2: 0.353085
[700]	training's l1: 0.191483	training's l2: 0.0764258	valid_1's l1: 0.356024	valid_1's l2: 0.346897
[800]	training's l1: 0.176809	training's l2: 0.0643747	valid_1's l1: 0.352954	valid_1's l2: 0.342134
[900]	training's l1: 0.164372	training's l2: 0.0553396	valid_1's l1

KeyboardInterrupt: ignored

In [0]:
# Preview the prediction frame that oof_train returned for the 1JHC coupling type.
models[TARGET].pred_dict['1JHC'].head()

In [0]:
# Overall CV score: the mean of the per-coupling-type mean scores.
score = 0
for j_type, df_score in models[TARGET].score_dict.items():
    print(j_type)
    score_each_type = np.mean(df_score.values)
    print(score_each_type)
    # Divide by the number of types that were actually trained instead of a
    # hard-coded 8, so a partial run (e.g. an interrupted training loop) still
    # reports a true mean. With all 8 types present the result is unchanged.
    score += score_each_type / len(models[TARGET].score_dict)
print(score)

### Check training result

In [0]:
# sns.distplot(df_pred['proba'])

In [0]:
def feat_importance(_models, _X, _imp_type='gain'):
    """Summarise normalised feature importance across a collection of models.

    Parameters
    ----------
    _models : iterable
        Fitted models exposing ``booster_.feature_importance(importance_type=...)``
        (one per fold, for example).
    _X : pandas.DataFrame
        Frame whose columns name the features, in the order the models saw them.
    _imp_type : str, default 'gain'
        Forwarded to ``feature_importance`` as ``importance_type``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with one column per model (0..n-1) holding that
        model's importances scaled to sum to 1, plus ``imp_mean`` and ``imp_std``
        across models. Rows are sorted by ``imp_mean`` in descending order.
    """
    # Materialise once so both enumerate() and len() work on any iterable.
    _models = list(_models)

    df_imp = pd.DataFrame(index=_X.columns)
    for i, model in enumerate(_models):
        df_imp[i] = model.booster_.feature_importance(importance_type=_imp_type)

    # Scale every model's column so its importances sum to 1.
    df_imp = df_imp / df_imp.sum(axis=0)

    # Use the models passed in (not a global) to select the per-model columns,
    # and fix the column list before the summary columns are appended.
    fold_cols = list(range(len(_models)))
    df_imp['imp_mean'] = df_imp[fold_cols].mean(axis=1)
    df_imp['imp_std'] = df_imp[fold_cols].std(axis=1)
    return df_imp.sort_values(by='imp_mean', ascending=False)

In [0]:
# imp = feat_importance(model_dict['1JHC'], X, _imp_type='gain')
# imp.head(100)

## Predict

In [0]:
# Read the two tables that the prediction cells below consume:
# structures.csv from the raw-data folder and the test set at TEST_PATH.
df_strct = pd.read_csv(INPUT + 'structures.csv')
df_test = pd.read_csv(TEST_PATH)

In [0]:
def predict_single(df, strct):
    """Predict the coupling constant for ``df`` with the models saved at MODEL_PATH.

    The frame is run through ``preprocess`` (in 'predict' mode) and ``drop_col``;
    the resulting feature frame is previewed, written to ``test_prepro.csv`` and
    passed to ``oof_predict`` together with the loaded models.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to predict; must contain an ``id`` column.
    strct : pandas.DataFrame
        Structure table handed to ``preprocess``.

    Returns
    -------
    pandas.DataFrame
        Copy of the ``id`` column plus a ``scalar_coupling_constant`` column.
    """
    fitted_models = joblib.load(MODEL_PATH)

    # Take the ids first: `df` itself is handed to preprocess() right after.
    submission = df[['id']].copy()

    features = drop_col(preprocess(df, strct, mode='predict'))
    display(features.head())

    # Persist the prepared features next to the notebook.
    features.to_csv('test_prepro.csv', index=False)

    submission['scalar_coupling_constant'] = oof_predict(fitted_models, features)
    return submission

In [0]:
def predict_each_type(df, strct, max_rows=None):
    """Predict TARGET for ``df`` using the per-target model file(s) on disk.

    For every target in the loop below, the pickled model ``model_<target>.pkl``
    is loaded and its ``predict(df, s_type, df_submit)`` result is added to the
    TARGET column of the submission frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to predict; must contain ``id`` and ``type`` columns.
    strct : pandas.DataFrame
        Structure table handed to ``preprocess``.
    max_rows : int or None, default None
        If given, only the first ``max_rows`` rows of ``df`` are predicted
        (useful for a quick smoke test). ``None`` predicts every row, which is
        required for a complete submission. A fixed ``head(10000)`` previously
        truncated every run.

    Returns
    -------
    pandas.DataFrame
        ``id`` column plus the accumulated TARGET predictions.
    """
    if max_rows is not None:
        df = df.head(max_rows)

    # Capture these before preprocess()/drop_col() replace `df`.
    s_type = df['type'].copy()
    df_submit = df[['id']].copy()

    df = preprocess(df, strct, mode='predict')
    df = drop_col(df)

    # Accumulator: each target's prediction is summed into TARGET.
    df_submit[TARGET] = 0
    # To combine per-contribution models instead, iterate over CONTR_COLS here.
    for target in [TARGET]:
        get_logger().info('Start prediction: %s' % target)
        model_file = 'model_%s.pkl' % target
        # NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
        model = joblib.load(model_file)

        df_submit_each_target = model.predict(df, s_type, df_submit)
        df_submit[TARGET] += df_submit_each_target[target]

    display(df_submit.head())
    # Number of rows left without a prediction.
    print((df_submit[TARGET].isnull()).sum())
    return df_submit

In [0]:
# Build the submission frame by calling predict_each_type on the test and
# structure tables loaded above.
df_submit = predict_each_type(df_test, df_strct)

In [0]:
# Show the first rows, then write only the id and TARGET columns
# (without the index) to submission.csv in the current working directory.
display(df_submit.head())
df_submit[['id', TARGET]].to_csv('submission.csv', index=False)

In [0]:
# Sanity check: (rows, columns) of the submission frame.
df_submit.shape