In [1]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [2]:
%cd "/gdrive/My Drive"

/gdrive/My Drive


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import math

from tqdm import tqdm
import joblib
import gc

## config

In [0]:
INPUT = './analysis/mole/data/raw/'
TRAIN_PATH = INPUT + 'train.csv'
TEST_PATH = INPUT + 'test.csv'
PREPROCESS = './analysis/mole/data/preprocess/'

MID_MODEL_PATH = PREPROCESS + 'middle_model.pkl'
MODEL_PATH = PREPROCESS + 'model.pkl'
ENCODER_PATH = PREPROCESS + 'le.pkl'

RUN_PLOT = True
TARGET = 'scalar_coupling_constant'
N_FOLDS = 3

atom_weight = {'H': 1.008, 'C': 12.01, 'N': 14.01, 'O':16.00}

## logging

In [0]:
import logging
import logging.handlers


def create_logger(log_file_name):
    logger_ = logging.getLogger('main')
    logger_.setLevel(logging.DEBUG)
    fh = logging.handlers.RotatingFileHandler(
        log_file_name, maxBytes=100000, backupCount=8)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger_.addHandler(fh)
    logger_.addHandler(ch)


def get_logger():
    return logging.getLogger('main')

In [0]:
create_logger('mole.log')

## util

In [0]:
def onehot(_df):
    cat_names = [name for name, col in _df.iteritems() if col.dtype == 'O']
    df_cat = pd.get_dummies(_df[cat_names])
    _df = pd.concat([_df, df_cat], axis=1).drop(cat_names, axis=1)
    return _df

def label_encode(df):
    cat_names = [name for name, col in df.iteritems() if col.dtype == 'O']    
    for cat_name in cat_names:
        print(cat_name)
        le = LabelEncoder()
        le.fit(df[cat_name].values)
        df[cat_name] = le.transform(df[cat_name].values)
    return df

class Encoder:
    def __init__(self):        
        self.encoders = {}
    
    def fit(self, df, cat_names):
        for cat_name in cat_names:
            le = LabelEncoder()
            le.fit(df[cat_name].values)
            self.encoders[cat_name] = le        
    
    def transform(self, df):
        for cat_name in self.encoders.keys():            
            df[cat_name] = self.encoders[cat_name].transform(df[cat_name].values)
            
        return df


def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    
    return df

## Preprocess

In [0]:
def map_atom_info(df, strct, atom_idx):
    df = pd.merge(df, strct, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

def calc_dist(df):
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values

    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    df['dist_x'] = (df['x_0'] - df['x_1']) ** 2
    df['dist_y'] = (df['y_0'] - df['y_1']) ** 2
    df['dist_z'] = (df['z_0'] - df['z_1']) ** 2

    return df

def divide_type(df):    
    df['type_0'] = df['type'].apply(lambda x: x[0])
    df['type_1'] = df['type'].apply(lambda x: x[1:])
    return df

In [0]:
def feature_engineering(df):
    print("Starting Feature Engineering...")
    g = df.groupby('molecule_name')
    g1 = df.groupby(['molecule_name', 'atom_index_0'])
    g2 = df.groupby(['molecule_name', 'atom_index_1'])
    g3 = df.groupby(['molecule_name', 'atom_1'])
    g4 = df.groupby(['molecule_name', 'type_0'])
    g5 = df.groupby(['molecule_name', 'type'])
    
    df['type_0'] = df['type'].apply(lambda x: x[0])
    df['molecule_couples'] = g['id'].transform('count')
    df['molecule_dist_mean'] = g['dist'].transform('mean')
    df['molecule_dist_min'] = g['dist'].transform('min')
    df['molecule_dist_max'] = g['dist'].transform('max')
    df['atom_0_couples_count'] = g1['id'].transform('count')
    df['atom_1_couples_count'] = g2['id'].transform('count')
    df[f'molecule_atom_index_0_x_1_std'] = g1['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = g1['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = g1['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = g1['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = g1['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = g1['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = g1['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = g1['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = g1['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = g2['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = g2['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = g2['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = g2['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = g3['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = g3['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = g3['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = g4['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = g5['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = g5['dist'].transform('max')
    df[f'molecule_type_dist_min'] = g5['dist'].transform('min')
    df[f'molecule_type_dist_std'] = g5['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    # TODO: back
    # df = reduce_mem_usage(df)
    
    return df

In [0]:
def add_2j_center_atom(df):    
    get_logger().info('load df_2jsim')
    
    df_2j = joblib.load(PREPROCESS + 'df_2jsim.pkl')  
    
    # atom weight
    df_2j['2j_atom_center_weight'] = df_2j['2j_atom_center'].replace(atom_weight)
    
    # sum of norm
    df_2j['2j_sum_norm_vec'] = df_2j['2j_norm_vec_02'] + df_2j['2j_norm_vec_12']
    
    df = df.merge(df_2j, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')    
    
    # replace missing vlaue to 'nan' for LabelEncoder
    df.loc[df['2j_atom_center'].isnull(), '2j_atom_center'] = 'nan'
    
    return df

def str_sort(s):
    """
    Parameters
    ----------
    x: str   
    """
    # print(s)
    if not isinstance(s, str):
        return s
    elif s[0] > s[1]:
        return s[1] + s[0]
    else:
        return s

def add_3j_center_atom(df):    
    get_logger().info('load df_3jsim')
    
    df_3j = joblib.load(PREPROCESS + 'df_3jsim.pkl')
    
    # atom weight
    s_atom_w0 = df_3j['3j_atom_center_0'].replace(atom_weight)
    s_atom_w1 = df_3j['3j_atom_center_1'].replace(atom_weight)
    df_3j['3j_atom_center_weight'] = s_atom_w0 + s_atom_w1

    # concatenate atom string 'C' + 'C' - > 'CC'
    tmp = df_3j['3j_atom_center_0'] + df_3j['3j_atom_center_1']
    df_3j['3j_atom_center'] = tmp.transform(str_sort)    
    df_3j.drop(['3j_atom_center_0', '3j_atom_center_1'], axis=1, inplace=True)
    
    # sum norm_vec
    df_3j['3j_sum_norm_vec'] = df_3j['3j_norm_vec_02'] + df_3j['3j_norm_vec_13'] + df_3j['3j_norm_vec_23']
    
    df = df.merge(df_3j, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')    
    
    # replace missing vlaue to 'nan' for LabelEncoder
    df.loc[df['3j_atom_center'].isnull(), '3j_atom_center'] = 'nan'    
    
    return df

In [0]:
def drop_col(df_org):
    df = df_org.copy()
    to_drop = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1',
               'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', #'dist_x', 'dist_y', 'dist_z',
               'atom_0', 'atom_1'
              ]
    df = df.drop(to_drop, axis=1)
    
    return df

In [0]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

def oof_train(_X, _y, _types):
    """
    Parameters
    ----------
    _X: pd.DataFrame, shape [n_samples, n_features]
    _y: array-like object, shape [n_samples]
    _types: array-like object, shsape [n_samples]
        array of `type` (e.g. 2JHC, 1JHC, 3JHH, etc.)
    """
    # TODO: divide data to training and validation about molecular
    
    models = []
    df_scores = pd.DataFrame(columns=['valid_score'])
    df_pred = pd.DataFrame(index=_X.index).reset_index(drop=True)

    fold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
    for n_fold, (train_idx, valid_idx) in enumerate(fold.split(_X, _types)):
        # prepare data
        X_train, y_train = _X.iloc[train_idx], _y.iloc[train_idx]
        X_valid, y_valid = _X.iloc[valid_idx], _y.iloc[valid_idx]
        print('mean of target. train:{}, valid:{}'.format(y_train.mean(), y_valid.mean()))

        # generate model
        model = gen_model(_X)
        
        # train
        model.fit(X_train, y_train, eval_metric='mae',
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=100,
                  early_stopping_rounds=100
                  )
        
        # validate
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
        
        types_valid = _types.iloc[valid_idx]
        valid_score = group_mean_log_mae(y_valid, y_pred, types_valid)
        get_logger().info('fold %d valid %f' % (n_fold+1, valid_score))
        
        df_scores = df_scores.append(pd.Series([valid_score], index=['valid_score']), ignore_index=True)
        df_pred.loc[valid_idx, 'proba'] = y_pred
        models.append(model)
        
        # TODO: back
        # break
    get_logger().info('CV score: %f' % df_scores.mean()[0])
    
    return models, df_scores, df_pred

def oof_predict(_models, _X):
    get_logger().info('Start oof_predict')
    y_pred = np.zeros(_X.shape[0])
        
    for i, model in enumerate(_models):
        get_logger().info('prediction: %d' % i)
        y_pred += model.predict(_X) / len(_models)
    
    get_logger().info('Finish oof_predict')
    return y_pred


def gen_model(_X):
    n_features = _X.shape[1]
    colsample_rate = max(0.7, math.sqrt(n_features)/n_features)
    
    _model = lgb.LGBMRegressor(
        learning_rate=0.2,
        n_estimators=1500,
        num_leaves=128,
        # min_child_weight=15, # good value: 0, 5, 15, 300
        min_child_samples=80,
        subsample=0.7,
        colsample_bytree=1, #colsample_rate,
        objective='regression',
        reg_lambda=0.1,
        reg_alpha=0.1,
        seed=2019
        )
    return _model


In [0]:
def preprocess(df, strct, mode, s_type=None):
    """
    Parameters
    ----------
    df: pd.DataFrame
        dataframe of train.csv or test.csv
    strct: pd.DataFrame
        dataframe of structures.csv
    mode: str
        'train' or 'predict'
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        If mode is 'train', the s_type must be specified.
    """
    get_logger().info('Start preprocess()')
    df = add_2j_center_atom(df)
    df = add_3j_center_atom(df)
    df = map_atom_info(df, strct, 0)
    df = map_atom_info(df, strct, 1)
    df = calc_dist(df)
    df = divide_type(df)
    df = feature_engineering(df)
    
    # encode
    if mode == 'train':
        enc = Encoder()
        enc.fit(df, ['type', 'type_0', 'type_1', 
                     '2j_atom_center', '3j_atom_center'])
        joblib.dump(enc, ENCODER_PATH)
    elif mode == 'predict':
        get_logger().info('loading encoder from %s' % ENCODER_PATH)
        enc = joblib.load(ENCODER_PATH)
    df = enc.transform(df)
        
    use_features = [col for col in df.columns if col not in [TARGET]]
    get_logger().info(use_features)
    df[use_features] = reduce_mem_usage(df[use_features])
    # TODO: back
    # df = add_scc_feature(df, 'fc', mode=mode, s_type=s_type)
    
    get_logger().info('Finish preprocess()')
    return df

### fermi constant

In [0]:
class CNTR:
    """Model to predict fc/sd/pso/dso columns"""
    
    def __init__(self, y_col):
        self.y_col = y_col
        
    def train(self, df_org, scc, s_type):
        """
        Parameters
        ----------
        s_type: pd.Series
            'type' column (e.g. 1JHC, 2JHH)
        """
        df = df_org.copy()
        # Merge
        key_cols = ['molecule_name', 'atom_index_0', 'atom_index_1']
        df = df.merge(scc[key_cols + [self.y_col]], how='left', on=key_cols)
        
        # drop unnecessary cols        
        df = drop_col(df)        
        
        y = df[self.y_col].copy()        
        df.drop([TARGET, self.y_col], axis=1, inplace=True)
        X = df
        
        display(X.head())
        display(y.head())
        models, scores, y_pred = oof_train(X, y, s_type)
        
        # save model
        joblib.dump(models, MID_MODEL_PATH)
        
        self.models_ = models
        self.scores_ = scores
        self.y_pred_ = y_pred
        
    def predict(self, df_org):    
        y_pred = np.zeros(df_org.shape[0])
        
        X = df_org.copy()
        X = drop_col(X)
        
        display(X.head())
        # X = self.preprocess(df_org)
        for model in self.models_:            
            y_pred += model.predict(X) / len(models)
        
        return y_pred
    
    def load_model(self):
        # load pkl by joblib
        self.models_ = joblib.load(MID_MODEL_PATH)

In [0]:
def add_scc_feature(df, cntr_name, mode, s_type=None):
    """
    Parameters
    ----------
    cntr_name: str
        'fc', 'sd', 'pso' or 'dso'
    mode: str
        'train' or 'predict'
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        If mode is 'train', the s_type must be specified.
    """
    add_feature = '%s_pred' % cntr_name
    cntr = CNTR(cntr_name)
    if mode == 'train': 
        assert s_type is not None, 's_type must be specified.'
        
        get_logger().info('start loading scalar_coupling_contributions')
        scc = pd.read_csv(INPUT + 'scalar_coupling_contributions.csv')
        get_logger().info('finished loading scalar_coupling_contributions')
        
        # train contribution(fc/sd/pso/dso)
        cntr.train(df, scc, s_type)
    
        display(cntr.y_pred_.head())
        df[add_feature] = cntr.y_pred_
    elif mode == 'predict':
        cntr.load_model()
        y_pred = cntr.predict(df)
        df[add_feature] = y_pred
    
    return df

## Train

In [16]:
df_train = pd.read_csv(TRAIN_PATH)
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


Joint type:  
1JHC, 1JHN, 2JHH, 2JHN, 3JHH, 3JHC, 3JHN

In [17]:
df_strct = pd.read_csv(INPUT + 'structures.csv')
df_strct.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [0]:
def train_single_model(df, strct):
    # TODO: back
    df = df.head(10000)

    s_type = df['type'].copy()

    df = preprocess(df, strct, mode='train', s_type=s_type)
    df = drop_col(df)

    y = df[TARGET].copy()
    df.drop([TARGET], axis=1, inplace=True)
    X = df
    
    display(X.head())
    display(y.head())
    models, df_scores, df_pred = oof_train(X, y, s_type)

    joblib.dump(models, MODEL_PATH)
    
    return models, df_scores, df_pred

In [0]:
def drop_uneffect_feature(df):
    """
    Drop uneffective features from dataframe
    """
    for col in df.columns:
        if len(df[col].unique()) == 1:
            df.drop(col, axis=1, inplace=True)
    return df

In [0]:
def train_models_each_type(df, strct):
    # TODO:back
    # df = df.head(10000)
    
    s_type = df['type'].copy()
    
    df = preprocess(df, strct, mode='train', s_type=s_type)
    df = drop_col(df)
    
    model_dict = {}
    score_dict = {}
    pred_dict = {}
    coupling_types = s_type.unique()
    for coup_type in coupling_types:
        get_logger().info('Starting train model(%s)' % coup_type)
        is_the_type = (s_type == coup_type)
        df_type = df[is_the_type]
        

        
        y = df_type[TARGET]
        df_type.drop([TARGET], axis=1, inplace=True)
        X = df_type
        X = drop_uneffect_feature(X)
        
        display(X.head())
        display(y.head())
        models, df_scores, df_pred = oof_train(X, y, _types=s_type[is_the_type].reset_index(drop=True))
        
        model_dict[coup_type] = models
        score_dict[coup_type] = df_scores
        pred_dict[coup_type] = df_pred
        
    joblib.dump(model_dict, MODEL_PATH)
    
    return model_dict, score_dict, pred_dict

In [21]:
# models, df_scores, df_pred = train_single_model(df_train, df_strct)
model_dict, score_dict, pred_dict = train_models_each_type(df_train, df_strct)

[INFO]2019-06-23 13:34:39,689:main:Start preprocess()
[INFO]2019-06-23 13:34:39,692:main:load df_2jsim
[INFO]2019-06-23 13:34:45,359:main:load df_3jsim


Starting Feature Engineering...


[INFO]2019-06-23 13:35:33,158:main:['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', '2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', '3j_norm_vec_02', '3j_norm_vec_13', '3j_norm_vec_23', '3j_cos_023', '3j_cos_231', '3j_area_023', '3j_area_231', '3j_dihedral', '3j_atom_center_weight', '3j_atom_center', '3j_sum_norm_vec', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z', 'type_0', 'type_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean', 'molecule_atom_index_

Mem. usage decreased to 910.68 Mb (71.8% reduction)


[INFO]2019-06-23 13:37:27,034:main:Finish preprocess()
[INFO]2019-06-23 13:37:29,119:main:Starting train model(1JHC)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,1.091797,0.000221,1.192383,3.6e-05,10,1.506836,1.091797,1.783203,4,4,0.728027,1.358398,0.272949,1.25138,1.463867,0.37793,0.182251,0.728027,1.610352,0.518555,1.474609,1.783203,0.691406,1.632812,1.091797,0.345703,-0.746582,0.316406,1.091797,-3e-06,1.0,1.091797,0.0,1.0,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
4,1.091797,1.049805,0.142822,6e-05,10,1.506836,1.091797,1.783203,3,4,0.300049,1.324219,0.237915,1.219147,1.447266,0.361816,0.206177,0.891602,1.552734,0.460693,1.421875,1.783203,0.691406,1.632812,1.091797,0.39917,-0.692871,0.365479,1.091797,-2e-06,1.0,1.091797,1e-06,1.0,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
7,1.091797,0.278809,0.130859,0.782715,10,1.506836,1.091797,1.783203,2,4,0.361328,1.261719,0.176025,1.162151,1.4375,0.352051,0.249023,0.635254,1.4375,0.345703,1.316406,1.783203,0.691406,1.632812,1.091797,0.48877,-0.603027,0.44751,1.091797,3e-06,1.0,1.091797,7e-06,1.0,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,3e-06,1.0,1.091797,1.091797,3e-06,-1.091797
9,1.091797,0.26123,0.124023,0.807129,10,1.506836,1.091797,1.783203,1,4,,1.085938,0.0,1.0,1.085938,0.0,,,1.091797,0.0,1.0,1.091797,0.0,1.0,1.091797,,,,1.091797,2e-06,1.0,1.091797,6e-06,1.0,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,3e-06,-1.091797,1.091797,2e-06,1.0,1.091797,1.091797,3e-06,-1.091797
17,1.066406,0.00021,1.137695,3.5e-05,2,1.642578,1.066406,2.21875,2,1,0.011055,0.556641,-0.575684,0.491541,1.132812,0.0,0.814453,0.00449,1.642578,0.575684,1.540039,2.21875,1.151367,2.080078,1.066406,0.814453,-0.252197,0.763672,1.066406,0.0,1.0,1.066406,0.0,1.0,1.066406,0.0,1.0,,,,1.066406,1.066406,0.0,1.0,,,,,1.066406,0.0,1.0,1.066406,1.066406,,


0      84.8076
4      84.8074
7      84.8093
9      84.8095
17    171.2200
Name: scalar_coupling_constant, dtype: float64

mean of target. train:94.98392857568842, valid:94.96060144118587
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8.37789	training's l1: 2.04058	valid_1's l2: 10.4251	valid_1's l1: 2.23008
[200]	training's l2: 6.27101	training's l1: 1.77625	valid_1's l2: 9.33032	valid_1's l1: 2.09699
[300]	training's l2: 5.04069	training's l1: 1.60017	valid_1's l2: 8.80841	valid_1's l1: 2.02941
[400]	training's l2: 4.21905	training's l1: 1.46719	valid_1's l2: 8.45903	valid_1's l1: 1.9839
[500]	training's l2: 3.55357	training's l1: 1.35433	valid_1's l2: 8.22118	valid_1's l1: 1.95341
[600]	training's l2: 3.06055	training's l1: 1.26017	valid_1's l2: 8.07412	valid_1's l1: 1.93348
[700]	training's l2: 2.65849	training's l1: 1.17821	valid_1's l2: 7.95239	valid_1's l1: 1.91723
[800]	training's l2: 2.33051	training's l1: 1.10523	valid_1's l2: 7.85147	valid_1's l1: 1.90286
[900]	training's l2: 2.03966	training's l1: 1.03737	valid_1's l2: 7.7643	valid_1's l1: 1.89099
[1000]	tra

[INFO]2019-06-23 13:41:25,704:main:fold 1 valid 0.463914


mean of target. train:94.9623837111789, valid:95.00369117020259
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8.36815	training's l1: 2.04152	valid_1's l2: 10.3788	valid_1's l1: 2.22834
[200]	training's l2: 6.30404	training's l1: 1.78023	valid_1's l2: 9.28252	valid_1's l1: 2.09332
[300]	training's l2: 5.06641	training's l1: 1.60506	valid_1's l2: 8.73278	valid_1's l1: 2.02387
[400]	training's l2: 4.22777	training's l1: 1.47146	valid_1's l2: 8.41766	valid_1's l1: 1.98197
[500]	training's l2: 3.58164	training's l1: 1.35859	valid_1's l2: 8.18604	valid_1's l1: 1.94949
[600]	training's l2: 3.06601	training's l1: 1.26263	valid_1's l2: 8.02711	valid_1's l1: 1.9282
[700]	training's l2: 2.66833	training's l1: 1.18017	valid_1's l2: 7.90916	valid_1's l1: 1.91206
[800]	training's l2: 2.3454	training's l1: 1.10814	valid_1's l2: 7.81167	valid_1's l1: 1.89733
[900]	training's l2: 2.06086	training's l1: 1.04179	valid_1's l2: 7.72496	valid_1's l1: 1.8868
[1000]	train

[INFO]2019-06-23 13:45:14,187:main:fold 2 valid 0.452015


mean of target. train:94.98214630569106, valid:94.9641659811711
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8.34331	training's l1: 2.03947	valid_1's l2: 10.48	valid_1's l1: 2.23444
[200]	training's l2: 6.25821	training's l1: 1.77373	valid_1's l2: 9.34056	valid_1's l1: 2.09384
[300]	training's l2: 4.98567	training's l1: 1.59459	valid_1's l2: 8.79008	valid_1's l1: 2.02581
[400]	training's l2: 4.1586	training's l1: 1.46154	valid_1's l2: 8.48874	valid_1's l1: 1.98564
[500]	training's l2: 3.52038	training's l1: 1.35007	valid_1's l2: 8.24982	valid_1's l1: 1.95405
[600]	training's l2: 3.02694	training's l1: 1.25681	valid_1's l2: 8.09528	valid_1's l1: 1.93367
[700]	training's l2: 2.62212	training's l1: 1.17308	valid_1's l2: 7.96298	valid_1's l1: 1.9152
[800]	training's l2: 2.29214	training's l1: 1.09966	valid_1's l2: 7.87381	valid_1's l1: 1.90245
[900]	training's l2: 2.03347	training's l1: 1.03656	valid_1's l2: 7.80551	valid_1's l1: 1.89171
[1000]	traini

[INFO]2019-06-23 13:49:01,021:main:fold 3 valid 0.470574
[INFO]2019-06-23 13:49:01,054:main:CV score: 0.462168
[INFO]2019-06-23 13:49:01,063:main:Starting train model(2JHH)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
1,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,1.019531,2.160156,3e-06,10,1.506836,1.091797,1.783203,4,1,0.728027,1.358398,-0.10498,0.928268,1.463867,0.0,0.182251,0.728027,1.610352,-0.172729,0.90332,1.783203,3.7e-05,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
2,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,0.294922,2.113281,0.771973,10,1.506836,1.091797,1.783203,4,2,0.728027,1.358398,-0.088745,0.938673,1.463867,0.01622,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,5e-06,1.0,1.783203,1e-05,1.0,1.783203,0.0,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203
3,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,0.276611,2.085938,0.817871,10,1.506836,1.091797,1.783203,4,3,0.728027,1.358398,-0.079163,0.944936,1.463867,0.025818,0.182251,0.728027,1.610352,-0.172852,0.90332,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.345703,-1.4375,0.193848,1.783203,-6e-06,1.0,1.783203,0.0,1.0,1.783203,-8.821487e-06,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
5,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,2.410156,0.000263,0.769043,10,1.506836,1.091797,1.783203,3,2,0.300049,1.324219,-0.123779,0.914494,1.447266,0.0,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,0.0,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,-5e-06,1.0,1.783203,0.0,1.0,1.783203,-1.019239e-05,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-1.1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203
6,0,1.124023,1.091797,1.091797,-0.333252,12.007812,2.183594,1.783203,2.357422,0.000667,0.821289,10,1.506836,1.091797,1.783203,3,3,0.300049,1.324219,-0.114197,0.920596,1.447266,0.009598,0.206177,0.891602,1.552734,-0.230347,0.870605,1.783203,9e-06,1.0,1.091797,-0.691406,0.612305,0.39917,-1.383789,0.223755,1.783203,3e-06,1.0,1.783203,8e-06,1.0,1.783203,-5.364418e-07,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203,1.4e-05,-1.783203,1.783203,-2e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203


1   -11.2570
2   -11.2548
3   -11.2543
5   -11.2541
6   -11.2548
Name: scalar_coupling_constant, dtype: float64

mean of target. train:-10.288322401219846, valid:-10.283170689505688
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.3023	training's l1: 0.362926	valid_1's l2: 0.468301	valid_1's l1: 0.429086
[200]	training's l2: 0.19577	training's l1: 0.298393	valid_1's l2: 0.417323	valid_1's l1: 0.40267
[300]	training's l2: 0.141591	training's l1: 0.258136	valid_1's l2: 0.39829	valid_1's l1: 0.3923
[400]	training's l2: 0.105653	training's l1: 0.22661	valid_1's l2: 0.385726	valid_1's l1: 0.385318
[500]	training's l2: 0.0820922	training's l1: 0.201369	valid_1's l2: 0.377532	valid_1's l1: 0.380085
[600]	training's l2: 0.064863	training's l1: 0.180838	valid_1's l2: 0.372407	valid_1's l1: 0.377321
[700]	training's l2: 0.0523272	training's l1: 0.163522	valid_1's l2: 0.36842	valid_1's l1: 0.375038
[800]	training's l2: 0.0427598	training's l1: 0.14831	valid_1's l2: 0.365113	valid_1's l1: 0.372952
[900]	training's l2: 0.0353828	training's l1: 0.135273	valid_1's l2: 0.3629

[INFO]2019-06-23 13:51:58,450:main:fold 1 valid -1.158065


mean of target. train:-10.285182848189041, valid:-10.289449795567212
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.300433	training's l1: 0.361649	valid_1's l2: 0.472132	valid_1's l1: 0.428091
[200]	training's l2: 0.195244	training's l1: 0.298961	valid_1's l2: 0.423204	valid_1's l1: 0.403507
[300]	training's l2: 0.140544	training's l1: 0.257675	valid_1's l2: 0.402162	valid_1's l1: 0.391462
[400]	training's l2: 0.106341	training's l1: 0.227041	valid_1's l2: 0.391062	valid_1's l1: 0.385099
[500]	training's l2: 0.0822185	training's l1: 0.202223	valid_1's l2: 0.38252	valid_1's l1: 0.380409
[600]	training's l2: 0.0649979	training's l1: 0.181435	valid_1's l2: 0.377404	valid_1's l1: 0.377488
[700]	training's l2: 0.052755	training's l1: 0.164088	valid_1's l2: 0.373804	valid_1's l1: 0.375352
[800]	training's l2: 0.0431875	training's l1: 0.149064	valid_1's l2: 0.370665	valid_1's l1: 0.373501
[900]	training's l2: 0.0357852	training's l1: 0.13595	valid_1's l2

[INFO]2019-06-23 13:54:49,083:main:fold 2 valid -1.156123


mean of target. train:-10.28631024253647, valid:-10.287195006872503
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.300613	training's l1: 0.361697	valid_1's l2: 0.474897	valid_1's l1: 0.429222
[200]	training's l2: 0.193831	training's l1: 0.296971	valid_1's l2: 0.425623	valid_1's l1: 0.403837
[300]	training's l2: 0.138526	training's l1: 0.255762	valid_1's l2: 0.402636	valid_1's l1: 0.391781
[400]	training's l2: 0.104911	training's l1: 0.224845	valid_1's l2: 0.390623	valid_1's l1: 0.38485
[500]	training's l2: 0.0819113	training's l1: 0.200618	valid_1's l2: 0.383305	valid_1's l1: 0.380787
[600]	training's l2: 0.0646814	training's l1: 0.180083	valid_1's l2: 0.377566	valid_1's l1: 0.377693
[700]	training's l2: 0.0522971	training's l1: 0.162922	valid_1's l2: 0.373322	valid_1's l1: 0.37532
[800]	training's l2: 0.0424645	training's l1: 0.147743	valid_1's l2: 0.370064	valid_1's l1: 0.373621
[900]	training's l2: 0.0350878	training's l1: 0.134605	valid_1's l2

[INFO]2019-06-23 13:57:41,392:main:fold 3 valid -1.139932
[INFO]2019-06-23 13:57:41,413:main:CV score: -1.151374
[INFO]2019-06-23 13:57:41,417:main:Starting train model(1JHN)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
10,1.017578,0.003328,1.023438,0.008087,6,1.318359,1.017578,1.619141,3,3,0.730957,1.242188,0.218018,1.212888,1.358398,0.334717,0.188965,0.459717,1.417969,0.400879,1.394531,1.619141,0.601562,1.591797,1.017578,0.347168,-0.669922,0.341309,1.017578,5e-06,1.0,1.017578,1.8e-05,1.0,1.017578,-3e-06,1.0,1.1e-05,-1.017578,1.1e-05,1.017578,1.017578,-3e-06,1.0,1.1e-05,-1.017578,1.1e-05,-1.017578,1.017578,5e-06,1.0,1.017578,1.017578,1.1e-05,-1.017578
13,1.017578,0.914551,0.112,0.008339,6,1.318359,1.017578,1.619141,2,3,0.339355,1.183594,0.159668,1.155953,1.34375,0.319336,0.22583,0.592773,1.318359,0.300781,1.295898,1.619141,0.601562,1.591797,1.017578,0.425293,-0.591797,0.418213,1.017578,8e-06,1.0,1.017578,2.1e-05,1.0,1.017578,0.0,1.0,1.1e-05,-1.017578,1.1e-05,1.017578,1.017578,0.0,1.0,1.1e-05,-1.017578,1.1e-05,-1.017578,1.017578,8e-06,1.0,1.017578,1.017578,1.1e-05,-1.017578
15,1.017578,0.230225,0.102051,0.702637,6,1.318359,1.017578,1.619141,1,3,,1.024414,0.0,1.0,1.024414,0.0,,,1.017578,0.0,1.0,1.017578,0.0,1.0,1.017578,,,,1.017578,-1.3e-05,1.0,1.017578,0.0,1.0,1.017578,-2.1e-05,1.0,1.1e-05,-1.017578,1.1e-05,1.017578,1.017578,-2.1e-05,1.0,1.1e-05,-1.017578,1.1e-05,-1.017578,1.017578,-1.3e-05,1.0,1.017578,1.017578,1.1e-05,-1.017578
97,1.007812,0.724609,0.290527,2.7e-05,9,1.80957,1.004883,2.960938,4,3,0.546875,0.671387,-0.674805,0.498868,1.827148,0.480713,1.09082,0.007298,1.938477,0.930664,1.923828,2.960938,1.953125,2.9375,1.007812,0.809082,-0.198242,0.803223,1.353516,0.345703,1.342773,2.046875,1.040039,2.03125,1.004883,-0.002577,0.997559,0.601074,-0.406494,0.59668,1.353516,1.004883,-0.002577,0.997559,0.601074,-0.406494,0.059509,-0.947754,1.005859,-0.001288,0.998535,1.007812,1.004883,0.001822,-1.005859
101,1.004883,0.778809,0.231079,0.0001,9,1.80957,1.004883,2.960938,3,3,0.562012,0.286377,-1.05957,0.212805,1.345703,0.0,0.946289,0.004223,1.792969,0.788086,1.78418,2.302734,1.297852,2.291016,1.004883,0.692383,-0.312744,0.688965,1.353516,0.348389,1.34668,2.046875,1.041992,2.037109,1.004883,0.0,1.0,0.601074,-0.403809,0.598145,1.353516,1.004883,0.0,1.0,0.601074,-0.403809,0.059509,-0.945312,1.005859,0.001288,1.000977,1.007812,1.004883,0.001822,-1.00293


10     32.6889
13     32.6891
15     32.6905
97     55.5252
101    54.7359
Name: scalar_coupling_constant, dtype: float64

mean of target. train:47.485131115953955, valid:47.4693919474233
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.431148	training's l1: 0.459721	valid_1's l2: 2.28412	valid_1's l1: 1.00539
[200]	training's l2: 0.144891	training's l1: 0.256693	valid_1's l2: 2.17874	valid_1's l1: 0.982588
[300]	training's l2: 0.0647726	training's l1: 0.157254	valid_1's l2: 2.14414	valid_1's l1: 0.975161
[400]	training's l2: 0.0339024	training's l1: 0.101918	valid_1's l2: 2.13244	valid_1's l1: 0.972659
[500]	training's l2: 0.0209462	training's l1: 0.0691715	valid_1's l2: 2.12944	valid_1's l1: 0.971363
[600]	training's l2: 0.0145138	training's l1: 0.0487421	valid_1's l2: 2.12653	valid_1's l1: 0.970301
Early stopping, best iteration is:
[595]	training's l2: 0.0148182	training's l1: 0.0495872	valid_1's l2: 2.12635	valid_1's l1: 0.970344


[INFO]2019-06-23 13:58:13,646:main:fold 1 valid 0.069307


mean of target. train:47.46433043342862, valid:47.5109936626538
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.410627	training's l1: 0.460942	valid_1's l2: 2.28741	valid_1's l1: 1.00883
[200]	training's l2: 0.126802	training's l1: 0.254974	valid_1's l2: 2.19353	valid_1's l1: 0.986916
[300]	training's l2: 0.0492803	training's l1: 0.154123	valid_1's l2: 2.16069	valid_1's l1: 0.979988
[400]	training's l2: 0.0229993	training's l1: 0.0983836	valid_1's l2: 2.14763	valid_1's l1: 0.977483
[500]	training's l2: 0.0124255	training's l1: 0.0650478	valid_1's l2: 2.14041	valid_1's l1: 0.975567
[600]	training's l2: 0.00760749	training's l1: 0.044471	valid_1's l2: 2.13483	valid_1's l1: 0.9744
[700]	training's l2: 0.00520191	training's l1: 0.0313783	valid_1's l2: 2.13209	valid_1's l1: 0.973837
[800]	training's l2: 0.00372332	training's l1: 0.0227364	valid_1's l2: 2.13045	valid_1's l1: 0.973544
[900]	training's l2: 0.0027157	training's l1: 0.0171862	valid_1's l2: 2

[INFO]2019-06-23 13:59:20,302:main:fold 2 valid -0.127946


mean of target. train:47.49019208550947, valid:47.45926856925414
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.424971	training's l1: 0.45631	valid_1's l2: 2.27679	valid_1's l1: 1.00512
[200]	training's l2: 0.143369	training's l1: 0.254895	valid_1's l2: 2.17441	valid_1's l1: 0.983772
[300]	training's l2: 0.0638565	training's l1: 0.15594	valid_1's l2: 2.14556	valid_1's l1: 0.977331
[400]	training's l2: 0.0348474	training's l1: 0.100982	valid_1's l2: 2.12965	valid_1's l1: 0.973843
[500]	training's l2: 0.02151	training's l1: 0.0680214	valid_1's l2: 2.12465	valid_1's l1: 0.972947
[600]	training's l2: 0.0148212	training's l1: 0.0478019	valid_1's l2: 2.12096	valid_1's l1: 0.972438
[700]	training's l2: 0.010812	training's l1: 0.0346842	valid_1's l2: 2.11843	valid_1's l1: 0.972024
[800]	training's l2: 0.00833906	training's l1: 0.0259688	valid_1's l2: 2.11587	valid_1's l1: 0.971613
[900]	training's l2: 0.00671024	training's l1: 0.0201679	valid_1's l2: 2.11

[INFO]2019-06-23 14:00:16,048:main:fold 3 valid -0.095027
[INFO]2019-06-23 14:00:16,063:main:CV score: -0.051222
[INFO]2019-06-23 14:00:16,066:main:Starting train model(2JHN)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
18,0,0.0,1.066406,1.151367,-1.0,12.007812,2.21875,2.21875,0.000907,4.921875,0.0001494884,2,1.642578,1.066406,2.21875,2,1,0.011055,0.556641,0.575684,-29.055043,1.132812,1.151367,0.814453,0.00449,1.642578,-0.575684,0.740234,2.21875,0.0,1.0,1.066406,-1.151367,0.480713,0.814453,-1.404297,0.367188,2.21875,0.0,1.0,2.21875,0.0,1.0,2.21875,0.0,1.0,,,,2.21875,2.21875,0.0,1.0,,,,,2.21875,0.0,1.0,2.21875,2.21875,,
104,0,1.402344,1.109375,1.359375,-0.368652,12.007812,2.46875,2.046875,0.875,3.316406,6.556511e-07,9,1.80957,1.004883,2.960938,2,3,0.051178,0.666992,-0.678711,0.495638,1.345703,0.0,0.959961,0.005436,1.578125,-0.468994,0.770996,2.046875,0.0,1.0,1.109375,-0.937988,0.541992,0.663574,-1.383789,0.323975,1.353516,-0.694336,0.661133,2.046875,0.0,1.0,1.004883,-1.041992,0.490723,0.601074,-1.446289,0.293701,1.353516,1.004883,-1.041992,0.490723,0.601074,-1.446289,0.161255,-1.885742,2.046875,0.0,1.0,2.046875,2.046875,,
400,0,1.458008,1.102539,1.463867,-0.429443,12.007812,2.566406,2.177734,0.276123,3.595703,0.8720703,20,2.130859,1.014648,3.285156,5,4,1.214844,-0.062622,-1.549805,-0.042105,1.487305,0.0,0.936035,0.37793,2.048828,-0.12854,0.940918,3.123047,0.944824,1.433594,1.102539,-1.075195,0.506348,0.73291,-1.445312,0.336426,1.574219,-0.604004,0.722656,2.177734,0.0,1.0,1.014648,-1.163086,0.466064,0.646973,-1.53125,0.296875,2.09375,1.014648,-1.163086,0.466064,0.948242,-1.230469,0.18811,-1.990234,2.132812,-0.044678,0.979492,2.177734,2.089844,0.063171,-2.115234
405,0,1.520508,1.095703,1.463867,-0.317871,12.007812,2.560547,2.089844,0.318604,3.359375,0.6855469,20,2.130859,1.014648,3.285156,4,4,1.170898,0.008034,-1.479492,0.005403,1.487305,0.0,1.06543,0.02417,2.103516,0.014084,1.006836,3.142578,1.054688,1.504883,1.095703,-0.992676,0.524902,0.835938,-1.25293,0.400146,1.574219,-0.514648,0.753906,2.177734,0.089355,1.042969,1.014648,-1.074219,0.48584,0.646973,-1.442383,0.30957,2.09375,1.014648,-1.074219,0.48584,0.948242,-1.140625,0.18811,-1.900391,2.132812,0.044678,1.021484,2.177734,2.089844,0.063171,-2.025391
695,2,1.326172,0.962891,1.407227,-0.207031,16.0,2.371094,1.862305,0.003351,0.032227,3.433594,32,2.222656,1.09082,3.513672,2,7,0.061554,-0.347656,0.344238,0.502479,-0.003431,0.688477,0.486816,0.759277,2.417969,0.555664,1.298828,2.974609,1.111328,1.59668,1.862305,0.0,1.0,0.786133,-1.076172,0.421875,2.779297,0.916504,1.492188,3.341797,1.478516,1.793945,1.862305,0.0,1.0,0.494385,-1.368164,0.265381,2.779297,1.862305,0.0,1.0,0.494385,-1.368164,0.190552,-1.671875,1.862305,0.0,1.0,1.862305,1.862305,,


18      5.182460
104    15.513500
400     0.633806
405     1.378300
695     0.784661
Name: scalar_coupling_constant, dtype: float64

mean of target. train:3.1358723658649814, valid:3.1025161085255877
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.163965	training's l1: 0.274155	valid_1's l2: 0.457483	valid_1's l1: 0.417196
[200]	training's l2: 0.0765871	training's l1: 0.192098	valid_1's l2: 0.417195	valid_1's l1: 0.398892
[300]	training's l2: 0.0415392	training's l1: 0.143783	valid_1's l2: 0.402644	valid_1's l1: 0.391497
[400]	training's l2: 0.023901	training's l1: 0.110279	valid_1's l2: 0.395712	valid_1's l1: 0.388222
[500]	training's l2: 0.0146214	training's l1: 0.0865814	valid_1's l2: 0.391906	valid_1's l1: 0.386473
[600]	training's l2: 0.00902933	training's l1: 0.0684791	valid_1's l2: 0.389193	valid_1's l1: 0.38505
[700]	training's l2: 0.00571947	training's l1: 0.0547829	valid_1's l2: 0.387308	valid_1's l1: 0.384201
[800]	training's l2: 0.00374255	training's l1: 0.0442341	valid_1's l2: 0.385777	valid_1's l1: 0.383394
[900]	training's l2: 0.00248409	training's l1: 0.0360093	v

[INFO]2019-06-23 14:01:59,418:main:fold 1 valid -1.369323


mean of target. train:3.106771189731069, valid:3.1607184607934324
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.162425	training's l1: 0.274192	valid_1's l2: 0.430759	valid_1's l1: 0.41891
[200]	training's l2: 0.0748257	training's l1: 0.191811	valid_1's l2: 0.394062	valid_1's l1: 0.399773
[300]	training's l2: 0.0400764	training's l1: 0.142688	valid_1's l2: 0.380231	valid_1's l1: 0.392241
[400]	training's l2: 0.0230566	training's l1: 0.109422	valid_1's l2: 0.373962	valid_1's l1: 0.388665
[500]	training's l2: 0.0139219	training's l1: 0.0854698	valid_1's l2: 0.370432	valid_1's l1: 0.386645
[600]	training's l2: 0.00865345	training's l1: 0.0676734	valid_1's l2: 0.367616	valid_1's l1: 0.384959
[700]	training's l2: 0.00549449	training's l1: 0.0539025	valid_1's l2: 0.365934	valid_1's l1: 0.383915
[800]	training's l2: 0.00352998	training's l1: 0.0433615	valid_1's l2: 0.365164	valid_1's l1: 0.383385
[900]	training's l2: 0.00234243	training's l1: 0.0352017	v

[INFO]2019-06-23 14:03:38,407:main:fold 2 valid -1.286160


mean of target. train:3.1316172846594754, valid:3.1110262709365695
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.163601	training's l1: 0.275668	valid_1's l2: 0.460966	valid_1's l1: 0.424981
[200]	training's l2: 0.0759483	training's l1: 0.192802	valid_1's l2: 0.424205	valid_1's l1: 0.406754
[300]	training's l2: 0.0410143	training's l1: 0.143959	valid_1's l2: 0.408721	valid_1's l1: 0.399023
[400]	training's l2: 0.023647	training's l1: 0.110676	valid_1's l2: 0.401099	valid_1's l1: 0.395143
[500]	training's l2: 0.0140814	training's l1: 0.0861992	valid_1's l2: 0.396575	valid_1's l1: 0.392921
[600]	training's l2: 0.00886694	training's l1: 0.0684604	valid_1's l2: 0.394219	valid_1's l1: 0.391534
[700]	training's l2: 0.00557125	training's l1: 0.0544988	valid_1's l2: 0.392285	valid_1's l1: 0.390492
[800]	training's l2: 0.00361738	training's l1: 0.0438623	valid_1's l2: 0.391188	valid_1's l1: 0.389831
[900]	training's l2: 0.00241157	training's l1: 0.0356725	

[INFO]2019-06-23 14:05:19,581:main:fold 3 valid -1.400766
[INFO]2019-06-23 14:05:19,593:main:CV score: -1.352083
[INFO]2019-06-23 14:05:19,596:main:Starting train model(2JHC)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
20,0,1.558594,1.094727,1.529297,-0.365479,12.007812,2.625,2.181641,0.985352,3.777344,8.940697e-07,27,2.029297,1.094727,3.095703,7,6,0.572754,0.592285,0.596191,-152.615283,1.923828,1.927734,1.134766,0.720703,2.140625,-0.040985,0.981445,3.095703,0.913574,1.418945,1.094727,-1.087891,0.501465,0.660156,-1.522461,0.30249,1.638672,-0.543945,0.750977,2.181641,4.7e-05,1.0,1.094727,-1.087891,0.501465,0.595703,-1.586914,0.272949,1.638672,1.094727,-1.087891,0.501465,0.567871,-1.614258,0.217896,-1.964844,2.181641,1.2e-05,1.0,2.181641,2.181641,2.6e-05,-2.181641
27,0,1.558594,1.094727,1.529297,-0.365479,12.007812,2.625,2.181641,0.296143,3.714844,0.7519531,27,2.029297,1.094727,3.095703,6,6,0.598145,0.370605,0.374512,-95.462501,1.914062,1.917969,1.063477,0.667969,2.205078,0.021729,1.009766,3.095703,0.913574,1.418945,1.094727,-1.087891,0.501465,0.699707,-1.482422,0.320557,1.638672,-0.543945,0.750977,2.181641,5.2e-05,1.0,1.094727,-1.087891,0.501465,0.595703,-1.586914,0.272949,1.638672,1.094727,-1.087891,0.501465,0.567871,-1.614258,0.217896,-1.964844,2.181641,1.7e-05,1.0,2.181641,2.181641,2.6e-05,-2.181641
33,0,1.558594,1.094727,1.529297,-0.365723,12.007812,2.625,2.181641,0.278076,3.679688,0.8066406,27,2.029297,1.094727,3.095703,5,6,0.624023,0.061859,0.065735,-15.934839,1.525391,1.529297,0.836426,0.624023,2.292969,0.109497,1.049805,3.095703,0.913574,1.418945,1.094727,-1.087891,0.501465,0.744629,-1.4375,0.341064,1.638672,-0.543945,0.750977,2.181641,0.0,1.0,1.094727,-1.087891,0.501465,0.595703,-1.586914,0.272949,1.638672,1.094727,-1.087891,0.501465,0.567871,-1.614258,0.217896,-1.964844,2.181641,-3.5e-05,1.0,2.181641,2.181641,2.6e-05,-2.181641
37,0,1.558594,1.094727,1.529297,-0.365479,12.007812,2.625,2.181641,0.296143,3.714844,0.7519531,27,2.029297,1.094727,3.095703,4,6,0.63623,0.177856,-1.347656,0.116545,1.525391,0.0,0.918457,0.44751,1.702148,-0.480469,0.779785,2.181641,0.0,1.0,1.094727,-1.087891,0.501465,0.449951,-1.732422,0.206177,1.638672,-0.543945,0.750977,2.181641,5.2e-05,1.0,1.094727,-1.087891,0.501465,0.595703,-1.586914,0.272949,1.638672,1.094727,-1.087891,0.501465,0.567871,-1.614258,0.217896,-1.964844,2.181641,1.9e-05,1.0,2.181641,2.181641,2.6e-05,-2.181641
41,0,1.558594,1.094727,1.529297,-0.365479,12.007812,2.625,2.181641,0.98584,3.777344,8.34465e-07,27,2.029297,1.094727,3.095703,3,6,0.298584,0.376465,-1.149414,0.246732,1.525391,0.0,1.013672,0.516113,1.680664,-0.501465,0.77002,2.181641,0.0,1.0,1.094727,-1.087891,0.501465,0.548828,-1.633789,0.251465,1.638672,-0.543945,0.750977,2.181641,5.2e-05,1.0,1.094727,-1.087891,0.501465,0.595703,-1.586914,0.272949,1.638672,1.094727,-1.087891,0.501465,0.567871,-1.614258,0.217896,-1.964844,2.181641,1.9e-05,1.0,2.181641,2.181641,2.6e-05,-2.181641


20   -2.37831
27   -2.37862
33   -2.37716
37   -2.37876
41   -2.37852
Name: scalar_coupling_constant, dtype: float64

mean of target. train:-0.2669883847266586, valid:-0.27789653463344455
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 1.6332	training's l1: 0.867872	valid_1's l2: 1.91016	valid_1's l1: 0.923244
[200]	training's l2: 1.24348	training's l1: 0.76365	valid_1's l2: 1.65837	valid_1's l1: 0.857287
[300]	training's l2: 1.02243	training's l1: 0.696098	valid_1's l2: 1.53095	valid_1's l1: 0.820965
[400]	training's l2: 0.870918	training's l1: 0.646272	valid_1's l2: 1.45552	valid_1's l1: 0.799348
[500]	training's l2: 0.752034	training's l1: 0.603729	valid_1's l2: 1.39921	valid_1's l1: 0.782654
[600]	training's l2: 0.661064	training's l1: 0.569103	valid_1's l2: 1.36124	valid_1's l1: 0.771201
[700]	training's l2: 0.590387	training's l1: 0.539444	valid_1's l2: 1.33415	valid_1's l1: 0.762335
[800]	training's l2: 0.526087	training's l1: 0.511503	valid_1's l2: 1.30922	valid_1's l1: 0.754576
[900]	training's l2: 0.475537	training's l1: 0.487523	valid_1's l2: 1.29055	vali

[INFO]2019-06-23 14:12:10,307:main:fold 1 valid -0.387391


mean of target. train:-0.2697028296664154, valid:-0.2724676518929613
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 1.62467	training's l1: 0.863348	valid_1's l2: 1.92327	valid_1's l1: 0.919126
[200]	training's l2: 1.24145	training's l1: 0.762214	valid_1's l2: 1.67449	valid_1's l1: 0.855964
[300]	training's l2: 1.02228	training's l1: 0.69634	valid_1's l2: 1.55393	valid_1's l1: 0.822628
[400]	training's l2: 0.869003	training's l1: 0.644853	valid_1's l2: 1.47249	valid_1's l1: 0.798636
[500]	training's l2: 0.758407	training's l1: 0.605075	valid_1's l2: 1.424	valid_1's l1: 0.784118
[600]	training's l2: 0.666919	training's l1: 0.570029	valid_1's l2: 1.38329	valid_1's l1: 0.77173
[700]	training's l2: 0.593713	training's l1: 0.53997	valid_1's l2: 1.35075	valid_1's l1: 0.76231
[800]	training's l2: 0.533976	training's l1: 0.513347	valid_1's l2: 1.32707	valid_1's l1: 0.754776
[900]	training's l2: 0.479999	training's l1: 0.488563	valid_1's l2: 1.30633	valid_1's

[INFO]2019-06-23 14:19:37,564:main:fold 2 valid -0.390424


mean of target. train:-0.2751820932632042, valid:-0.2615091031497235
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 1.62156	training's l1: 0.8665	valid_1's l2: 1.88921	valid_1's l1: 0.918228
[200]	training's l2: 1.23299	training's l1: 0.760527	valid_1's l2: 1.63918	valid_1's l1: 0.850409
[300]	training's l2: 1.02013	training's l1: 0.695486	valid_1's l2: 1.52326	valid_1's l1: 0.817482
[400]	training's l2: 0.872335	training's l1: 0.647555	valid_1's l2: 1.45583	valid_1's l1: 0.798354
[500]	training's l2: 0.758756	training's l1: 0.60664	valid_1's l2: 1.4032	valid_1's l1: 0.781895
[600]	training's l2: 0.668941	training's l1: 0.571971	valid_1's l2: 1.36577	valid_1's l1: 0.770547
[700]	training's l2: 0.594164	training's l1: 0.541061	valid_1's l2: 1.33428	valid_1's l1: 0.76075
[800]	training's l2: 0.531198	training's l1: 0.513633	valid_1's l2: 1.30871	valid_1's l1: 0.752781
[900]	training's l2: 0.478997	training's l1: 0.488841	valid_1's l2: 1.28993	valid_1'

[INFO]2019-06-23 14:26:07,689:main:fold 3 valid -0.389229
[INFO]2019-06-23 14:26:07,731:main:CV score: -0.389015
[INFO]2019-06-23 14:26:07,741:main:Starting train model(3JHH)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
23,1.094727,1.094727,1.529297,-0.365479,-0.365479,1.558594,1.558594,0.5,24.015625,0,3.71875,2.542969,0.220337,5.484375,0.765137,27,2.029297,1.094727,3.095703,7,3,0.572754,0.592285,0.994141,-1.474066,1.923828,2.326172,1.134766,0.720703,2.140625,-0.401855,0.841797,3.095703,0.552734,1.216797,1.094727,-1.448242,0.43042,0.660156,-1.882812,0.259521,2.728516,0.184204,1.072266,3.095703,0.552734,1.216797,2.542969,0.0,1.0,0.319092,-2.224609,0.125488,2.341797,1.765625,-0.77832,0.693848,0.530762,-2.011719,0.276367,-2.267578,2.728516,0.184204,1.072266,3.095703,2.542969,0.276367,-2.267578
24,1.094727,1.094727,1.529297,-0.365479,-0.365479,1.558594,1.558594,-1.0,24.015625,0,3.71875,3.095703,4.027344,5.558594,4.3e-05,27,2.029297,1.094727,3.095703,7,4,0.572754,0.592285,1.010742,-1.417203,1.923828,2.341797,1.134766,0.720703,2.140625,-0.95459,0.691895,3.095703,0.0,1.0,1.094727,-2.0,0.35376,0.660156,-2.435547,0.213257,2.486328,-0.608887,0.803223,3.095703,0.0,1.0,1.765625,-1.331055,0.570312,0.547363,-2.548828,0.176758,2.341797,1.765625,-1.331055,0.570312,0.530762,-2.564453,0.276367,-2.820312,2.728516,-0.368408,0.880859,3.095703,2.542969,0.276367,-2.820312
25,1.094727,1.094727,1.529297,-0.365479,-0.365723,1.558594,1.558594,0.5,24.015625,0,3.71875,2.542969,0.23645,5.4375,0.792969,27,2.029297,1.094727,3.095703,7,5,0.572754,0.592285,0.984863,-1.509512,1.923828,2.316406,1.134766,0.720703,2.140625,-0.401855,0.841797,3.095703,0.552734,1.216797,1.094727,-1.448242,0.43042,0.660156,-1.883789,0.259521,2.341797,-0.200684,0.920898,3.095703,0.552734,1.216797,1.765625,-0.77832,0.693848,0.573242,-1.969727,0.225464,2.341797,1.765625,-0.77832,0.693848,0.530762,-2.011719,0.276367,-2.267578,2.728516,0.184204,1.072266,3.095703,2.542969,0.276367,-2.267578
29,1.094727,1.094727,1.529297,-0.365479,-0.365479,1.558594,1.558594,-1.0,24.015625,0,3.71875,3.095703,1.139648,5.40625,3.037109,27,2.029297,1.094727,3.095703,6,3,0.598145,0.370605,0.772461,-0.922044,1.914062,2.316406,1.063477,0.667969,2.205078,-0.891602,0.711914,3.095703,0.0,1.0,1.094727,-2.0,0.35376,0.699707,-2.396484,0.226074,2.728516,-0.368408,0.880859,3.095703,0.0,1.0,2.542969,-0.552734,0.821289,0.319092,-2.777344,0.103027,2.341797,1.765625,-1.331055,0.570312,0.530762,-2.564453,0.276367,-2.820312,2.728516,-0.368408,0.880859,3.095703,2.542969,0.276367,-2.820312
30,1.094727,1.094727,1.529297,-0.365479,-0.365479,1.558594,1.558594,0.5,24.015625,0,3.71875,2.542969,0.220337,5.484375,0.765137,27,2.029297,1.094727,3.095703,6,4,0.598145,0.370605,0.788574,-0.886476,1.914062,2.332031,1.063477,0.667969,2.205078,-0.339111,0.866699,3.095703,0.552734,1.216797,1.094727,-1.448242,0.43042,0.699707,-1.84375,0.275146,2.486328,-0.056396,0.978027,3.095703,0.552734,1.216797,1.765625,-0.77832,0.693848,0.547363,-1.996094,0.215088,2.341797,1.765625,-0.77832,0.693848,0.530762,-2.011719,0.276367,-2.267578,2.728516,0.184204,1.072266,3.095703,2.542969,0.276367,-2.267578


23     3.25281
24    13.69130
25     3.25205
29    13.69240
30     3.25253
Name: scalar_coupling_constant, dtype: float64

mean of target. train:4.768076147147949, valid:4.776917754971605
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.213324	training's l1: 0.317231	valid_1's l2: 0.279341	valid_1's l1: 0.354478
[200]	training's l2: 0.152523	training's l1: 0.26932	valid_1's l2: 0.24419	valid_1's l1: 0.328641
[300]	training's l2: 0.116337	training's l1: 0.237314	valid_1's l2: 0.226522	valid_1's l1: 0.315313
[400]	training's l2: 0.0933257	training's l1: 0.214139	valid_1's l2: 0.217081	valid_1's l1: 0.307931
[500]	training's l2: 0.0759719	training's l1: 0.19431	valid_1's l2: 0.209774	valid_1's l1: 0.301893
[600]	training's l2: 0.0631724	training's l1: 0.178033	valid_1's l2: 0.204791	valid_1's l1: 0.297688
[700]	training's l2: 0.0533358	training's l1: 0.164434	valid_1's l2: 0.201351	valid_1's l1: 0.294902
[800]	training's l2: 0.0454585	training's l1: 0.152361	valid_1's l2: 0.198622	valid_1's l1: 0.292533
[900]	training's l2: 0.0391263	training's l1: 0.14181	valid_1's l2: 0.

[INFO]2019-06-23 14:30:21,214:main:fold 1 valid -1.432246


mean of target. train:4.774106235047843, valid:4.764857593452466
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.212283	training's l1: 0.317269	valid_1's l2: 0.278065	valid_1's l1: 0.353598
[200]	training's l2: 0.149543	training's l1: 0.268475	valid_1's l2: 0.242406	valid_1's l1: 0.327741
[300]	training's l2: 0.114366	training's l1: 0.236178	valid_1's l2: 0.224946	valid_1's l1: 0.313866
[400]	training's l2: 0.0909665	training's l1: 0.21214	valid_1's l2: 0.214503	valid_1's l1: 0.30545
[500]	training's l2: 0.07394	training's l1: 0.192869	valid_1's l2: 0.207363	valid_1's l1: 0.299895
[600]	training's l2: 0.0615383	training's l1: 0.176766	valid_1's l2: 0.202572	valid_1's l1: 0.295884
[700]	training's l2: 0.0520384	training's l1: 0.163428	valid_1's l2: 0.199354	valid_1's l1: 0.293245
[800]	training's l2: 0.0444496	training's l1: 0.151539	valid_1's l2: 0.196924	valid_1's l1: 0.291005
[900]	training's l2: 0.0378854	training's l1: 0.140659	valid_1's l2: 0.

[INFO]2019-06-23 14:34:31,024:main:fold 2 valid -1.460313


mean of target. train:4.770887689526883, valid:4.771294700843139
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.210439	training's l1: 0.315226	valid_1's l2: 0.275518	valid_1's l1: 0.35139
[200]	training's l2: 0.148511	training's l1: 0.266767	valid_1's l2: 0.240873	valid_1's l1: 0.325769
[300]	training's l2: 0.11364	training's l1: 0.235432	valid_1's l2: 0.224157	valid_1's l1: 0.31334
[400]	training's l2: 0.0908334	training's l1: 0.212059	valid_1's l2: 0.214336	valid_1's l1: 0.305683
[500]	training's l2: 0.0745595	training's l1: 0.192991	valid_1's l2: 0.207822	valid_1's l1: 0.300214
[600]	training's l2: 0.0622262	training's l1: 0.177195	valid_1's l2: 0.20302	valid_1's l1: 0.296235
[700]	training's l2: 0.0525843	training's l1: 0.163642	valid_1's l2: 0.19951	valid_1's l1: 0.293224
[800]	training's l2: 0.0447863	training's l1: 0.151653	valid_1's l2: 0.196788	valid_1's l1: 0.290867
[900]	training's l2: 0.0385223	training's l1: 0.141128	valid_1's l2: 0.1

[INFO]2019-06-23 14:38:37,970:main:fold 3 valid -1.437315
[INFO]2019-06-23 14:38:38,003:main:CV score: -1.443291
[INFO]2019-06-23 14:38:38,010:main:Starting train model(3JHC)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
58,1.095703,1.201172,1.455078,-0.361572,-1.0,1.486328,0.000355,-0.772949,24.015625,0,3.751953,3.21875,0.959473,9.398438,5.1e-05,15,2.109375,1.061523,3.71875,5,4,0.293213,0.797852,1.989258,-0.669307,1.858398,3.050781,1.34668,0.625488,1.993164,-1.225586,0.619141,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,0.77832,-2.439453,0.241821,2.679688,-0.539551,0.83252,3.21875,0.0,1.0,1.061523,-2.158203,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.158203,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124756,1.039062,3.71875,3.21875,0.249634,-2.96875
63,1.095703,1.201172,1.455078,-0.361572,-1.0,1.486328,0.000355,-0.163086,24.015625,0,3.751953,3.21875,0.314209,9.304688,0.744141,15,2.109375,1.061523,3.71875,4,4,0.263428,0.532715,1.724609,-0.446886,1.848633,3.041016,1.396484,0.449463,2.048828,-1.169922,0.636719,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,0.887207,-2.332031,0.275635,2.679688,-0.539551,0.83252,3.21875,4.5e-05,1.0,1.061523,-2.15625,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.15625,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124756,1.039062,3.71875,3.21875,0.249634,-2.96875
67,1.095703,1.201172,1.455078,-0.361328,-1.0,1.486328,0.000355,0.936035,24.015625,0,3.751953,3.21875,0.295654,9.242188,0.820801,15,2.109375,1.061523,3.71875,3,4,0.018112,0.093994,1.286133,-0.078889,1.464844,2.65625,1.330078,0.007313,2.142578,-1.076172,0.665527,3.21875,0.0,1.0,1.095703,-2.123047,0.340332,1.061523,-2.15625,0.329834,2.679688,-0.539062,0.83252,3.21875,0.000195,1.0,1.061523,-2.15625,0.329834,1.078125,-2.140625,0.335205,2.193359,1.061523,-2.15625,0.329834,0.972168,-2.246094,0.249634,-2.96875,3.34375,0.124939,1.039062,3.71875,3.21875,0.249634,-2.96875
68,1.061523,1.455078,1.201172,-1.0,-1.0,0.00046,0.000355,0.771484,24.015625,0,3.71875,3.71875,0.002514,13.820312,0.000414,15,2.109375,1.061523,3.71875,3,4,0.018112,0.093994,-1.370117,0.064206,1.464844,0.0,1.330078,0.007313,2.347656,-1.370117,0.631348,3.71875,0.0,1.0,1.061523,-2.65625,0.2854,1.330078,-2.386719,0.35791,1.750977,-1.966797,0.470947,3.71875,0.0,1.0,1.095703,-2.623047,0.294678,1.311523,-2.40625,0.352783,2.193359,1.061523,-2.65625,0.2854,0.972168,-2.746094,0.249634,-3.46875,3.34375,-0.374512,0.899414,3.71875,3.21875,0.249634,-3.46875
108,1.095703,1.53125,1.53125,-0.360352,-0.388916,1.56543,2.160156,0.503418,24.015625,0,4.160156,2.818359,0.066467,6.203125,1.672852,43,2.175781,1.094727,3.505859,7,8,0.619141,0.588867,1.115234,-1.120066,1.948242,2.474609,1.148438,0.80957,2.175781,-0.641602,0.772461,3.078125,0.260986,1.092773,1.095703,-1.72168,0.388916,0.689453,-2.128906,0.244629,2.09375,-0.724121,0.743164,3.505859,0.688477,1.244141,1.094727,-1.723633,0.388428,0.929199,-1.888672,0.329834,2.033203,1.094727,-1.723633,0.388428,0.78125,-2.037109,0.338867,-2.478516,3.046875,0.229492,1.081055,3.505859,2.818359,0.355713,-2.462891


58     4.55166
63     4.55410
67     4.55341
68     2.51865
108    2.51277
Name: scalar_coupling_constant, dtype: float64

mean of target. train:3.690534941137389, valid:3.684338890434173
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.80267	training's l1: 0.620912	valid_1's l2: 0.922152	valid_1's l1: 0.649438
[200]	training's l2: 0.623188	training's l1: 0.550608	valid_1's l2: 0.798362	valid_1's l1: 0.600096
[300]	training's l2: 0.517862	training's l1: 0.505605	valid_1's l2: 0.732902	valid_1's l1: 0.5735
[400]	training's l2: 0.449197	training's l1: 0.473107	valid_1's l2: 0.694021	valid_1's l1: 0.556613
[500]	training's l2: 0.395463	training's l1: 0.445467	valid_1's l2: 0.665065	valid_1's l1: 0.543465
[600]	training's l2: 0.352771	training's l1: 0.421477	valid_1's l2: 0.642316	valid_1's l1: 0.532288
[700]	training's l2: 0.318679	training's l1: 0.401699	valid_1's l2: 0.626282	valid_1's l1: 0.524706
[800]	training's l2: 0.290267	training's l1: 0.384344	valid_1's l2: 0.614013	valid_1's l1: 0.518741
[900]	training's l2: 0.265712	training's l1: 0.368689	valid_1's l2: 0.60277

[INFO]2019-06-23 14:47:19,351:main:fold 1 valid -0.668790


mean of target. train:3.6877465648081365, valid:3.6899156375542
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.802149	training's l1: 0.621754	valid_1's l2: 0.904973	valid_1's l1: 0.651988
[200]	training's l2: 0.629131	training's l1: 0.553047	valid_1's l2: 0.784876	valid_1's l1: 0.603505
[300]	training's l2: 0.528339	training's l1: 0.5092	valid_1's l2: 0.72387	valid_1's l1: 0.577494
[400]	training's l2: 0.456797	training's l1: 0.475856	valid_1's l2: 0.683905	valid_1's l1: 0.560199
[500]	training's l2: 0.401703	training's l1: 0.447698	valid_1's l2: 0.653454	valid_1's l1: 0.546218
[600]	training's l2: 0.359459	training's l1: 0.424717	valid_1's l2: 0.632333	valid_1's l1: 0.536327
[700]	training's l2: 0.323883	training's l1: 0.404645	valid_1's l2: 0.615879	valid_1's l1: 0.528777
[800]	training's l2: 0.293786	training's l1: 0.386458	valid_1's l2: 0.602329	valid_1's l1: 0.522219
[900]	training's l2: 0.266916	training's l1: 0.369897	valid_1's l2: 0.59041	

[INFO]2019-06-23 14:55:44,705:main:fold 2 valid -0.674944


mean of target. train:3.6871272639941557, valid:3.691154245950344
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.807423	training's l1: 0.623073	valid_1's l2: 0.911349	valid_1's l1: 0.651136
[200]	training's l2: 0.629329	training's l1: 0.553325	valid_1's l2: 0.787392	valid_1's l1: 0.60233
[300]	training's l2: 0.527759	training's l1: 0.509268	valid_1's l2: 0.724496	valid_1's l1: 0.576063
[400]	training's l2: 0.452597	training's l1: 0.474984	valid_1's l2: 0.681262	valid_1's l1: 0.557821
[500]	training's l2: 0.400584	training's l1: 0.446905	valid_1's l2: 0.652556	valid_1's l1: 0.543613
[600]	training's l2: 0.358391	training's l1: 0.424057	valid_1's l2: 0.631459	valid_1's l1: 0.533772
[700]	training's l2: 0.323427	training's l1: 0.404164	valid_1's l2: 0.615192	valid_1's l1: 0.5261
[800]	training's l2: 0.293312	training's l1: 0.38575	valid_1's l2: 0.60045	valid_1's l1: 0.518901
[900]	training's l2: 0.268331	training's l1: 0.369846	valid_1's l2: 0.59014	

[INFO]2019-06-23 15:04:27,700:main:fold 3 valid -0.676353
[INFO]2019-06-23 15:04:27,763:main:CV score: -0.673363
[INFO]2019-06-23 15:04:27,777:main:Starting train model(3JHN)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
73,1.09375,1.155273,1.457031,-0.347412,-1.0,1.493164,0.000536,0.966309,24.015625,0,3.705078,3.162109,0.969238,9.03125,4.4e-05,12,2.033203,1.09375,3.162109,5,3,0.293945,0.802734,1.947266,-0.701237,1.844727,2.990234,1.324219,0.62793,1.982422,-1.180664,0.626953,3.162109,0.0,1.0,1.09375,-2.068359,0.345703,0.755371,-2.40625,0.238892,3.162109,5.1e-05,1.0,3.162109,0.000125,1.0,3.162109,0.0,1.0,6.6e-05,-3.162109,2.1e-05,3.162109,3.162109,0.0,1.0,6.6e-05,-3.162109,6.6e-05,-3.162109,3.162109,5.1e-05,1.0,3.162109,3.162109,6.6e-05,-3.162109
78,1.09375,1.155273,1.457031,-0.347412,-1.0,1.494141,0.000536,-0.706055,24.015625,0,3.705078,3.162109,0.31543,8.9375,0.750977,12,2.033203,1.09375,3.162109,4,3,0.26416,0.541992,1.686523,-0.473586,1.834961,2.980469,1.373047,0.451172,2.033203,-1.128906,0.643066,3.162109,0.0,1.0,1.09375,-2.070312,0.345703,0.862305,-2.300781,0.272705,3.162109,-7.4e-05,1.0,3.162109,0.0,1.0,3.162109,-0.000125,1.0,6.6e-05,-3.162109,2.1e-05,3.162109,3.162109,-0.000125,1.0,6.6e-05,-3.162109,6.6e-05,-3.162109,3.162109,-7.4e-05,1.0,3.162109,3.162109,6.6e-05,-3.162109
82,1.09375,1.155273,1.457031,-0.347412,-1.0,1.494141,0.000536,-0.260254,24.015625,0,3.705078,3.162109,0.296631,8.875,0.826172,12,2.033203,1.09375,3.162109,3,3,0.017807,0.110962,1.255859,-0.096969,1.466797,2.611328,1.308594,0.007175,2.119141,-1.042969,0.67041,3.162109,0.0,1.0,1.09375,-2.068359,0.345703,1.035156,-2.126953,0.327148,3.162109,2.4e-05,1.0,3.162109,9.8e-05,1.0,3.162109,-2.7e-05,1.0,6.6e-05,-3.162109,2.1e-05,3.162109,3.162109,-2.7e-05,1.0,6.6e-05,-3.162109,6.6e-05,-3.162109,3.162109,2.4e-05,1.0,3.162109,3.162109,6.6e-05,-3.162109
213,1.09375,1.369141,1.522461,-0.377686,-0.416016,1.541992,1.895508,0.772949,24.015625,0,3.986328,2.626953,0.069885,6.566406,0.268311,19,1.985352,1.004883,3.357422,5,5,0.730469,0.90918,1.541992,-1.435578,1.911133,2.544922,1.157227,0.577637,1.892578,-0.734863,0.720215,2.626953,0.0,1.0,1.09375,-1.533203,0.41626,0.567871,-2.060547,0.216064,2.166016,-0.461426,0.824219,3.318359,0.69043,1.262695,1.004883,-1.623047,0.382324,1.087891,-1.540039,0.414062,2.166016,1.004883,-1.623047,0.382324,1.087891,-1.540039,0.365967,-2.261719,2.939453,0.312012,1.119141,3.318359,2.626953,0.350098,-2.277344
218,1.089844,1.369141,1.522461,-0.321289,-0.416016,1.571289,1.895508,-0.945312,24.015625,0,3.982422,3.318359,4.199219,6.472656,0.336914,19,1.985352,1.004883,3.357422,4,5,0.597656,0.658691,1.291992,-1.039927,1.785156,2.417969,1.168945,0.541992,2.080078,-1.238281,0.626953,3.318359,0.0,1.0,1.089844,-2.228516,0.328369,0.933105,-2.384766,0.28125,2.166016,-1.152344,0.652832,3.318359,0.0,1.0,1.004883,-2.3125,0.302734,1.087891,-2.230469,0.327881,2.166016,1.004883,-2.3125,0.302734,1.087891,-2.230469,0.365967,-2.951172,2.939453,-0.378418,0.885742,3.318359,2.626953,0.350098,-2.96875


73     0.880802
78     0.880957
82     0.880871
213   -0.052074
218    1.611320
Name: scalar_coupling_constant, dtype: float64

mean of target. train:0.9912910848093136, valid:0.9896074279816763
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0551495	training's l1: 0.159259	valid_1's l2: 0.126442	valid_1's l1: 0.223659
[200]	training's l2: 0.0285488	training's l1: 0.117585	valid_1's l2: 0.116586	valid_1's l1: 0.213315
[300]	training's l2: 0.0168556	training's l1: 0.0918731	valid_1's l2: 0.112181	valid_1's l1: 0.208573
[400]	training's l2: 0.0104127	training's l1: 0.0733993	valid_1's l2: 0.110046	valid_1's l1: 0.205992
[500]	training's l2: 0.00680883	training's l1: 0.0599887	valid_1's l2: 0.108702	valid_1's l1: 0.204523
[600]	training's l2: 0.00456102	training's l1: 0.0493334	valid_1's l2: 0.107723	valid_1's l1: 0.203291
[700]	training's l2: 0.00313301	training's l1: 0.0411926	valid_1's l2: 0.107062	valid_1's l1: 0.202582
[800]	training's l2: 0.00218021	training's l1: 0.0344459	valid_1's l2: 0.106656	valid_1's l1: 0.201985
[900]	training's l2: 0.00155405	training's l1: 0.029

[INFO]2019-06-23 15:06:31,902:main:fold 1 valid -2.025204


mean of target. train:0.9900282797472645, valid:0.9921330153410689
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0562032	training's l1: 0.160686	valid_1's l2: 0.128165	valid_1's l1: 0.224452
[200]	training's l2: 0.0294161	training's l1: 0.119047	valid_1's l2: 0.117791	valid_1's l1: 0.213573
[300]	training's l2: 0.0172974	training's l1: 0.0929165	valid_1's l2: 0.113242	valid_1's l1: 0.208625
[400]	training's l2: 0.0108901	training's l1: 0.0745675	valid_1's l2: 0.110742	valid_1's l1: 0.205845
[500]	training's l2: 0.00706258	training's l1: 0.0608131	valid_1's l2: 0.109565	valid_1's l1: 0.204607
[600]	training's l2: 0.00469103	training's l1: 0.0500431	valid_1's l2: 0.108536	valid_1's l1: 0.20329
[700]	training's l2: 0.0032052	training's l1: 0.0416625	valid_1's l2: 0.107879	valid_1's l1: 0.202375
[800]	training's l2: 0.00224895	training's l1: 0.0349147	valid_1's l2: 0.107596	valid_1's l1: 0.201888
[900]	training's l2: 0.00160023	training's l1: 0.02946

[INFO]2019-06-23 15:08:32,454:main:fold 2 valid -1.962074


mean of target. train:0.9908702216613917, valid:0.990449139099707
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0565225	training's l1: 0.160498	valid_1's l2: 0.132582	valid_1's l1: 0.226004
[200]	training's l2: 0.0294651	training's l1: 0.118745	valid_1's l2: 0.121875	valid_1's l1: 0.214984
[300]	training's l2: 0.0168207	training's l1: 0.092007	valid_1's l2: 0.117257	valid_1's l1: 0.210185
[400]	training's l2: 0.0105486	training's l1: 0.0736694	valid_1's l2: 0.114895	valid_1's l1: 0.207286
[500]	training's l2: 0.006857	training's l1: 0.0599458	valid_1's l2: 0.113551	valid_1's l1: 0.205575
[600]	training's l2: 0.00459645	training's l1: 0.0494354	valid_1's l2: 0.11267	valid_1's l1: 0.204527
[700]	training's l2: 0.00312633	training's l1: 0.0410518	valid_1's l2: 0.112067	valid_1's l1: 0.203818
[800]	training's l2: 0.00218106	training's l1: 0.0344598	valid_1's l2: 0.111657	valid_1's l1: 0.203254
[900]	training's l2: 0.00154078	training's l1: 0.029002	v

[INFO]2019-06-23 15:10:32,724:main:fold 3 valid -1.939617
[INFO]2019-06-23 15:10:32,739:main:CV score: -1.975631


In [22]:
for _, df_score in score_dict.items():
    display(df_score.mean()[0])

0.4621677888713554

-1.1513735210142848

-0.05122232298527627

-1.3520829281541118

-0.3890146201593507

-1.4432911222931148

-0.6733625027669011

-1.975631348540895

### Check training result

In [23]:
sns.distplot(df_pred['proba'])

NameError: ignored

In [0]:
def feat_importance(_models, _X, _imp_type='gain'):
    df_imp = pd.DataFrame(index=_X.columns)
    for i, model in enumerate(_models):
        df_imp[i] = model.booster_.feature_importance(importance_type=_imp_type)

    df_imp = df_imp.apply(lambda x: x/sum(x))
    df_imp['imp_mean'] = df_imp[list(range(len(models)))].mean(axis=1)
    df_imp['imp_std'] = df_imp[list(range(len(models)))].std(axis=1)
    sorted_imp = df_imp.sort_values(by='imp_mean', ascending=False)
    return sorted_imp

In [0]:
imp = feat_importance(models, X, _imp_type='gain')
imp.head(100)

## Predict

In [14]:
df_test = pd.read_csv(TEST_PATH)
df_test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [17]:
df_strct = pd.read_csv(INPUT + 'structures.csv')
df_strct.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [0]:
models = joblib.load(MODEL_PATH)

df_submit = df_test[['id']].copy()
df_test = preprocess(df_test, df_strct, mode='predict')
X = drop_col(df_test)
display(X.head())

In [0]:
X.to_csv('test_prepro.csv', index=False)

In [0]:
y_pred = oof_predict(models, X)

In [0]:
def predict_each_type(df, strct):
    df = df.head(10000)
    model_dict = joblib.load(MODEL_PATH)
    
    df_submit = df[['id']].copy()
    df = preprocess(df, strct, mode='predict')
    df = drop_col(df)    
    
    s_type = df['type']
    coupling_types = s_type.unique()
    print(coupling_types)
    for coup_type in coupling_types:
        models = model_dict[coup_type]
        
        get_logger().info('Starting train model(%s)' % coup_type)
        is_the_type = (s_type == coup_type)
        df_type = df[is_the_type]
                      
        X = df_type
        X = drop_uneffect_feature(X)
        
        display(X.head())  
        y_pred = oof_predict(models, X)        
        
        df_submit.loc[is_the_type, 'scalar_coupling_constant'] = y_pred
    
    display(df_submit.head())
    print((df_submit[TARGET].isnull()).sum())
    return df_submit

In [24]:
predict_each_type(df_test, df_strct)

[INFO]2019-06-23 23:40:08,716:main:Start preprocess()
[INFO]2019-06-23 23:40:08,717:main:load df_2jsim
[INFO]2019-06-23 23:40:10,351:main:load df_3jsim
[INFO]2019-06-23 23:40:16,659:main:loading encoder from ./analysis/mole/data/preprocess/le.pkl
[INFO]2019-06-23 23:40:16,682:main:['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', '2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', '3j_norm_vec_02', '3j_norm_vec_13', '3j_norm_vec_23', '3j_cos_023', '3j_cos_231', '3j_area_023', '3j_area_231', '3j_dihedral', '3j_atom_center_weight', '3j_atom_center', '3j_sum_norm_vec', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z', 'type_0', 'type_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_

Starting Feature Engineering...


[INFO]2019-06-23 23:40:16,917:main:Finish preprocess()


Mem. usage decreased to  2.80 Mb (59.6% reduction)
[2 0 6 5 3 1 7 4]


KeyError: ignored

In [0]:
df_submit['scalar_coupling_constant'] = y_pred
display(df_submit.head())
df_submit.to_csv('submission.csv', index=False)

In [0]:
df_submit.shape