In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
%cd "/gdrive/My Drive"

/gdrive/My Drive


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import math

from tqdm import tqdm
import joblib
import gc

## config

In [0]:
INPUT = './analysis/mole/data/raw/'
TRAIN_PATH = INPUT + 'train.csv'
TEST_PATH = INPUT + 'test.csv'
PREPROCESS = './analysis/mole/data/preprocess/'

MID_MODEL_PATH = PREPROCESS + 'middle_model.pkl'
MODEL_PATH = PREPROCESS + 'model.pkl'
ENCODER_PATH = PREPROCESS + 'le.pkl'

RUN_PLOT = True
TARGET = 'scalar_coupling_constant'
N_FOLDS = 3

atom_weight = {'H': 1.008, 'C': 12.01, 'N': 14.01, 'O':16.00}

## logging

In [0]:
import logging
import logging.handlers


def create_logger(log_file_name):
    logger_ = logging.getLogger('main')
    logger_.setLevel(logging.DEBUG)
    fh = logging.handlers.RotatingFileHandler(
        log_file_name, maxBytes=100000, backupCount=8)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger_.addHandler(fh)
    logger_.addHandler(ch)


def get_logger():
    return logging.getLogger('main')

In [0]:
create_logger('mole.log')

## util

In [0]:
def onehot(_df):
    cat_names = [name for name, col in _df.iteritems() if col.dtype == 'O']
    df_cat = pd.get_dummies(_df[cat_names])
    _df = pd.concat([_df, df_cat], axis=1).drop(cat_names, axis=1)
    return _df

def label_encode(df):
    cat_names = [name for name, col in df.iteritems() if col.dtype == 'O']    
    for cat_name in cat_names:
        print(cat_name)
        le = LabelEncoder()
        le.fit(df[cat_name].values)
        df[cat_name] = le.transform(df[cat_name].values)
    return df

class Encoder:
    def __init__(self):        
        self.encoders = {}
    
    def fit(self, df, cat_names):
        for cat_name in cat_names:
            le = LabelEncoder()
            le.fit(df[cat_name].values)
            self.encoders[cat_name] = le        
    
    def transform(self, df):
        for cat_name in self.encoders.keys():            
            df[cat_name] = self.encoders[cat_name].transform(df[cat_name].values)
            
        return df


def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    
    return df

## Preprocess

In [0]:
def map_atom_info(df, strct, atom_idx):
    df = pd.merge(df, strct, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

def calc_dist(df):
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values

    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    df['dist_x'] = (df['x_0'] - df['x_1']) ** 2
    df['dist_y'] = (df['y_0'] - df['y_1']) ** 2
    df['dist_z'] = (df['z_0'] - df['z_1']) ** 2

    return df

def divide_type(df):    
    df['type_0'] = df['type'].apply(lambda x: x[0])
    df['type_1'] = df['type'].apply(lambda x: x[1:])
    return df

In [0]:
def feature_engineering(df):
    print("Starting Feature Engineering...")
    g = df.groupby('molecule_name')
    g1 = df.groupby(['molecule_name', 'atom_index_0'])
    g2 = df.groupby(['molecule_name', 'atom_index_1'])
    g3 = df.groupby(['molecule_name', 'atom_1'])
    g4 = df.groupby(['molecule_name', 'type_0'])
    g5 = df.groupby(['molecule_name', 'type'])
    
    df['type_0'] = df['type'].apply(lambda x: x[0])
    df['molecule_couples'] = g['id'].transform('count')
    df['molecule_dist_mean'] = g['dist'].transform('mean')
    df['molecule_dist_min'] = g['dist'].transform('min')
    df['molecule_dist_max'] = g['dist'].transform('max')
    df['atom_0_couples_count'] = g1['id'].transform('count')
    df['atom_1_couples_count'] = g2['id'].transform('count')
    df[f'molecule_atom_index_0_x_1_std'] = g1['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = g1['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = g1['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = g1['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = g1['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = g1['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = g1['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = g1['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = g1['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = g2['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = g2['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = g2['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = g2['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = g3['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = g3['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = g3['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = g4['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = g5['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = g5['dist'].transform('max')
    df[f'molecule_type_dist_min'] = g5['dist'].transform('min')
    df[f'molecule_type_dist_std'] = g5['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    # TODO: back
    # df = reduce_mem_usage(df)
    
    return df

In [0]:
def add_2j_center_atom(df):    
    get_logger().info('load df_2jsim')
    
    df_2j = joblib.load(PREPROCESS + 'df_2jsim.pkl')  
    
    # atom weight
    df_2j['2j_atom_center_weight'] = df_2j['2j_atom_center'].replace(atom_weight)
    
    # sum of norm
    df_2j['2j_sum_norm_vec'] = df_2j['2j_norm_vec_02'] + df_2j['2j_norm_vec_12']
    
    df = df.merge(df_2j, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')    
    
    # replace missing vlaue to 'nan' for LabelEncoder
    df.loc[df['2j_atom_center'].isnull(), '2j_atom_center'] = 'nan'
    
    return df

def str_sort(s):
    """
    Parameters
    ----------
    x: str   
    """
    # print(s)
    if not isinstance(s, str):
        return s
    elif s[0] > s[1]:
        return s[1] + s[0]
    else:
        return s

def add_3j_center_atom(df):    
    get_logger().info('load df_3jsim')
    
    df_3j = joblib.load(PREPROCESS + 'df_3jsim.pkl')
    
    # atom weight
    s_atom_w0 = df_3j['3j_atom_center_0'].replace(atom_weight)
    s_atom_w1 = df_3j['3j_atom_center_1'].replace(atom_weight)
    df_3j['3j_atom_center_weight'] = s_atom_w0 + s_atom_w1

    # concatenate atom string 'C' + 'C' - > 'CC'
    tmp = df_3j['3j_atom_center_0'] + df_3j['3j_atom_center_1']
    df_3j['3j_atom_center'] = tmp.transform(str_sort)    
    df_3j.drop(['3j_atom_center_0', '3j_atom_center_1'], axis=1, inplace=True)
    
    # sum norm_vec
    df_3j['3j_sum_norm_vec'] = df_3j['3j_norm_vec_02'] + df_3j['3j_norm_vec_13'] + df_3j['3j_norm_vec_23']
    
    df = df.merge(df_3j, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')    
    
    # replace missing vlaue to 'nan' for LabelEncoder
    df.loc[df['3j_atom_center'].isnull(), '3j_atom_center'] = 'nan'    
    
    return df

In [0]:
def drop_col(df_org):
    df = df_org.copy()
    to_drop = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1',
               'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', #'dist_x', 'dist_y', 'dist_z',
               'atom_0', 'atom_1'
              ]
    df = df.drop(to_drop, axis=1)
    
    return df

In [0]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

def oof_train(_X, _y, _types):
    """
    Parameters
    ----------
    _X: pd.DataFrame, shape [n_samples, n_features]
    _y: array-like object, shape [n_samples]
    _types: array-like object, shsape [n_samples]
        array of `type` (e.g. 2JHC, 1JHC, 3JHH, etc.)
    """
    # TODO: divide data to training and validation about molecular
    
    models = []
    df_scores = pd.DataFrame(columns=['valid_score'])
    df_pred = pd.DataFrame(index=_X.index).reset_index(drop=True)

    fold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
    for n_fold, (train_idx, valid_idx) in enumerate(fold.split(_X, _types)):
        # prepare data
        X_train, y_train = _X.iloc[train_idx], _y.iloc[train_idx]
        X_valid, y_valid = _X.iloc[valid_idx], _y.iloc[valid_idx]
        print('mean of target. train:{}, valid:{}'.format(y_train.mean(), y_valid.mean()))

        # generate model
        model = gen_model(_X)
        
        # train
        model.fit(X_train, y_train, eval_metric='mae',
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=100,
                  early_stopping_rounds=100
                  )
        
        # validate
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
        
        types_valid = _types.iloc[valid_idx]
        valid_score = group_mean_log_mae(y_valid, y_pred, types_valid)
        get_logger().info('fold %d valid %f' % (n_fold+1, valid_score))
        
        df_scores = df_scores.append(pd.Series([valid_score], index=['valid_score']), ignore_index=True)
        df_pred.loc[valid_idx, 'proba'] = y_pred
        models.append(model)
        
        # TODO: back
        # break
    get_logger().info('CV score: %f' % df_scores.mean()[0])
    
    return models, df_scores, df_pred

def oof_predict(_models, _X):
    get_logger().info('Start oof_predict')
    y_pred = np.zeros(_X.shape[0])
        
    for i, model in enumerate(_models):
        get_logger().info('prediction: %d' % i)
        y_pred += model.predict(_X) / len(_models)
    
    get_logger().info('Finish oof_predict')
    return y_pred


def gen_model(_X):
    n_features = _X.shape[1]
    colsample_rate = max(0.7, math.sqrt(n_features)/n_features)
    
    _model = lgb.LGBMRegressor(
        learning_rate=0.2,
        n_estimators=1500,
        num_leaves=128,
        # min_child_weight=15, # good value: 0, 5, 15, 300
        min_child_samples=80,
        subsample=0.7,
        colsample_bytree=1, #colsample_rate,
        objective='regression',
        reg_lambda=0.1,
        reg_alpha=0.1,
        seed=2019
        )
    return _model


In [0]:
def preprocess(df, strct, mode, s_type=None):
    """
    Parameters
    ----------
    df: pd.DataFrame
        dataframe of train.csv or test.csv
    strct: pd.DataFrame
        dataframe of structures.csv
    mode: str
        'train' or 'predict'
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        If mode is 'train', the s_type must be specified.
    """
    get_logger().info('Start preprocess()')
    df = add_2j_center_atom(df)
    df = add_3j_center_atom(df)
    df = map_atom_info(df, strct, 0)
    df = map_atom_info(df, strct, 1)
    df = calc_dist(df)
    df = divide_type(df)
    df = feature_engineering(df)
    
    # encode
    if mode == 'train':
        enc = Encoder()
        enc.fit(df, ['type', 'type_0', 'type_1', 
                     '2j_atom_center', '3j_atom_center'])
        joblib.dump(enc, ENCODER_PATH)
    elif mode == 'predict':
        get_logger().info('loading encoder from %s' % ENCODER_PATH)
        enc = joblib.load(ENCODER_PATH)
    df = enc.transform(df)
        
    use_features = [col for col in df.columns if col not in [TARGET]]
    get_logger().info(use_features)
    df[use_features] = reduce_mem_usage(df[use_features])
    # TODO: back
    # df = add_scc_feature(df, 'fc', mode=mode, s_type=s_type)
    
    get_logger().info('Finish preprocess()')
    return df

In [0]:
def drop_uneffect_feature(df):
    """
    Drop uneffective features from dataframe
    """
    for col in df.columns:
        if len(df[col].unique()) == 1:
            df.drop(col, axis=1, inplace=True)
    return df

### fermi constant

In [0]:
class CNTR:
    """Model to predict fc/sd/pso/dso columns"""
    
    def __init__(self, y_col):
        self.y_col = y_col
        
    def train(self, df_org, scc, s_type):
        """
        Parameters
        ----------
        s_type: pd.Series
            'type' column (e.g. 1JHC, 2JHH)
        """
        df = df_org.copy()
        # Merge
        key_cols = ['molecule_name', 'atom_index_0', 'atom_index_1']
        df = df.merge(scc[key_cols + [self.y_col]], how='left', on=key_cols)
        
        # drop unnecessary cols        
        df = drop_col(df)        
        
        y = df[self.y_col].copy()        
        df.drop([TARGET, self.y_col], axis=1, inplace=True)
        X = df
        
        display(X.head())
        display(y.head())
        models, scores, y_pred = oof_train(X, y, s_type)
        
        # save model
        joblib.dump(models, MID_MODEL_PATH)
        
        self.models_ = models
        self.scores_ = scores
        self.y_pred_ = y_pred
        
    def predict(self, df_org):    
        y_pred = np.zeros(df_org.shape[0])
        
        X = df_org.copy()
        X = drop_col(X)
        
        display(X.head())
        # X = self.preprocess(df_org)
        for model in self.models_:            
            y_pred += model.predict(X) / len(models)
        
        return y_pred
    
    def load_model(self):
        # load pkl by joblib
        self.models_ = joblib.load(MID_MODEL_PATH)

In [0]:
def add_scc_feature(df, cntr_name, mode, s_type=None):
    """
    Parameters
    ----------
    cntr_name: str
        'fc', 'sd', 'pso' or 'dso'
    mode: str
        'train' or 'predict'
    s_type: None or pd.Series
        'type' column (e.g. 1JHC, 2JHH).
        If mode is 'train', the s_type must be specified.
    """
    add_feature = '%s_pred' % cntr_name
    cntr = CNTR(cntr_name)
    if mode == 'train': 
        assert s_type is not None, 's_type must be specified.'
        
        get_logger().info('start loading scalar_coupling_contributions')
        scc = pd.read_csv(INPUT + 'scalar_coupling_contributions.csv')
        get_logger().info('finished loading scalar_coupling_contributions')
        
        # train contribution(fc/sd/pso/dso)
        cntr.train(df, scc, s_type)
    
        display(cntr.y_pred_.head())
        df[add_feature] = cntr.y_pred_
    elif mode == 'predict':
        cntr.load_model()
        y_pred = cntr.predict(df)
        df[add_feature] = y_pred
    
    return df

## Train

In [0]:
df_train = pd.read_csv(TRAIN_PATH)
df_strct = pd.read_csv(INPUT + 'structures.csv')

In [0]:
def train_single_model(df, strct):
    # TODO: back
    df = df.head(10000)

    s_type = df['type'].copy()

    df = preprocess(df, strct, mode='train', s_type=s_type)
    df = drop_col(df)

    y = df[TARGET].copy()
    df.drop([TARGET], axis=1, inplace=True)
    X = df
    
    display(X.head())
    display(y.head())
    models, df_scores, df_pred = oof_train(X, y, s_type)

    joblib.dump(models, MODEL_PATH)
    
    return models, df_scores, df_pred

In [0]:
def train_models_each_type(df, strct):
    # TODO:back
    # df = df.head(10000)
    
    s_type = df['type'].copy()
    
    df = preprocess(df, strct, mode='train', s_type=s_type)
    df = drop_col(df)
    
    model_dict = {}
    score_dict = {}
    pred_dict = {}
    coupling_types = s_type.unique()
    for coup_type in coupling_types:
        get_logger().info('Starting train model(%s)' % coup_type)
        is_the_type = (s_type == coup_type)
        df_type = df[is_the_type]
        

        
        y = df_type[TARGET]
        df_type.drop([TARGET], axis=1, inplace=True)
        X = df_type
        X = drop_uneffect_feature(X)
        
        display(X.head())
        display(y.head())
        models, df_scores, df_pred = oof_train(X, y, _types=s_type[is_the_type].reset_index(drop=True))
        
        model_dict[coup_type] = models
        score_dict[coup_type] = df_scores
        pred_dict[coup_type] = df_pred
        
    joblib.dump(model_dict, MODEL_PATH)
    
    return model_dict, score_dict, pred_dict

In [0]:
# models, df_scores, df_pred = train_single_model(df_train, df_strct)
model_dict, score_dict, pred_dict = train_models_each_type(df_train, df_strct)

[INFO]2019-06-24 13:10:01,422:main:Start preprocess()
[INFO]2019-06-24 13:10:01,425:main:load df_2jsim
[INFO]2019-06-24 13:10:06,961:main:load df_3jsim


Starting Feature Engineering...


[INFO]2019-06-24 13:10:54,888:main:['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', '2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', '3j_norm_vec_02', '3j_norm_vec_13', '3j_norm_vec_23', '3j_cos_023', '3j_cos_231', '3j_area_023', '3j_area_231', '3j_dihedral', '3j_atom_center_weight', '3j_atom_center', '3j_sum_norm_vec', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z', 'type_0', 'type_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean', 'molecule_atom_index_

In [0]:
for _, df_score in score_dict.items():
    display(df_score.mean()[0])

### Check training result

In [0]:
sns.distplot(df_pred['proba'])

In [0]:
def feat_importance(_models, _X, _imp_type='gain'):
    df_imp = pd.DataFrame(index=_X.columns)
    for i, model in enumerate(_models):
        df_imp[i] = model.booster_.feature_importance(importance_type=_imp_type)

    df_imp = df_imp.apply(lambda x: x/sum(x))
    df_imp['imp_mean'] = df_imp[list(range(len(models)))].mean(axis=1)
    df_imp['imp_std'] = df_imp[list(range(len(models)))].std(axis=1)
    sorted_imp = df_imp.sort_values(by='imp_mean', ascending=False)
    return sorted_imp

In [0]:
imp = feat_importance(models, X, _imp_type='gain')
imp.head(100)

## Predict

In [0]:
df_test = pd.read_csv(TEST_PATH)
df_strct = pd.read_csv(INPUT + 'structures.csv')

In [0]:
def predict_single(df, strct):
    models = joblib.load(MODEL_PATH)

    df_submit = df[['id']].copy()
    df = preprocess(df, strct, mode='predict')
    X = drop_col(df)
    display(X.head())
    
    X.to_csv('test_prepro.csv', index=False)
    
    y_pred = oof_predict(models, X)
    df_submit['scalar_coupling_constant'] = y_pred
    
    return df_submit

In [0]:
def predict_each_type(df, strct):
    # df = df.head(10000)
    model_dict = joblib.load(MODEL_PATH)
    
    s_type = df['type'].copy()
    df_submit = df[['id']].copy()
    
    df = preprocess(df, strct, mode='predict')
    df = drop_col(df)    
    
    coupling_types = s_type.unique()
    print(coupling_types)
    for coup_type in coupling_types:
        
        models = model_dict[coup_type]
        
        get_logger().info('Starting predict target(%s)' % coup_type)
        is_the_type = (s_type == coup_type)
        df_type = df[is_the_type]
                      
        X = df_type
        X = drop_uneffect_feature(X)        
        
        display(X.head())  
        y_pred = oof_predict(models, X)        
        
        df_submit.loc[is_the_type, 'scalar_coupling_constant'] = y_pred
    
    display(df_submit.head())
    print((df_submit[TARGET].isnull()).sum())
    return df_submit

In [22]:
df_submit = predict_each_type(df_test, df_strct)

[INFO]2019-06-24 13:31:02,218:main:Start preprocess()
[INFO]2019-06-24 13:31:02,221:main:load df_2jsim
[INFO]2019-06-24 13:31:07,030:main:load df_3jsim


Starting Feature Engineering...


[INFO]2019-06-24 13:31:33,444:main:loading encoder from ./analysis/mole/data/preprocess/le.pkl
[INFO]2019-06-24 13:31:36,559:main:['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', '2j_atom_center', '2j_area_021', '2j_norm_vec_02', '2j_norm_vec_12', '2j_cos', '2j_atom_center_weight', '2j_sum_norm_vec', '3j_norm_vec_02', '3j_norm_vec_13', '3j_norm_vec_23', '3j_cos_023', '3j_cos_231', '3j_area_023', '3j_area_231', '3j_dihedral', '3j_atom_center_weight', '3j_atom_center', '3j_sum_norm_vec', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y', 'dist_z', 'type_0', 'type_1', 'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count', 'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_

Mem. usage decreased to 480.28 Mb (72.4% reduction)


[INFO]2019-06-24 13:32:22,744:main:Finish preprocess()
[INFO]2019-06-24 13:32:23,776:main:Starting predict target(2JHC)


['2JHC' '1JHC' '3JHH' '3JHC' '2JHH' '1JHN' '3JHN' '2JHN']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
0,0,0.0,1.0625,1.199219,-1.0,12.007812,2.261719,2.261719,5.113281,0.0,0.0,5,1.994141,1.0625,3.324219,3,2,1.130859,0.0,0.0,,0.0,0.0,0.0,0.0,2.214844,-0.045654,0.97998,3.324219,1.0625,1.469727,1.0625,-1.199219,0.469727,1.130859,-1.129883,0.500488,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,0.847656,-1.413086,0.375,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336,0.0,-2.261719,2.261719,0.0,1.0,2.261719,2.261719,0.0,-2.261719
4,0,0.0,1.0625,1.199219,-1.0,12.007812,2.261719,2.261719,5.113281,0.0,0.0,5,1.994141,1.0625,3.324219,2,2,0.847656,0.0,0.0,,0.0,0.0,0.0,0.0,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,0.847656,-1.413086,0.375,1.662109,-0.599609,0.734863,2.261719,0.0,1.0,1.0625,-1.199219,0.469727,0.847656,-1.413086,0.375,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336,0.0,-2.261719,2.261719,0.0,1.0,2.261719,2.261719,0.0,-2.261719
24,0,1.44043,1.084961,1.506836,-0.47168,12.007812,2.591797,2.232422,2.587891,1.533203,0.867676,33,2.201172,1.083984,3.115234,8,6,0.994141,0.61084,-0.168213,0.783957,2.027344,1.248047,0.908691,0.756348,2.332031,0.099487,1.044922,3.115234,0.882812,1.395508,1.084961,-1.148438,0.485596,0.67041,-1.5625,0.300293,1.850586,-0.383057,0.828613,2.232422,0.0,1.0,1.083984,-1.148438,0.485596,0.593262,-1.639648,0.265625,1.850586,1.083984,-1.148438,0.485596,0.557129,-1.675781,0.171631,-2.0625,2.232422,6.1e-05,1.0,2.234375,2.232422,0.000143,-2.232422
25,0,1.44043,1.084961,1.506836,-0.471924,12.007812,2.591797,2.234375,0.098633,4.039062,0.852539,33,2.201172,1.083984,3.115234,8,6,0.994141,0.61084,0.603027,79.604134,2.027344,2.019531,0.908691,0.756348,2.332031,0.099304,1.044922,3.115234,0.882812,1.395508,1.084961,-1.149414,0.485596,0.67041,-1.563477,0.300293,1.850586,-0.383057,0.828613,2.234375,0.0,1.0,1.083984,-1.149414,0.485596,0.593262,-1.640625,0.265625,1.850586,1.083984,-1.149414,0.485596,0.557129,-1.675781,0.171631,-2.0625,2.232422,-0.000153,1.0,2.234375,2.232422,0.000143,-2.232422
32,0,1.44043,1.084961,1.506836,-0.47168,12.007812,2.591797,2.232422,2.642578,1.557617,0.787109,33,2.201172,1.083984,3.115234,7,6,1.007812,0.408203,-0.370605,0.524201,1.514648,0.735352,0.762207,0.742676,2.40625,0.172852,1.077148,3.115234,0.882812,1.395508,1.084961,-1.148438,0.485596,0.688477,-1.544922,0.30835,1.850586,-0.383057,0.828613,2.232422,3e-06,1.0,1.083984,-1.148438,0.485596,0.593262,-1.639648,0.265625,1.850586,1.083984,-1.148438,0.485596,0.557129,-1.675781,0.171631,-2.0625,2.232422,6.4e-05,1.0,2.234375,2.232422,0.000143,-2.232422


[INFO]2019-06-24 13:32:27,747:main:Start oof_predict
[INFO]2019-06-24 13:32:27,750:main:prediction: 0
[INFO]2019-06-24 13:33:25,275:main:prediction: 1
[INFO]2019-06-24 13:34:21,778:main:prediction: 2
[INFO]2019-06-24 13:35:19,781:main:Finish oof_predict
[INFO]2019-06-24 13:35:19,843:main:Starting predict target(1JHC)


Unnamed: 0,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
1,1.0625,1.12793,0.0,0.0,5,1.994141,1.0625,3.324219,3,2,1.130859,0.0,0.0,,0.0,0.0,0.0,0.0,2.214844,1.15332,2.085938,3.324219,2.261719,3.128906,1.0625,1.130859,0.069153,1.06543,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,0.847656,-0.214233,0.79834,1.662109,1.0625,0.0,1.0,0.692383,-0.369873,0.0,-1.0625,1.0625,0.0,1.0,1.0625,1.0625,0.0,-1.0625
3,1.0625,1.12793,0.0,0.0,5,1.994141,1.0625,3.324219,2,2,0.847656,0.0,0.0,,0.0,0.0,0.0,0.0,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.847656,-0.214233,0.79834,1.662109,0.599609,1.564453,2.261719,1.199219,2.128906,1.0625,0.0,1.0,0.847656,-0.214233,0.79834,1.662109,1.0625,0.0,1.0,0.692383,-0.369873,0.0,-1.0625,1.0625,0.0,1.0,1.0625,1.0625,0.0,-1.0625
5,1.102539,1.041016,0.174561,1e-06,18,1.910156,1.092773,3.261719,4,6,0.558594,1.088867,-0.303467,0.782001,1.792969,0.401123,1.108398,0.920898,1.828125,0.725098,1.658203,2.640625,1.538086,2.394531,1.102539,0.629883,-0.472412,0.571289,1.973633,0.871094,1.790039,3.261719,2.160156,2.958984,1.092773,-0.009476,0.991211,0.984375,-0.118042,0.893066,1.973633,1.092773,-0.009476,0.991211,0.938477,-0.163818,0.004894,-1.097656,1.099609,-0.003159,0.99707,1.102539,1.092773,0.004894,-1.097656
9,1.102539,0.283203,0.160767,0.770996,18,1.910156,1.092773,3.261719,3,6,0.585449,0.854004,-0.538574,0.613333,1.722656,0.330566,1.229492,1.014648,1.842773,0.740234,1.670898,2.640625,1.537109,2.394531,1.102539,0.770508,-0.332031,0.69873,1.973633,0.871094,1.790039,3.261719,2.160156,2.958984,1.092773,-0.009476,0.991211,0.984375,-0.118042,0.893066,1.973633,1.092773,-0.009476,0.991211,0.938477,-0.163818,0.004894,-1.097656,1.099609,-0.003159,0.99707,1.102539,1.092773,0.004894,-1.097656
12,1.092773,0.265381,0.109253,0.819824,18,1.910156,1.092773,3.261719,2,6,0.46167,0.419678,-0.972656,0.301317,1.392578,0.0,1.375977,0.791504,2.177734,1.084961,1.993164,3.261719,2.169922,2.986328,1.092773,1.53418,0.441406,1.404297,1.973633,0.880371,1.805664,3.261719,2.169922,2.986328,1.092773,0.0,1.0,0.984375,-0.108582,0.900879,1.973633,1.092773,0.0,1.0,0.938477,-0.154419,0.004894,-1.087891,1.099609,0.006317,1.005859,1.102539,1.092773,0.004894,-1.087891


[INFO]2019-06-24 13:35:24,247:main:Start oof_predict
[INFO]2019-06-24 13:35:24,249:main:prediction: 0
[INFO]2019-06-24 13:36:03,120:main:prediction: 1
[INFO]2019-06-24 13:36:40,335:main:prediction: 2
[INFO]2019-06-24 13:37:17,775:main:Finish oof_predict
[INFO]2019-06-24 13:37:17,820:main:Starting predict target(3JHH)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
2,1.0625,1.0625,1.199219,-1.0,-1.0,0.0,0.0,,24.015625,0,3.324219,3.324219,11.046875,0.0,0.0,5,1.994141,1.0625,3.324219,3,1,1.130859,0.0,0.0,,0.0,0.0,0.0,0.0,2.214844,-1.107422,0.666504,3.324219,0.0,1.0,1.0625,-2.261719,0.31958,1.130859,-2.191406,0.340332,3.324219,0.0,1.0,3.324219,0.0,1.0,3.324219,0.0,1.0,,,,3.324219,3.324219,0.0,1.0,,,,,3.324219,0.0,1.0,3.324219,3.324219,,
27,1.084961,1.083984,1.506836,-0.47168,-0.471924,1.44043,1.44043,-0.809082,24.015625,0,3.675781,3.115234,4.796875,1.501953,3.410156,33,2.201172,1.083984,3.115234,8,2,0.994141,0.61084,-0.180908,0.771492,2.027344,1.235352,0.908691,0.756348,2.332031,-0.783203,0.748535,3.115234,0.000153,1.0,1.084961,-2.03125,0.348145,0.67041,-2.445312,0.21521,2.822266,-0.292969,0.905762,3.115234,0.0,1.0,2.529297,-0.585938,0.812012,0.414307,-2.701172,0.132935,2.623047,1.819336,-1.296875,0.583496,0.496338,-2.619141,0.306152,-2.810547,2.822266,-0.292969,0.905762,3.115234,2.529297,0.306152,-2.810547
28,1.084961,1.083984,1.506836,-0.47168,-0.471924,1.44043,1.44043,1.0,24.015625,0,3.675781,2.529297,4.875,1.526367,0.000765,33,2.201172,1.083984,3.115234,8,3,0.994141,0.61084,-0.171021,0.781161,2.027344,1.245117,0.908691,0.756348,2.332031,-0.197144,0.921875,3.115234,0.585938,1.231445,1.084961,-1.445312,0.428711,0.67041,-1.859375,0.265137,2.488281,-0.041656,0.983398,3.115234,0.585938,1.231445,1.819336,-0.710938,0.71875,0.649414,-1.880859,0.256836,2.623047,1.819336,-0.710938,0.71875,0.496338,-2.033203,0.306152,-2.224609,2.822266,0.292969,1.116211,3.115234,2.529297,0.306152,-2.224609
29,1.084961,1.083984,1.506836,-0.471924,-0.471924,1.44043,1.44043,1.0,24.015625,0,3.675781,2.529297,0.001193,6.402344,0.000195,33,2.201172,1.083984,3.115234,8,4,0.994141,0.61084,1.123047,-1.1905,2.027344,2.539062,0.908691,0.756348,2.332031,-0.197388,0.921875,3.115234,0.585938,1.231445,1.084961,-1.445312,0.428711,0.67041,-1.859375,0.264893,2.822266,0.292725,1.115234,3.115234,0.585938,1.231445,2.529297,-0.000641,0.999512,0.338379,-2.191406,0.133667,2.623047,1.819336,-0.711426,0.71875,0.496338,-2.033203,0.306152,-2.224609,2.822266,0.292725,1.115234,3.115234,2.529297,0.306152,-2.224609
30,1.084961,1.083984,1.506836,-0.471924,-0.471924,1.44043,1.44043,-0.80957,24.015625,0,3.675781,3.115234,0.000296,6.351562,3.359375,33,2.201172,1.083984,3.115234,8,5,0.994141,0.61084,1.113281,-1.21366,2.027344,2.529297,0.908691,0.756348,2.332031,-0.783203,0.748535,3.115234,0.0,1.0,1.084961,-2.03125,0.3479,0.67041,-2.445312,0.21521,2.623047,-0.494141,0.841309,3.115234,0.0,1.0,1.819336,-1.296875,0.583496,0.536133,-2.580078,0.171997,2.623047,1.819336,-1.296875,0.583496,0.496338,-2.619141,0.306152,-2.810547,2.822266,-0.293213,0.905762,3.115234,2.529297,0.306152,-2.810547


[INFO]2019-06-24 13:37:20,110:main:Start oof_predict
[INFO]2019-06-24 13:37:20,112:main:prediction: 0
[INFO]2019-06-24 13:37:53,751:main:prediction: 1
[INFO]2019-06-24 13:38:26,387:main:prediction: 2
[INFO]2019-06-24 13:38:59,156:main:Finish oof_predict
[INFO]2019-06-24 13:38:59,207:main:Starting predict target(3JHC)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
6,1.102539,1.40625,1.40625,-0.372314,-0.38208,1.438477,1.827148,0.490234,28.015625,2,3.914062,2.640625,0.134888,5.585938,1.25,18,1.910156,1.092773,3.261719,4,6,0.558594,1.088867,1.642578,-1.967963,1.792969,2.347656,1.108398,0.920898,1.828125,-0.8125,0.692383,2.640625,0.0,1.0,1.102539,-1.538086,0.41748,0.629883,-2.009766,0.238525,1.973633,-0.666992,0.747559,3.261719,0.62207,1.235352,1.092773,-1.547852,0.413818,0.984375,-1.65625,0.372803,1.973633,1.092773,-1.547852,0.413818,0.938477,-1.702148,0.321533,-2.318359,2.847656,0.207275,1.078125,3.261719,2.640625,0.321533,-2.318359
10,1.102539,1.40625,1.40625,-0.372314,-0.38208,1.438477,1.827148,0.490967,28.015625,2,3.914062,2.640625,1.404297,5.507812,0.058105,18,1.910156,1.092773,3.261719,3,6,0.585449,0.854004,1.407227,-1.543497,1.722656,2.275391,1.229492,1.014648,1.842773,-0.797363,0.697754,2.640625,0.0,1.0,1.102539,-1.537109,0.41748,0.770508,-1.869141,0.291748,1.973633,-0.666504,0.747559,3.261719,0.622559,1.236328,1.092773,-1.546875,0.414062,0.984375,-1.655273,0.372803,1.973633,1.092773,-1.546875,0.414062,0.938477,-1.701172,0.321533,-2.318359,2.847656,0.207642,1.079102,3.261719,2.640625,0.321533,-2.318359
13,1.092773,1.40625,1.40625,-0.300537,-0.38208,1.46582,1.827148,-1.0,28.015625,2,3.904297,3.261719,1.364258,5.179688,4.097656,18,1.910156,1.092773,3.261719,2,6,0.46167,0.419678,0.972656,-0.758287,1.392578,1.945312,1.375977,0.791504,2.177734,-1.084961,0.66748,3.261719,0.0,1.0,1.092773,-2.169922,0.334961,1.53418,-1.728516,0.470215,1.973633,-1.289062,0.60498,3.261719,0.0,1.0,1.092773,-2.169922,0.334961,0.984375,-2.277344,0.301758,1.973633,1.092773,-2.169922,0.334961,0.938477,-2.324219,0.321533,-2.941406,2.847656,-0.415039,0.873047,3.261719,2.640625,0.321533,-2.941406
14,1.102539,1.40625,1.40625,-0.372314,-0.38208,1.438477,1.827148,0.490234,28.015625,2,3.914062,2.640625,0.023956,2.716797,4.230469,18,1.910156,1.092773,3.261719,4,6,0.708496,-0.260254,-1.652344,-0.186972,1.392578,0.0,1.254883,0.558105,1.828125,-0.8125,0.692383,2.640625,0.0,1.0,1.102539,-1.538086,0.41748,0.629883,-2.009766,0.238525,1.973633,-0.666992,0.747559,3.261719,0.62207,1.235352,1.092773,-1.547852,0.413818,0.984375,-1.65625,0.372803,1.973633,1.092773,-1.547852,0.413818,0.938477,-1.702148,0.321533,-2.318359,2.847656,0.207275,1.078125,3.261719,2.640625,0.321533,-2.318359
18,1.102539,1.40625,1.40625,-0.372314,-0.38208,1.438477,1.827148,0.490967,28.015625,2,3.914062,2.640625,2.916016,2.660156,1.393555,18,1.910156,1.092773,3.261719,3,6,0.366211,-0.267578,-1.660156,-0.192157,1.392578,0.0,1.537109,0.62207,1.842773,-0.797363,0.697754,2.640625,0.0,1.0,1.102539,-1.538086,0.41748,0.770508,-1.869141,0.291748,1.973633,-0.666504,0.747559,3.261719,0.622559,1.235352,1.092773,-1.546875,0.414062,0.984375,-1.655273,0.372803,1.973633,1.092773,-1.546875,0.414062,0.938477,-1.701172,0.321533,-2.318359,2.847656,0.207642,1.079102,3.261719,2.640625,0.321533,-2.318359


[INFO]2019-06-24 13:39:03,313:main:Start oof_predict
[INFO]2019-06-24 13:39:03,319:main:prediction: 0
[INFO]2019-06-24 13:40:16,568:main:prediction: 1
[INFO]2019-06-24 13:41:27,733:main:prediction: 2
[INFO]2019-06-24 13:42:41,166:main:Finish oof_predict
[INFO]2019-06-24 13:42:41,226:main:Starting predict target(2JHH)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
7,0,1.15625,1.102539,1.102539,-0.308105,12.007812,2.205078,1.783203,2.410156,0.00028,0.769531,18,1.910156,1.092773,3.261719,4,1,0.558594,1.088867,-0.70459,0.607141,1.792969,0.0,1.108398,0.920898,1.828125,0.044708,1.025391,2.640625,0.857422,1.480469,1.102539,-0.680664,0.618164,0.629883,-1.15332,0.353271,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.78418,1.783203,-2.980232e-07,1.0,0.001056,-1.782227,0.001056,-1.782227,1.78418,0.001363,1.000977,1.785156,1.783203,0.001056,-1.782227
8,0,1.140625,1.102539,1.092773,-0.32251,12.007812,2.195312,1.785156,2.357422,0.00761,0.821289,18,1.910156,1.092773,3.261719,4,2,0.558594,1.088867,-0.634277,0.63199,1.792969,0.070496,1.108398,0.920898,1.828125,0.042633,1.023438,2.640625,0.855469,1.479492,1.102539,-0.682617,0.617676,0.629883,-1.155273,0.352783,1.785156,-8e-06,1.0,1.785156,0.0,1.0,1.785156,-1.6e-05,1.0,1.1e-05,-1.785156,6e-06,1.78418,1.783203,-0.002052307,0.999023,0.001056,-1.78418,0.001056,-1.78418,1.78418,-0.00069,0.999512,1.785156,1.783203,0.001056,-1.78418
11,0,1.140625,1.102539,1.092773,-0.32251,12.007812,2.195312,1.785156,0.000284,0.004971,3.181641,18,1.910156,1.092773,3.261719,3,2,0.585449,0.854004,-0.869141,0.495677,1.722656,0.0,1.229492,1.014648,1.842773,0.057404,1.032227,2.640625,0.85498,1.478516,1.102539,-0.682617,0.617676,0.770508,-1.014648,0.431641,1.785156,8e-06,1.0,1.785156,1.6e-05,1.0,1.785156,0.0,1.0,1.1e-05,-1.785156,6e-06,1.78418,1.783203,-0.002037048,0.999023,0.001056,-1.78418,0.001056,-1.78418,1.78418,-0.000673,0.999512,1.785156,1.783203,0.001056,-1.78418
16,0,1.15625,1.102539,1.102539,-0.308105,12.007812,2.205078,1.783203,2.410156,0.0003,0.767578,18,1.910156,1.092773,3.261719,4,1,0.708496,-0.260254,-0.021652,1.090742,1.392578,1.630859,1.254883,0.558105,1.828125,0.044678,1.025391,2.640625,0.857422,1.480469,1.102539,-0.680664,0.618164,0.629883,-1.15332,0.353271,1.783203,0.0,1.0,1.783203,0.0,1.0,1.783203,0.0,1.0,,,,1.78418,1.783203,0.0,1.0,0.001056,-1.782227,0.001056,-1.782227,1.78418,0.001364,1.000977,1.785156,1.783203,0.001056,-1.782227
17,0,1.140625,1.102539,1.092773,-0.32251,12.007812,2.195312,1.785156,0.211304,1.920898,1.054688,18,1.910156,1.092773,3.261719,4,2,0.708496,-0.260254,1.381836,0.158571,1.392578,3.035156,1.254883,0.558105,1.828125,0.042633,1.023438,2.640625,0.855469,1.479492,1.102539,-0.682617,0.617676,0.629883,-1.155273,0.352783,1.785156,-9e-06,1.0,1.785156,0.0,1.0,1.785156,-1.7e-05,1.0,1.2e-05,-1.785156,7e-06,1.78418,1.783203,-0.002054214,0.999023,0.001056,-1.78418,0.001056,-1.78418,1.78418,-0.00069,0.999512,1.785156,1.783203,0.001056,-1.78418


[INFO]2019-06-24 13:42:43,704:main:Start oof_predict
[INFO]2019-06-24 13:42:43,712:main:prediction: 0
[INFO]2019-06-24 13:43:06,244:main:prediction: 1
[INFO]2019-06-24 13:43:28,913:main:prediction: 2
[INFO]2019-06-24 13:43:52,103:main:Finish oof_predict
[INFO]2019-06-24 13:43:52,145:main:Starting predict target(1JHN)


Unnamed: 0,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
56,1.008789,0.595215,0.224487,0.198608,14,1.924805,1.008789,3.197266,4,4,0.854492,0.629883,-0.730469,0.463045,1.803711,0.442871,1.140625,0.10022,1.81543,0.806152,1.798828,2.5,1.491211,2.478516,1.008789,0.630859,-0.378174,0.625,1.928711,0.919434,1.911133,3.197266,2.1875,3.167969,1.008789,-0.000327,0.999512,1.099609,0.090393,1.089844,1.928711,1.008789,-0.000327,0.999512,1.017578,0.008858,0.000189,-1.008789,1.008789,-0.000162,1.0,1.008789,1.008789,0.000189,-1.008789
60,1.008789,0.799805,0.196167,0.0215,14,1.924805,1.008789,3.197266,3,4,0.702148,0.238892,-1.12207,0.175554,1.360352,0.0,1.017578,0.121948,2.070312,1.061523,2.052734,3.197266,2.1875,3.169922,1.008789,1.095703,0.08667,1.085938,1.928711,0.919922,1.912109,3.197266,2.1875,3.167969,1.008789,0.0,1.0,1.099609,0.090759,1.089844,1.928711,1.008789,0.0,1.0,1.017578,0.009186,0.000189,-1.008789,1.008789,0.000165,1.0,1.008789,1.008789,0.000189,-1.008789
65,1.008789,0.60791,0.202759,0.207642,14,1.924805,1.008789,3.197266,4,4,0.690918,-0.226196,0.397217,0.362886,1.360352,1.984375,1.24707,0.100037,1.81543,0.806152,1.798828,2.5,1.491211,2.478516,1.008789,0.630859,-0.378174,0.625,1.928711,0.919434,1.911133,3.197266,2.1875,3.167969,1.008789,-0.000328,0.999512,1.099609,0.090393,1.089844,1.928711,1.008789,-0.00033,0.999512,1.017578,0.008858,0.000189,-1.008789,1.008789,-0.000165,1.0,1.008789,1.008789,0.000189,-1.008789
69,1.008789,0.001417,0.996094,0.019806,14,1.924805,1.008789,3.197266,3,4,0.702148,0.238892,0.862305,-0.383241,1.360352,1.984375,1.017578,0.121948,2.070312,1.061523,2.052734,3.197266,2.1875,3.167969,1.008789,1.095703,0.086609,1.085938,1.928711,0.919922,1.912109,3.197266,2.1875,3.169922,1.008789,0.0,1.0,1.099609,0.090759,1.089844,1.928711,1.008789,-2e-06,1.0,1.017578,0.009186,0.000189,-1.008789,1.008789,0.000163,1.0,1.008789,1.008789,0.000189,-1.008789
156,1.006836,0.006924,0.257812,0.749023,22,2.082031,1.006836,3.402344,4,5,0.245972,-0.225952,-0.264404,-5.898284,1.478516,1.44043,1.354492,0.599121,1.854492,0.847168,1.841797,2.242188,1.235352,2.226562,1.006836,0.570801,-0.435791,0.566895,1.867188,0.859863,1.854492,2.113281,1.105469,2.097656,1.006836,0.0,1.0,0.481689,-0.525391,0.478516,1.867188,1.006836,0.0,1.0,0.481689,-0.525391,0.041046,-0.96582,1.006836,0.0,1.0,1.006836,1.006836,,


[INFO]2019-06-24 13:43:54,710:main:Start oof_predict
[INFO]2019-06-24 13:43:54,712:main:prediction: 0
[INFO]2019-06-24 13:43:57,134:main:prediction: 1
[INFO]2019-06-24 13:44:03,541:main:prediction: 2
[INFO]2019-06-24 13:44:08,473:main:Finish oof_predict
[INFO]2019-06-24 13:44:08,508:main:Starting predict target(3JHN)


Unnamed: 0,3j_norm_vec_02,3j_norm_vec_13,3j_norm_vec_23,3j_cos_023,3j_cos_231,3j_area_023,3j_area_231,3j_dihedral,3j_atom_center_weight,3j_atom_center,3j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
58,1.008789,1.388672,1.388672,-0.461182,-0.399414,1.243164,1.767578,0.864746,26.015625,1,3.787109,2.5,0.169922,6.042969,0.040741,14,1.924805,1.008789,3.197266,4,4,0.854492,0.629883,1.253906,-1.010844,1.803711,2.427734,1.140625,0.10022,1.81543,-0.685059,0.726074,2.5,0.0,1.0,1.008789,-1.491211,0.403564,0.630859,-1.869141,0.252197,1.928711,-0.571777,0.771484,3.197266,0.696289,1.27832,1.008789,-1.491211,0.40332,1.099609,-1.401367,0.439697,1.928711,1.008789,-1.491211,0.40332,1.017578,-1.482422,0.401855,-2.097656,2.847656,0.348145,1.139648,3.197266,2.5,0.401855,-2.097656
62,1.008789,1.388672,1.388672,-0.384033,-0.399414,1.292969,1.767578,-0.96875,26.015625,1,3.785156,3.197266,4.316406,5.890625,0.009445,14,1.924805,1.008789,3.197266,3,4,0.702148,0.238892,0.862305,-0.383241,1.360352,1.984375,1.017578,0.121948,2.070312,-1.125977,0.647461,3.197266,0.0,1.0,1.008789,-2.1875,0.315674,1.095703,-2.101562,0.342773,1.928711,-1.267578,0.603516,3.197266,0.0,1.0,1.008789,-2.1875,0.315674,1.099609,-2.097656,0.343994,1.928711,1.008789,-2.1875,0.315674,1.017578,-2.177734,0.401855,-2.794922,2.847656,-0.348145,0.891113,3.197266,2.5,0.401855,-2.794922
63,1.008789,1.388672,1.388672,-0.461182,-0.399414,1.243164,1.767578,0.864746,26.015625,1,3.787109,2.5,3.853516,2.353516,0.044891,14,1.924805,1.008789,3.197266,4,4,0.690918,-0.226196,-1.586914,-0.16623,1.360352,0.0,1.24707,0.100037,1.81543,-0.685059,0.726074,2.5,0.0,1.0,1.008789,-1.491211,0.403564,0.630859,-1.869141,0.252197,1.928711,-0.571777,0.771484,3.197266,0.696289,1.27832,1.008789,-1.491211,0.40332,1.099609,-1.401367,0.439697,1.928711,1.008789,-1.491211,0.40332,1.017578,-1.482422,0.401855,-2.097656,2.847656,0.348145,1.139648,3.197266,2.5,0.401855,-2.097656
67,1.008789,1.388672,1.388672,-0.384033,-0.399414,1.292969,1.767578,-0.96875,26.015625,1,3.785156,3.197266,1.313477,8.890625,0.01062,14,1.924805,1.008789,3.197266,3,4,0.702148,0.238892,-1.12207,0.175554,1.360352,0.0,1.017578,0.121948,2.070312,-1.125977,0.647461,3.197266,0.0,1.0,1.008789,-2.1875,0.315674,1.095703,-2.101562,0.342773,1.928711,-1.267578,0.603516,3.197266,0.0,1.0,1.008789,-2.1875,0.315674,1.099609,-2.097656,0.343994,1.928711,1.008789,-2.1875,0.315674,1.017578,-2.177734,0.401855,-2.794922,2.847656,-0.348145,0.891113,3.197266,2.5,0.401855,-2.794922
408,1.116211,1.362305,1.541992,-0.433838,-0.38208,1.551758,1.942383,1.0,24.015625,0,4.019531,2.560547,4.480469,2.072266,0.000154,10,1.972656,1.004883,3.357422,3,3,0.744141,0.176636,-1.148438,0.133281,1.325195,0.0,1.060547,0.005013,1.979492,-0.580078,0.773438,2.560547,0.0,1.0,1.116211,-1.443359,0.436279,0.762207,-1.797852,0.297852,1.524414,-1.036133,0.595215,2.560547,0.0,1.0,1.004883,-1.554688,0.392578,0.896973,-1.663086,0.350342,1.524414,1.004883,-1.554688,0.392578,0.896973,-1.663086,0.460938,-2.097656,2.560547,0.0,1.0,2.560547,2.560547,,


[INFO]2019-06-24 13:44:10,099:main:Start oof_predict
[INFO]2019-06-24 13:44:10,104:main:prediction: 0
[INFO]2019-06-24 13:44:22,277:main:prediction: 1
[INFO]2019-06-24 13:44:34,230:main:prediction: 2
[INFO]2019-06-24 13:44:46,195:main:Finish oof_predict
[INFO]2019-06-24 13:44:46,237:main:Starting predict target(2JHN)


Unnamed: 0,2j_atom_center,2j_area_021,2j_norm_vec_02,2j_norm_vec_12,2j_cos,2j_atom_center_weight,2j_sum_norm_vec,dist,dist_x,dist_y,dist_z,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,atom_0_couples_count,atom_1_couples_count,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff
141,0,1.480469,1.095703,1.451172,-0.364258,12.007812,2.546875,2.113281,0.328613,3.720703,0.41333,22,2.082031,1.006836,3.402344,6,5,0.498779,0.660645,0.62207,17.239294,1.824219,1.785156,1.154297,0.78125,2.082031,-0.030411,0.98584,3.191406,1.078125,1.510742,1.095703,-1.017578,0.518555,0.719727,-1.392578,0.340576,1.867188,-0.245728,0.883789,2.113281,1.3e-05,1.0,1.006836,-1.105469,0.476562,0.481689,-1.630859,0.228027,1.867188,1.006836,-1.105469,0.476562,0.481689,-1.630859,0.154541,-1.958008,2.082031,-0.030807,0.985352,2.113281,2.044922,0.035919,-2.076172
147,0,1.480469,1.095703,1.451172,-0.364258,12.007812,2.546875,2.113281,1.373047,2.962891,0.126587,22,2.082031,1.006836,3.402344,5,5,0.166748,0.440674,0.402344,11.500521,1.824219,1.785156,1.141602,0.827637,2.142578,0.02977,1.013672,3.191406,1.078125,1.510742,1.095703,-1.017578,0.518555,0.787598,-1.325195,0.372803,1.867188,-0.24585,0.883789,2.113281,0.0,1.0,1.006836,-1.105469,0.476562,0.481689,-1.630859,0.228027,1.867188,1.006836,-1.105469,0.476562,0.481689,-1.630859,0.154541,-1.958008,2.082031,-0.030807,0.985352,2.113281,2.044922,0.035919,-2.076172
152,0,1.510742,1.09082,1.451172,-0.296387,12.007812,2.541016,2.056641,0.00197,3.1875,1.041992,22,2.082031,1.006836,3.402344,4,5,0.19104,0.094788,0.056488,2.474538,1.478516,1.44043,0.969727,0.818359,2.158203,0.10083,1.048828,2.96875,0.912598,1.443359,1.09082,-0.966797,0.530273,0.803223,-1.253906,0.390381,1.867188,-0.190308,0.907715,2.113281,0.055481,1.027344,1.006836,-1.049805,0.489502,0.481689,-1.575195,0.234131,1.867188,1.006836,-1.049805,0.489502,0.481689,-1.575195,0.154541,-1.902344,2.082031,0.024673,1.011719,2.113281,2.044922,0.035919,-2.021484
160,0,1.402344,1.108398,1.359375,-0.366211,12.007812,2.46875,2.044922,0.13208,3.207031,0.842285,22,2.082031,1.006836,3.402344,3,5,0.2229,0.282959,0.244507,7.383074,1.478516,1.44043,1.094727,0.653809,2.185547,0.140625,1.068359,3.402344,1.358398,1.664062,1.108398,-0.936523,0.541992,1.154297,-0.891113,0.564453,1.867188,-0.178101,0.913086,2.113281,0.067749,1.033203,1.006836,-1.038086,0.492432,0.481689,-1.563477,0.235596,1.867188,1.006836,-1.038086,0.492432,0.481689,-1.563477,0.154541,-1.890625,2.082031,0.036926,1.017578,2.113281,2.044922,0.035919,-2.009766
264,0,1.475586,1.104492,1.455078,-0.396729,12.007812,2.558594,2.146484,0.719727,3.802734,0.089783,37,2.210938,1.085938,3.359375,6,7,0.782715,0.666992,0.64502,30.452684,1.819336,1.797852,1.151367,0.747559,2.015625,-0.131348,0.938965,2.640625,0.493408,1.229492,1.104492,-1.042969,0.51416,0.589844,-1.557617,0.274658,2.148438,0.001721,1.000977,2.191406,0.043915,1.020508,2.091797,-0.056335,0.973633,0.042542,-2.105469,0.019821,2.148438,2.091797,-0.056335,0.973633,0.042542,-2.105469,0.186523,-1.960938,2.148438,0.001721,1.000977,2.191406,2.091797,0.042542,-2.105469


[INFO]2019-06-24 13:44:48,174:main:Start oof_predict
[INFO]2019-06-24 13:44:48,176:main:prediction: 0
[INFO]2019-06-24 13:44:58,241:main:prediction: 1
[INFO]2019-06-24 13:45:07,642:main:prediction: 2
[INFO]2019-06-24 13:45:17,403:main:Finish oof_predict


Unnamed: 0,id,scalar_coupling_constant
0,4658147,18.565199
1,4658148,192.494871
2,4658149,1.389909
3,4658150,185.061554
4,4658151,16.415972


0


In [23]:
display(df_submit.head())
df_submit.to_csv('submission.csv', index=False)

Unnamed: 0,id,scalar_coupling_constant
0,4658147,18.565199
1,4658148,192.494871
2,4658149,1.389909
3,4658150,185.061554
4,4658151,16.415972


In [24]:
df_submit.shape

(2505542, 2)