# Data loading

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os

In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import gc
import seaborn as sns
import warnings
import scipy.signal as sg
warnings.filterwarnings("ignore")

from scipy import stats

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are narrowed to int8/int16/int32/int64 according to the
    column's minimum and maximum (bounds are inclusive, so a column spanning
    exactly -128..127 becomes int8). Float columns become float32 when their
    range fits; float16 is only kept for columns that already are float16.
    Columns whose dtype is not in the numeric list are left untouched.

    :param df: DataFrame to shrink; its columns are reassigned on this object.
    :param verbose: when True, print the final memory footprint and the
        percentage saved.
    :return: the same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: the limit values themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Precision is a property of the column dtype, so look it up once
            # instead of calling np.finfo on every element.
            c_prec = np.finfo(col_type).precision
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            if c_min > half.min and c_max < half.max and c_prec == half.precision:
                df[col] = df[col].astype(np.float16)
            elif c_min > single.min and c_max < single.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (empty or all-NaN column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [29]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """Mean over groups of the log of each group's mean absolute error.

    :param y_true: true values (pandas Series).
    :param y_pred: predictions, subtracted element-wise from ``y_true``.
    :param types: group label for every row, used as the groupby key.
    :param floor: lower bound applied to each group's MAE before the log,
        so a perfect group does not produce ``-inf``.
    :return: scalar score (lower is better).
    """
    absolute_error = (y_true - y_pred).abs()
    mae_per_group = absolute_error.groupby(types).mean()
    floored_mae = mae_per_group.map(lambda group_mae: max(group_mae, floor))
    return np.log(floored_mae).mean()

In [None]:
#need to implement a way to calculate group mae
def train_model_regression(X, X_test, y, params, folds, molecules, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    Train a regression model with cross-validation and predict the test set.

    Returns a dictionary with:
      'oof'                - out-of-fold predictions on the training data,
      'prediction'         - test predictions averaged over the folds,
      'scores'             - list with one validation score per fold,
      'feature_importance' - per-fold LightGBM importances (only for
                             model_type='lgb' with plot_feature_importance=True).

    :params: X - training data (pd.DataFrame)
    :params: X_test - test data (pd.DataFrame)
    :params: y - target
    :params: params - keyword parameters forwarded to the chosen model
    :params: folds - splitter object; must provide split(X, groups=...) and n_splits
    :params: molecules - group labels passed to folds.split as `groups`
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - 'mae', 'mse' or 'group_mae' ('group_mae' needs a 'type' column in the used columns)
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period forwarded to LightGBM / XGBoost
    :params: early_stopping_rounds - early stopping patience for LightGBM and XGBoost
    :params: n_estimators - maximum number of boosting rounds for LightGBM and XGBoost

    NOTE(review): the fold loop has a branch for np.ndarray inputs, but
    `X.columns` / `X_test[columns]` below already require DataFrames, so
    ndarray input does not look usable as written - confirm before relying on it.
    """
    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'lgb', 'xgb', 'sklearn' or 'cat'.")
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires an estimator passed via `model`.")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    def _fold_score(y_true, y_hat, X_fold):
        # 'group_mae' is scored per coupling type and has no sklearn-style scorer.
        if eval_metric == 'group_mae':
            return metrics_dict[eval_metric]['scoring_function'](y_true, y_hat, X_fold['type'])
        return metrics_dict[eval_metric]['sklearn_scoring_function'](y_true, y_hat)

    result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    
    # averaged predictions on test data
    prediction = np.zeros(len(X_test))
    
    # list of scores on folds
    scores = []
    # one importance frame per fold, concatenated once after the loop
    fold_importances = []
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups = molecules)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        elif model_type == 'xgb':
            # Feature names must describe the selected columns, not all of X.
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # Honour the early_stopping_rounds argument (its default is the previously hard-coded 200).
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=columns), ntree_limit=model.best_ntree_limit)
        
        elif model_type == 'sklearn':
            # The same estimator instance is refitted on every fold.
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            y_pred = model.predict(X_test).reshape(-1,)
        
        elif model_type == 'cat':
            # NOTE: iterations is fixed at 20000 here; n_estimators is not used for CatBoost.
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        score = _fold_score(y_valid, y_pred_valid, X_valid)
        scores.append(score)
        if model_type == 'sklearn':
            # sklearn estimators print nothing while fitting, so report the fold score here.
            print(f'Fold {fold_n + 1}. {eval_metric}: {score:.4f}.')
            print('')

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            fold_importances.append(fold_importance)

    prediction /= folds.n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance = pd.concat(fold_importances, axis=0) if fold_importances else pd.DataFrame()
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict

# Data Preparation

Read the competition datasets and the publicly available datasets, then merge them to form the training and test datasets.

In [12]:
# Geometry feature tables, indexed by their `id` column.
# NOTE(review): every path in this notebook is an absolute local Windows path;
# a configurable data directory would make the notebook portable.
train1 = pd.read_csv('E:/kaggle/Molecular_properties/simple-molecular-geometry-features/train_geom.csv', index_col='id')
test1 = pd.read_csv('E:/kaggle/Molecular_properties/simple-molecular-geometry-features/test_geom.csv', index_col='id')

In [9]:
# Main train/test tables; read without index_col, so they get a default positional index.
train = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/train_bonds_QM9_neighbor_selected_all.csv')
test = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/test_bonds_QM9_neighbor_selected_all.csv')

# Replace every missing value with 0, in place.
train.fillna(0, inplace = True)
test.fillna(0, inplace = True)

In [None]:
test.tail()

In [33]:
# pop() returns the target column and removes it from train1 in place,
# so this cell cannot be run twice on the same train1.
y_tr = train1.pop('scalar_coupling_constant')

In [34]:
df_molecules = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/molecules.csv')

In [None]:
scalar_coupling = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/scalar_coupling_contributions.csv')
# Not the last expression of the cell, so this head() produces no visible output.
scalar_coupling.head()
# Individual contribution columns kept as separate series.
y_fc = scalar_coupling['fc']
y_sd = scalar_coupling['sd']
y_pso = scalar_coupling['pso']
y_dso = scalar_coupling['dso']

In [None]:
scalar_coupling.head()

In [None]:
# NOTE(review): train1 and test1 are read again in later cells (copying
# atom_index_0/atom_index_1/molecule_name into train/test, and
# test1.reset_index), which fails after this del on a top-to-bottom run.
del test1
del train1

In [None]:
structures = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/structures.csv')
display(structures.head())

Idea: there may be a useful ratio between an electron's distance to 'N' and its distance to 'H'.

In [None]:
dipole_moments = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/dipole_moments.csv')
dipole_moments.head()

In [None]:
magnetic_shielding = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/magnetic_shielding_tensors.csv')
magnetic_shielding.head()

In [None]:
mulliken_charges = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/mulliken_charges.csv')
mulliken_charges.head()

In [None]:
potential = pd.read_csv('E:/kaggle/Molecular_properties/champs-scalar-coupling/potential_energy.csv')
potential.head()

In [None]:
# QM9-derived features, indexed by their `id` column.
QM9 = pd.read_csv('E:/kaggle/Molecular_properties/qm9-processed/QM9_processed.csv', index_col='id')

In [None]:
QM9.sort_index(inplace = True)

In [None]:
QM9 = reduce_mem_usage(QM9)

In [None]:
# Keep every column from position 5 onward; the first five columns are discarded.
# NOTE(review): positional slice - which columns are dropped is not visible here;
# confirm against the CSV header.
QM9_cols = QM9.columns.values[5:]

In [None]:
QM9 = QM9[QM9_cols]

In [58]:
train_bonds = pd.read_csv('E:/kaggle/Molecular_properties/data-bonds/train_bonds.csv', index_col='id')
test_bonds = pd.read_csv('E:/kaggle/Molecular_properties/data-bonds/test_bonds.csv', index_col='id')

In [None]:
train_bonds = reduce_mem_usage(train_bonds)
test_bonds = reduce_mem_usage(test_bonds)

In [61]:
cols_tokeep_bonds = ['EN_0', 'rad_0', #'n_bonds_0', 'bond_lengths_mean_0', 'nbond_unpaired_diff_0',
                     'EN_1', 'rad_1', #'n_bonds_1', 'bond_lengths_mean_1', 'nbond_unpaired_diff_1',
                     ]

In [62]:
train_bonds = train_bonds[cols_tokeep_bonds]
test_bonds = test_bonds[cols_tokeep_bonds]

In [None]:
test_bonds.head()

In [35]:
train_neighbor = pd.read_csv('E:/kaggle/Molecular_properties/data-neighbor/train_neighbor.csv', index_col='id')
test_neighbor = pd.read_csv('E:/kaggle/Molecular_properties/data-neighbor/test_neighbor.csv', index_col='id')

In [36]:
train_neighbor = reduce_mem_usage(train_neighbor)
test_neighbor = reduce_mem_usage(test_neighbor)

Mem. usage decreased to 1203.88 Mb (41.6% reduction)
Mem. usage decreased to 637.99 Mb (41.4% reduction)


In [6]:
angles = pd.read_csv('E:/kaggle/Molecular_properties/angle-and-dihedral-for-the-champs-structures/angles.csv')

In [None]:
angles.head()

In [None]:
# Left join on the index of both frames. QM9 is indexed by `id`, while train/test
# were read without index_col and carry a default positional index.
# NOTE(review): this only lines up if row position equals `id`; that looks
# doubtful for test in particular - confirm before trusting these columns.
train = pd.merge(train, QM9, how = 'left', left_index = True, right_index = True)
test = pd.merge(test, QM9, how = 'left', left_index = True, right_index = True)

In [None]:
del QM9

In [78]:
# Same index-on-index join for the bond tables (read with index_col='id').
train = pd.merge(train, train_bonds, how = 'left', left_index = True, right_index = True)
test = pd.merge(test, test_bonds, how = 'left', left_index = True, right_index = True)

In [83]:
test.drop(['id'], axis = 1, inplace = True)

In [405]:
# Column assignment aligns on index labels; train1 is indexed by `id`.
# NOTE(review): train1/test1 are deleted in an earlier cell, so on a fresh
# top-to-bottom run these two cells raise NameError.
train['atom_index_0'] = train1['atom_index_0']
train['atom_index_1'] = train1['atom_index_1']
train['molecule_name'] = train1['molecule_name']

In [406]:
test['atom_index_0'] = test1['atom_index_0']
test['atom_index_1'] = test1['atom_index_1']
test['molecule_name'] = test1['molecule_name']

In [39]:
# Neighbor features are joined on the (molecule, atom pair) key columns rather than on the index.
train = pd.merge(train, train_neighbor, how = 'left', left_on = ['molecule_name', 'atom_index_0', 'atom_index_1'], right_on = ['molecule_name', 'atom_index_0', 'atom_index_1'])
test = pd.merge(test, test_neighbor, how = 'left', left_on = ['molecule_name', 'atom_index_0', 'atom_index_1'], right_on = ['molecule_name', 'atom_index_0', 'atom_index_1'])

In [46]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 1972.41 Mb (35.2% reduction)
Mem. usage decreased to 1051.37 Mb (35.4% reduction)


In [29]:
test1.reset_index(inplace = True)

# Feature Engineering

In [None]:
atoms = structures['atom'].unique()

# Per-element constants keyed explicitly by element symbol, so the values no
# longer depend on the order in which elements first appear in `structures`.
atom_constants = pd.DataFrame(
    {'mass': [12, 1, 14, 16, 19],
     'unpaired_electrons': [2, 1, 3, 2, 1],
     'bonding_electrons_1': [-4, 1, -3, -2, -1],
     'bonding_electrons_2': [4, 1, -3, -2, -1]},
    index=['C', 'H', 'N', 'O', 'F'])

unknown_atoms = sorted(set(atoms) - set(atom_constants.index))
if unknown_atoms:
    raise ValueError(f'No constants defined for elements: {unknown_atoms}')

# One row per element present in `structures`, in first-seen order.
df_atoms = atom_constants.reindex(atoms)

In [None]:
df_atoms

Aggregate features from the different datasets.

Features to try:
1. Total mass of a molecule and mass of an atom
2. Size of a molecule (not yet well defined)
3. Free electrons of an atom

TODO: finish generating the mass, charge and size features of molecules (weighted and unweighted), then start modeling.

In [15]:
# Map the atom structure data into train and test files

def map_atom_info(df, atom_idx):
    """Join the `structures` row of atom ``atom_idx`` onto every row of ``df``.

    Reads the module-level ``structures`` table. The join key is
    (molecule_name, atom_index_<atom_idx>); the coordinate columns come back
    renamed to x_<atom_idx>, y_<atom_idx>, z_<atom_idx>.

    :param df: frame with 'molecule_name' and f'atom_index_{atom_idx}' columns.
    :param atom_idx: which atom of the pair to look up (0 or 1 at the call sites).
    :return: a new, left-joined DataFrame.
    """
    joined = df.merge(structures, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
    # The 'atom' column is deliberately left unrenamed (its rename is disabled).
    coordinate_names = {axis: f'{axis}_{atom_idx}' for axis in ('x', 'y', 'z')}
    return joined.drop('atom_index', axis=1).rename(columns=coordinate_names)

# First call adds x_0/y_0/z_0, second call adds x_1/y_1/z_1.
train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

In [16]:
# map_atom_info leaves the 'atom' column unrenamed, so the second merge makes
# pandas disambiguate the two copies with its default _x/_y suffixes; both are dropped here.
# NOTE(review): assumes train/test had no 'atom' column before the merges - confirm.
train.drop(['atom_x', 'atom_y'], axis = 1, inplace = True)
test.drop(['atom_x', 'atom_y'], axis = 1, inplace = True)

In [None]:
#Calculate distance between atoms
% time

train['dist'] = ((train['x_1'] - train['x_0']) ** 2 + (train['y_1'] - train['y_0']) ** 2 + (train['z_1'] - train['z_0']) ** 2) ** 0.5
test['dist'] = ((test['x_1'] - test['x_0']) ** 2 + (test['y_1'] - test['y_0']) ** 2 + (test['z_1'] - test['z_0']) ** 2) ** 0.5

In [None]:
# Aggregation features in a molecule
def _structure_extent_features(atom_table, prefix):
    """Per-molecule atom count, coordinate max/min/std and bounding-box size of ``atom_table``.

    Every feature column is named ``<prefix>_<feature>``; 'molecule_name' is
    kept as a plain column to merge on.
    """
    agg_func = {'atom': 'count',
                'x': ['max', 'min', 'std'],
                'y': ['max', 'min', 'std'],
                'z': ['max', 'min', 'std'],
                #'dist': ['max', 'min', 'std']
               }

    summary = atom_table.groupby('molecule_name').agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into names such as 'x_max'.
    summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
    summary = summary.reset_index()

    for axis in ('x', 'y', 'z'):
        summary[f'size_{axis}'] = summary[f'{axis}_max'] - summary[f'{axis}_min']
    # Diagonal of the axis-aligned bounding box.
    summary['size'] = (summary['size_x'] ** 2 + summary['size_y'] ** 2 + summary['size_z'] ** 2) ** 0.5

    prefixed = {col: prefix + '_' + col for col in summary.columns if col != 'molecule_name'}
    return summary.rename(columns=prefixed)


def map_structures_agg_info(df):
    """Add molecule-level geometry aggregates to ``df``.

    Reads the module-level ``structures`` and ``atoms``. Adds one set of
    'whole_*' columns computed over all atoms of the molecule and one set of
    '<element>_*' columns for every element in ``atoms`` except 'F'.

    Molecules without atoms of a given element get no match in the left join;
    the final fillna(0) turns those gaps - and any other missing value already
    in ``df`` - into 0.

    :param df: frame with a 'molecule_name' column.
    :return: a new DataFrame with the aggregate columns appended.
    """
    df = pd.merge(df, _structure_extent_features(structures, 'whole'), how = 'left',
                  left_on  = ['molecule_name'],
                  right_on = ['molecule_name'])

    for atom in atoms:
        if atom == 'F':
            continue
        element_features = _structure_extent_features(structures[structures['atom'] == atom], str(atom))
        df = pd.merge(df, element_features, how = 'left',
                      left_on  = ['molecule_name'],
                      right_on = ['molecule_name'])

    return df.fillna(0)

train = map_structures_agg_info(train)

test = map_structures_agg_info(test)

In [None]:
train.head()

In [None]:
# Attach the per-element constants from df_atoms to every atom row of `structures`.
# NOTE(review): right_on is given the df_atoms index values rather than a column
# label; right_index=True would be the more conventional spelling - confirm the
# resulting columns before changing it.
structures_more = pd.merge(structures, df_atoms, how = 'left',
                  left_on  = ['atom'],
                  right_on = df_atoms.index)
col_coordinate = ['x', 'y', 'z']
col_multiply = ['mass', 'unpaired_electrons', 'bonding_electrons_1', 'bonding_electrons_2']

# Coordinate-weighted quantities, e.g. mass_x = mass * x, for every property/axis pair.
for coordinate in col_coordinate:
    for col in col_multiply:
        structures_more[col + '_' + coordinate] = structures_more[col] * structures_more[coordinate]

structures_more.head()   

In [None]:
# Can always engineer these features depending on atoms

def map_structure_atoms_info(df):
    """Add per-molecule sums of the atom properties in ``structures_more`` to ``df``.

    Reads the module-level ``structures_more``. For each property (mass,
    unpaired_electrons, bonding_electrons_1, bonding_electrons_2) it adds the
    molecule total, the totals of the coordinate-weighted variants
    ('whole_<name>_sum') and the norm of the three weighted totals
    ('whole_<property>_size').

    :param df: frame with a 'molecule_name' column.
    :return: a new, left-joined DataFrame.
    """
    properties = ['mass', 'unpaired_electrons', 'bonding_electrons_1', 'bonding_electrons_2']
    weighted = [f'{prop}_{axis}' for prop in properties for axis in ('x', 'y', 'z')]
    summed_columns = properties + weighted

    totals = (structures_more
              .groupby('molecule_name')
              .agg({name: 'sum' for name in summed_columns})
              .reset_index())
    totals = totals.rename(columns={name: 'whole' + '_' + name + '_' + 'sum' for name in summed_columns})

    for prop in properties:
        totals[f'whole_{prop}_size'] = (totals[f'whole_{prop}_x_sum'] ** 2
                                        + totals[f'whole_{prop}_y_sum'] ** 2
                                        + totals[f'whole_{prop}_z_sum'] ** 2) ** 0.5

    return pd.merge(df, totals, how = 'left',
                    left_on  = ['molecule_name'],
                    right_on = ['molecule_name'])

train = map_structure_atoms_info(train)

test = map_structure_atoms_info(test)

In [None]:
train.head()

In [None]:
# More distance related aggregation features
def train_dist_info(df):
    """Add distance aggregates per molecule and per (molecule, atom) to ``df``.

    Columns are written onto ``df`` itself before it is passed through
    reduce_mem_usage. For each atom index (0 and 1) and each statistic
    (mean, max, min, std) three columns are added: the statistic, its
    difference to the row's own 'dist', and its ratio to 'dist'.

    :param df: frame with 'molecule_name', 'atom_index_0', 'atom_index_1' and 'dist'.
    :return: the frame returned by reduce_mem_usage.
    """
    # Number of atom pairs listed for the molecule.
    df['molecule_couples'] = df.groupby('molecule_name')['molecule_name'].transform('count')
    for stat in ('mean', 'min', 'max'):
        df[f'molecule_dist_{stat}'] = df.groupby('molecule_name')['dist'].transform(stat)

    for atom_idx in (0, 1):
        group_keys = ['molecule_name', f'atom_index_{atom_idx}']
        for stat in ('mean', 'max', 'min', 'std'):
            base = f'molecule_atom_index_{atom_idx}_dist_{stat}'
            df[base] = df.groupby(group_keys)['dist'].transform(stat)
            df[base + '_diff'] = df[base] - df['dist']
            df[base + '_div'] = df[base] / df['dist']

    df = reduce_mem_usage(df)
    return df

train = train_dist_info(train)

test = train_dist_info(test)

In [None]:
# Angle features in a molecule
def train_angle_info(df):
    """Add bond-angle aggregates per molecule and per (molecule, atom) to ``df``.

    Works on the 'bond_angle_axis' and 'bond_angle_plane' columns. Columns are
    written onto ``df`` itself before it is passed through reduce_mem_usage.
    For each angle, atom index (0 and 1) and statistic (mean, max, min, std)
    two columns are added: the statistic and its difference to the row's own angle.

    :param df: frame with 'molecule_name', 'atom_index_0', 'atom_index_1' and both angle columns.
    :return: the frame returned by reduce_mem_usage.
    """
    angle_columns = ('bond_angle_axis', 'bond_angle_plane')

    # Molecule-level summaries.
    for angle in angle_columns:
        for stat in ('mean', 'min', 'max'):
            df[f'molecule_{angle}_{stat}'] = df.groupby('molecule_name')[angle].transform(stat)

    # Summaries over all pairs that share the same first / second atom.
    for angle in angle_columns:
        for atom_idx in (0, 1):
            group_keys = ['molecule_name', f'atom_index_{atom_idx}']
            for stat in ('mean', 'max', 'min', 'std'):
                base = f'molecule_atom_index_{atom_idx}_{angle}_{stat}'
                df[base] = df.groupby(group_keys)[angle].transform(stat)
                df[base + '_diff'] = df[base] - df[angle]

    df = reduce_mem_usage(df)
    return df

In [19]:
def distances(df):
    """Add the pair distance and the squared per-axis offsets to ``df`` in place.

    Adds 'dist' (Euclidean distance between atom 0 and atom 1) and
    'dist_x', 'dist_y', 'dist_z' (squared coordinate differences).

    :param df: frame with x_0, y_0, z_0, x_1, y_1, z_1 columns.
    :return: the same DataFrame object.
    """
    offset = df[['x_0', 'y_0', 'z_0']].values - df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(offset, axis=1)

    for axis in ('x', 'y', 'z'):
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2

    return df

def map_atom_info(df_1,df_2, atom_idx):
    """Left-join ``df_2`` onto ``df_1`` for atom ``atom_idx`` of each pair.

    Note: this 3-argument definition replaces the earlier 2-argument
    ``map_atom_info(df, atom_idx)`` defined higher up in the notebook.

    :param df_1: frame with 'molecule_name' and f'atom_index_{atom_idx}' columns.
    :param df_2: per-atom frame with 'molecule_name' and 'atom_index' columns.
    :param atom_idx: which atom of the pair to look up.
    :return: a new DataFrame without the helper 'atom_index' column.
    """
    joined = df_1.merge(df_2, how='left',
                        left_on=['molecule_name', f'atom_index_{atom_idx}'],
                        right_on=['molecule_name', 'atom_index'])
    return joined.drop(columns='atom_index')

def create_closest(df):
    """Attach, for both atoms of every pair, the attributes of that atom's closest partner.

    "Closest" is taken among the atoms it is paired with in ``df`` itself
    (smallest 'dist' per molecule and atom), not among all atoms of the molecule.

    :param df: pair-level frame containing every column listed in ``cols`` below.
    :return: ``df`` joined with '<attribute>_closest_0' and '<attribute>_closest_1' columns.
    """
    cols = ["molecule_name","atom_index_0","atom_index_1",'atom_0', 'atom_1', 
                      "dist","x_0","y_0","z_0","x_1","y_1","z_1",
                     'n_bonds_0', 'n_bonds_1', 'rad_dist_diff',
                     'mulliken_atom_0', 'mulliken_atom_1'
                     ]
    df_temp=df.loc[:, cols].copy()
    # Mirrored copy with the roles of atom 0 and atom 1 swapped, so that every atom
    # shows up in the atom_index_0 position at least once.
    # NOTE(review): 'rad_dist_diff' is carried over unswapped - presumably symmetric; confirm.
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1', 'atom_index_1': 'atom_index_0',
                                       'atom_0': 'atom_1', 'atom_1': 'atom_0',
                                       'x_0': 'x_1', 'y_0': 'y_1', 'z_0': 'z_1', 'n_bonds_0': 'n_bonds_1', 
                                       'x_1': 'x_0', 'y_1': 'y_0', 'z_1': 'z_0', 'n_bonds_1': 'n_bonds_0',
                                       'mulliken_atom_0': 'mulliken_atom_1', 'mulliken_atom_1': 'mulliken_atom_0',
                                      })
    # Restore the original column order before stacking the two halves.
    df_temp_ = df_temp_[cols]
    df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    # Keep only the rows where the partner is at the minimum distance for that atom.
    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["dist"]]

    # Drop the atom's own attributes; what remains describes its closest partner.
    df_temp=df_temp.drop(['atom_0', 'x_0', 'y_0', 'z_0', 'n_bonds_0', 'min_distance', 'dist',
                        'mulliken_atom_0',
                         ], axis=1)
    # NOTE(review): there is no 'distance' column here (the column is called 'dist' and was
    # just dropped), so the 'distance' -> 'distance_closest' entry matches nothing.
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'atom_1': 'atom_closest',
                                     'distance': 'distance_closest',
                                     'n_bonds_1': 'n_bonds_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest',
                                     'mulliken_atom_1': 'mulliken_closest',
                                     'rad_dist_diff': 'rad_dist_diff_closest' 
                                    })
    # Ties at the minimum distance: keep the first row per atom.
    df_temp.drop_duplicates(subset=['molecule_name', 'atom_index'], inplace = True)
    # Join the closest-partner table once for each atom of the pair and tag the columns with 0 / 1.
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}',
                                        'atom_closest': f'atom_closest_{atom_idx}',
                                        'n_bonds_closest': f'n_bonds_closest_{atom_idx}',
                                        'mulliken_closest': f'mulliken_closest_{atom_idx}',
                                        'rad_dist_diff_closest': f'rad_dist_diff_closest_{atom_idx}',
                               })
    
    return df

def add_cos_features(df):
    """Add neighbour distances and bond-angle cosines for every atom pair.

    Reads the coordinates of both atoms (``x_0`` .. ``z_1``), of their closest
    neighbours (``x_closest_0`` .. ``z_closest_1``) and the pair distance
    ``dist``. Writes ``distance_0``/``distance_1`` (atom to its closest
    neighbour) and ``cos_0_1``, ``cos_0``, ``cos_1`` (dot products of the unit
    vectors). The helper ``vec_*`` columns are written to ``df`` and dropped
    from the returned frame.
    """
    axes = ('x', 'y', 'z')
    ends = (0, 1)

    # Euclidean distance from each atom of the pair to its closest neighbour.
    for end in ends:
        squared = sum((df[f'{axis}_{end}'] - df[f'{axis}_closest_{end}']) ** 2 for axis in axes)
        df[f"distance_{end}"] = squared ** (1/2)

    # Unit vectors neighbour -> atom, one column per axis.
    for end in ends:
        for axis in axes:
            df[f"vec_{end}_{axis}"] = (df[f'{axis}_{end}'] - df[f'{axis}_closest_{end}']) / df[f"distance_{end}"]

    # Unit vector atom 0 -> atom 1.
    for axis in axes:
        df[f"vec_{axis}"] = (df[f'{axis}_1'] - df[f'{axis}_0']) / df["dist"]

    def _dot(left, right):
        # Dot product of two of the unit-vector column triples.
        return (df[f"{left}_x"] * df[f"{right}_x"]
                + df[f"{left}_y"] * df[f"{right}_y"]
                + df[f"{left}_z"] * df[f"{right}_z"])

    df["cos_0_1"] = _dot("vec_0", "vec_1")
    df["cos_0"] = _dot("vec_0", "vec")
    df["cos_1"] = _dot("vec_1", "vec")

    helper_columns = [f"vec_{end}_{axis}" for end in ends for axis in axes]
    helper_columns += [f"vec_{axis}" for axis in axes]
    df = df.drop(helper_columns, axis=1)
    return df

In [20]:
# Start the wall-clock timer that the "Exe time" print in a later cell reads.
start_time = time.time()

# Rebind both frames to the output of distances() (defined earlier in the notebook).
train = distances(train)
test = distances(test)

In [21]:
print('Create closest features')

# Rebind both frames to the output of create_closest() (defined earlier in the notebook).
train = create_closest(train)
test = create_closest(test)

Create closest features


In [22]:
print('Create cos features')

# Adds distance_0, distance_1, cos_0_1, cos_0 and cos_1 to both frames.
train = add_cos_features(train)
test = add_cos_features(test)

Create cos features


In [None]:
# Display the column names train has after the feature steps above.
train.columns.values

Coupling features based on Coulomb or Yukawa potentials

In [51]:
# Coulomb-like feature: product of the two Mulliken charges over the pair distance.
# 'multiply_mulliken' is otherwise only created in a later cell, so build it here
# when it is missing; that keeps a top-to-bottom run from failing on this cell.
for frame in (train, test):
    if 'multiply_mulliken' not in frame.columns:
        frame['multiply_mulliken'] = frame['mulliken_atom_0'] * frame['mulliken_atom_1']
    frame['Coulomb'] = frame['multiply_mulliken'] / frame['dist']
del frame

In [54]:
# Yukawa-like feature: the Coulomb term damped by exp(-dist), for both frames.
for frame in (train, test):
    damping = np.exp(-frame['dist'])
    frame['Yukawa'] = frame['Coulomb'] * damping
del frame, damping

Inverse-distance features. These features improve the results.

In [87]:
# Inverse cube of the pair distance, for both frames.
for frame in (train, test):
    pair_dist = frame['dist']
    frame['inverse_dist_3'] = 1 / (pair_dist * pair_dist * pair_dist)
del frame, pair_dist

In [88]:
# Inverse square of (pair distance minus rad_0 and rad_1), for both frames.
for frame in (train, test):
    gap = frame['dist'] - frame['rad_0'] - frame['rad_1']
    frame['inverse_rad_dist_diff'] = 1/ (gap ** 2)
del frame, gap

In [89]:
# Inverse of: pair distance times the squared mean of EN_0 and EN_1, for both frames.
for frame in (train, test):
    mean_en = frame['EN_0'] * 0.5 + frame['EN_1'] * 0.5
    frame['inverse_dist_EN'] = 1/ (frame['dist'] * mean_en ** 2)
del frame, mean_en

In [90]:
# Per (molecule, atom) reciprocal of the summed inverse_dist_3, for each end of the
# pair, then the two combined as a*b / (a + b).
for frame in (train, test):
    for idx in (0, 1):
        grouped = frame.groupby(['molecule_name', f'atom_index_{idx}'])['inverse_dist_3']
        frame[f'mole_idx_{idx}_inverse_dist_sum'] = 1 / (grouped.transform('sum'))
    side_0 = frame['mole_idx_0_inverse_dist_sum']
    side_1 = frame['mole_idx_1_inverse_dist_sum']
    frame['combine_inverse_dist'] = side_0 * side_1 / (side_0 + side_1)
del frame, idx, grouped, side_0, side_1

In [93]:
# Per (molecule, atom) reciprocal of the summed inverse_rad_dist_diff, for each end
# of the pair, then the two combined as a*b / (a + b).
for frame in (train, test):
    for idx in (0, 1):
        grouped = frame.groupby(['molecule_name', f'atom_index_{idx}'])['inverse_rad_dist_diff']
        frame[f'mole_idx_{idx}_inverse_rad_dist_diff_sum'] = 1 / (grouped.transform('sum'))
    side_0 = frame['mole_idx_0_inverse_rad_dist_diff_sum']
    side_1 = frame['mole_idx_1_inverse_rad_dist_diff_sum']
    frame['combine_inverse_rad_dist_diff'] = side_0 * side_1 / (side_0 + side_1)
del frame, idx, grouped, side_0, side_1

In [94]:
# Per (molecule, atom) reciprocal of the summed inverse_dist_EN, for each end of the
# pair, then the two combined as a*b / (a + b).
for frame in (train, test):
    for idx in (0, 1):
        grouped = frame.groupby(['molecule_name', f'atom_index_{idx}'])['inverse_dist_EN']
        frame[f'mole_idx_{idx}_inverse_dist_EN_sum'] = 1 / (grouped.transform('sum'))
    side_0 = frame['mole_idx_0_inverse_dist_EN_sum']
    side_1 = frame['mole_idx_1_inverse_dist_EN_sum']
    frame['combine_inverse_dist_EN'] = side_0 * side_1 / (side_0 + side_1)
del frame, idx, grouped, side_0, side_1

In [None]:
# Downcast numeric columns of both frames to cut memory use.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

print('Train dataset shape is -> rows: {} cols:{}'.format(train.shape[0],train.shape[1]))
# Use '.2f' (two decimals, fixed point). A bare '.2' means two significant digits
# and switches to exponent notation (e.g. '1.2e+01') once the run exceeds 10 minutes.
print(f'Exe time: {(time.time() - start_time)/60:.2f} min')

In [100]:
# Display the column names of train after the extra features were added.
train.columns.values

(4658147, 123)

In [28]:
# NOTE(review): reduce_mem_usage is also applied to both frames in an earlier cell;
# confirm this second pass is still needed.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 870.70 Mb (75.5% reduction)
Mem. usage decreased to 463.56 Mb (75.8% reduction)


In [98]:
# Restore the plain 'type' column name in both frames (a no-op when 'type_x' is absent).
for frame in (train, test):
    frame.rename(columns = {'type_x': 'type'}, inplace = True)
del frame

In [None]:
# Product of the Mulliken charges of the two atoms in each pair, for both frames.
for frame in (train, test):
    charge_0 = frame['mulliken_atom_0']
    charge_1 = frame['mulliken_atom_1']
    frame['multiply_mulliken'] = charge_0 * charge_1
del frame, charge_0, charge_1

In [102]:
# Save the engineered frames to disk.
# NOTE(review): no index=False here, so the row index is written as an extra column;
# the paths are absolute and machine-specific.
train.to_csv('E:/kaggle/Molecular_properties/Data_use/train_QM9_bonds_neighbor_ready.csv')
test.to_csv('E:/kaggle/Molecular_properties/Data_use/test_QM9_bonds_neighbor_ready.csv')

In [29]:
# Load the saved target frame from y_tr.csv (absolute, machine-specific path).
df_target = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/y_tr.csv')

In [None]:
# Wrap y_tr in a one-column DataFrame whose column is named 'y_tr'.
# NOTE(review): this rebinds df_target, which the previous cell loads from disk.
dict_target = {'y_tr': y_tr}
df_target = pd.DataFrame(dict_target)

In [None]:
# Replace every missing value with 0, in place, in both frames.
for frame in (train, test):
    frame.fillna(0, inplace = True)
del frame

In [108]:
# Columns to drop from the model input.
# NOTE(review): the closest-atom rename above produces 'atom_index_closest_0' /
# 'atom_index_closest_1'; confirm the '..._0_closest' / '..._1_closest' spellings
# listed here exist in the frame this list is applied to.
cols_toDrop = ['molecule_name', 
               'atom_index_0', 
               'atom_index_1',
               'atom_index_0_closest',
               'atom_index_1_closest',
               'atom_0',
              ]

In [None]:
# Peek at the first values of the coupling-type column.
train['type'].head()

In [104]:
# List the object-dtype columns of train.
train.select_dtypes(include = 'object').columns.values

array(['molecule_name', 'atom_0_closest', 'atom_1_closest', 'atom_0',
       'atom_1', 'bond_0_0closest', 'bond_1_1closest', 'bond_0_1closest',
       'bond_1_0closest', 'bond_1closest_0closest'], dtype=object)

Label encoding categorical features.

In [122]:
#molecules = train.pop('molecule_name')
#train.drop(['atom_index_0', 'atom_index_1', 'atom_index_0_closest', 'atom_index_1_closest', 'atom_0'], axis = 1, inplace = True)
#test.drop(cols_toDrop, axis=1, inplace = True)

#y_tr = train.pop('scalar_coupling_constant')

# Columns encoded below ('type' is deliberately left out of the list).
categorical_features = [
    #'type',
    'atom_1',
    'atom_0_closest',
    'atom_1_closest',
    'bond_0_0closest',
    'bond_1_1closest',
    'bond_0_1closest',
    'bond_1_0closest',
    'bond_1closest_0closest',
]

# Encode each column with one encoder fitted on the train and test values together,
# so both frames share the same integer codes.
for f in categorical_features:
    lbl = LabelEncoder().fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

# Keep a handle on the coupling-type column.
types = train['type']

In [None]:
# Alternative preparation path that works on X_test / test_test.
# NOTE(review): X_test and test_test are not defined in the visible part of the
# notebook; confirm where they come from before running this cell.
molecules = train.pop('molecule_name')
X_test.drop(['atom_index_0', 'atom_index_1'], axis = 1, inplace = True)
test_test.drop(cols_toDrop, axis=1, inplace = True)

y_tr = train.pop('scalar_coupling_constant')
categorical_features = ['type', 'atom_1']

#Label Encoding
for f in categorical_features:
    lbl = LabelEncoder()
    # Fit on the values of BOTH frames. Fitting on X_test twice leaves any label
    # that only occurs in test_test unknown, and transform() then raises on it.
    lbl.fit(list(X_test[f].values) + list(test_test[f].values))
    X_test[f] = lbl.transform(list(X_test[f].values))
    test_test[f] = lbl.transform(list(test_test[f].values))
    
types = train['type']

In [None]:
#y_tr = train_temp.pop('scalar_coupling_constant')
# Keep the coupling type next to the molecule name of each row, so the molecule
# names can later be selected per type.
df_molecules = pd.DataFrame(train['type'].copy())
df_molecules['molecule_name'] = molecules

In [None]:
# Save the type / molecule_name table.
# NOTE(review): the next cell reads 'Data_use/molecules.csv', which is a different
# path from the one written here; confirm which location is intended.
df_molecules.to_csv('E:/kaggle/Molecular_properties/molecules.csv', index=False)

In [None]:
# Reload the type / molecule_name table from disk (absolute, machine-specific path).
df_molecules = pd.read_csv('E:/kaggle/Molecular_properties/Data_use/molecules.csv')

# Iteratively find the best features by type

There are duplicated columns in the training set. We want to identify them so they can be skipped, which reduces the feature-filtering time.

In [177]:
def getDuplicateColumns(df):
    """Find columns whose contents repeat an earlier column.

    Columns are compared pairwise with ``Series.equals``. For every group of
    identical columns the left-most one is reported as the "good" column and
    all later ones as duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared.

    Returns
    -------
    (list, list)
        ``(duplicate_names, good_names)``, both in column order. A name never
        appears in both lists.
    """
    column_names = df.columns.values
    n_cols = df.shape[1]
    duplicate_positions = set()
    good_positions = set()

    for x in range(n_cols):
        # A column already known to copy an earlier one needs no further checks;
        # comparing it again would also list it as a "good" column.
        if x in duplicate_positions:
            continue
        col = df.iloc[:, x]
        for y in range(x + 1, n_cols):
            if y in duplicate_positions:
                continue
            if col.equals(df.iloc[:, y]):
                good_positions.add(x)
                duplicate_positions.add(y)

    # dict.fromkeys keeps column order and collapses repeated column names.
    duplicate_names = list(dict.fromkeys(column_names[i] for i in sorted(duplicate_positions)))
    good_names = list(dict.fromkeys(column_names[i] for i in sorted(good_positions)))
    return duplicate_names, good_names

Feature selection is run separately for each coupling type. We use a 3-fold validation scheme to keep the run time as low as possible.

In [305]:
# Compact per-row bookkeeping frames: row index, coupling type, a zero-filled slot
# for predictions, and (train only) the target.
X_short = pd.DataFrame({'ind': list(train.index), 'type': train['type'].values, 'oof': [0] * len(train), 'target': y_tr.values})
X_short_test = pd.DataFrame({'ind': list(test.index), 'type': test['type'].values, 'prediction': [0] * len(test)})
# Distinct coupling types, in order of first appearance in train.
types_iterate = train['type'].unique()
len(types_iterate)

8

In [306]:
# Hyper-parameters passed as `params` to train_model_regression in the
# feature-selection cells below.
params = {'num_leaves': 200,                           #initial 200                                     
          'min_child_samples': 79,                      #initial 79          
          'objective': 'huber',                                        
          #'max_depth': -1,                                    
          'colsample_bytree': 0.9,                    # 0.9
          'subsample': 0.8,                            #initial 0.8
          'learning_rate': 0.25,                        # 0.25        
          "metric": 'mae',                                       
          'reg_alpha': 0.1,                            #initial 0.1       
          'reg_lambda': 0.3                           #initial 0.3                              
         }

In [24]:
# 3-fold grouped cross-validation splitter, passed as `folds` to train_model_regression.
n_splits = 3
gkf = GroupKFold(n_splits=n_splits)

In [None]:
# Greedy forward feature selection for coupling type index 0, on a 25,000-row sample.
t = types_iterate[0]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=1000)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)


In [310]:
# good_cols = [cos_0, cos_0_1, distance_1, distance_0, mulliken_closest_1, n_bonds_closest_1, atom_closest_1, mulliken_closest_0
#]
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_0 = pd.DataFrame(dict_feature)

In [312]:
# Save the selected features and scores for type index 0 (absolute, machine-specific path).
df_features_0.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type0_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 1, on a 25,000-row sample.
t = types_iterate[1]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=1000)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [314]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_1 = pd.DataFrame(dict_feature)

In [316]:
# Save the selected features and scores for type index 1 (absolute, machine-specific path).
df_features_1.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type1_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 2, on a 25,000-row sample.
t = types_iterate[2]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [318]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_2 = pd.DataFrame(dict_feature)

In [320]:
# Save the selected features and scores for type index 2 (absolute, machine-specific path).
df_features_2.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type2_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 3, on a 25,000-row sample.
t = types_iterate[3]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [322]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_3 = pd.DataFrame(dict_feature)

In [324]:
# Save the selected features and scores for type index 3 (absolute, machine-specific path).
df_features_3.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type3_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 4, on a 25,000-row sample.
t = types_iterate[4]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [326]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_4 = pd.DataFrame(dict_feature)

In [328]:
# Save the selected features and scores for type index 4 (absolute, machine-specific path).
df_features_4.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type4_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 5, on a 25,000-row sample.
t = types_iterate[5]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:1000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [330]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_5 = pd.DataFrame(dict_feature)

In [332]:
# Save the selected features and scores for type index 5 (absolute, machine-specific path).
df_features_5.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type5_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 6, on a 25,000-row sample.
t = types_iterate[6]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:2000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [334]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_6 = pd.DataFrame(dict_feature)

In [336]:
# Save the selected features and scores for type index 6 (absolute, machine-specific path).
df_features_6.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type6_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [None]:
# Greedy forward feature selection for coupling type index 7, on a 25,000-row sample.
t = types_iterate[7]
X_t = train.loc[train['type'] == t][:25000]
X_test_t = test.loc[test['type'] == t][:1000]
y_t = X_short.loc[X_short['type'] == t, 'target'][:25000]
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name'][:25000]

features = [col for col in train.columns.values if col not in badcol]
col_tokeep = ['type', 'dist', 'multiply_mulliken', 'rad_dist_diff']
scores = [-1., -1., -1., -1.]   # placeholder scores for the four seed features
duplicate_features, _ = getDuplicateColumns(X_t[features])

# The constant-column and duplicate filters do not change between rounds: apply them once.
candidates = [col for col in features
              if col not in duplicate_features and len(X_t[col].unique()) != 1]

for iteration in range(28):
    print(f'Round {iteration}:')
    score_best = np.inf     # no assumption about the scale of the MAE
    feature_best = None     # stays None when no candidate was scored this round
    remaining = [col for col in candidates if col not in col_tokeep]
    for number, feature in enumerate(remaining):
        print(f'{number} feature {feature} started at {time.ctime()}')

        features_totest = col_tokeep + [feature]
        X_t_t = X_t[features_totest]
        X_test_t_t = X_test_t[features_totest]

        result_dict_lgb = train_model_regression(X=X_t_t, X_test=X_test_t_t, y=y_t, params=params, molecules = molecules_t,
                                                          folds=gkf, model_type='lgb', eval_metric='mae', plot_feature_importance=False,
                                                          verbose=500, early_stopping_rounds=500, n_estimators=500)
        score_mean = np.mean(result_dict_lgb['scores'])
        if score_mean < score_best:
            score_best = score_mean
            feature_best = feature

    if feature_best is None:
        # No candidate left (or none produced a comparable score): stop here rather
        # than appending features[0] with a made-up score.
        break
    col_tokeep.append(feature_best)
    scores.append(score_best)

In [338]:
# Pair every kept feature with the score recorded when it was added
# (the four seed features carry the placeholder -1).
dict_feature = {'feature': col_tokeep, 'score': scores}
df_features_7 = pd.DataFrame(dict_feature)

In [340]:
# Save the selected features and scores for type index 7 (absolute, machine-specific path).
df_features_7.to_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type7_bonds_QM9_inver_neighbor_huber.csv', index=False)

In [39]:
# Column names left out of cols_test: identifiers, raw coordinates, a few
# per-atom descriptors and the stacked oof_* columns.
excluded_from_cols_test = [
    'molecule_name', 'atom_index_0', 'atom_index_1', 'atom_0',
    'atom_1', 'type_0', 'atom_x', 'atom_y',
    'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1',
    'x_closest_1', 'y_closest_1', 'z_closest_1',
    'x_closest_0', 'y_closest_0', 'z_closest_0',
    'mulliken_mean',
    #'Cv', 'H', 'U', 'U0', 'zpve', 'r2', 'alpha',
    'dist_x', 'dist_y', 'dist_z',
    'EN_0', 'rad_0',
    'oof_fc',
    'oof_sd',
    'oof_pso',
    'oof_dso',
]
cols_test = [col for col in train.columns.values if col not in excluded_from_cols_test]

In [251]:
# Concatenate the selected feature names of all eight coupling types, type 0 first.
features_all = [
    feature
    for df_features_type in (df_features_0, df_features_1, df_features_2, df_features_3,
                             df_features_4, df_features_5, df_features_6, df_features_7)
    for feature in df_features_type['feature'].values
]

In [258]:
# Union of the features selected for any coupling type (duplicates removed).
features_tokeep = set(features_all)

In [263]:
# Restrict both frames to the selected features. A set is not a valid column
# indexer in current pandas and has no defined order, so index with a sorted list:
# both frames then get the same, reproducible column order.
train_tokeep = train[sorted(features_tokeep)]
test_tokeep = test[sorted(features_tokeep)]

In [47]:
# Save the full train/test frames to disk.
# NOTE(review): no index=False here, so the row index is written as an extra column;
# the paths are absolute and machine-specific.
train.to_csv('E:/kaggle/Molecular_properties/Data_use/train_bonds_QM9_neighbor_merge.csv')
test.to_csv('E:/kaggle/Molecular_properties/Data_use/test_bonds_QM9_neighbor_merge.csv')

# Stacking Features by types

In [341]:
# Bookkeeping frames for stacking: one zero-filled out-of-fold / prediction slot and
# (train only) one target column per component (fc, sd, pso, dso).
# NOTE(review): this rebinds X_short / X_short_test from the feature-selection section.
X_short = pd.DataFrame({'ind': list(train.index), 'type': train['type'].values, 
                        'oof_fc': [0] * len(train), 'oof_sd': [0] * len(train), 'oof_pso': [0] * len(train), 'oof_dso': [0] * len(train),
                        'target_fc': y_fc.values, 'target_sd': y_sd.values, 'target_pso': y_pso.values, 'target_dso': y_dso.values})
X_short_test = pd.DataFrame({'ind': list(test.index), 'type': test['type'].values, 
                             'prediction_fc': [0] * len(test), 'prediction_sd': [0] * len(test), 'prediction_pso': [0] * len(test), 'prediction_dso': [0] * len(test) })
# Distinct coupling types, in order of first appearance in train.
types_iterate = train['type'].unique()
len(types_iterate)

8

In [347]:
# Hyper-parameters read by train_secondary (through the global `params`).
# NOTE(review): this version spells the leaf-size key 'min_data_in_leaf', the
# earlier params cell uses 'min_child_samples'; confirm the difference is intended.
params = {'num_leaves': 200,                           #initial 200                                     
          'min_data_in_leaf': 79,                      #initial 79          
          'objective': 'huber',                                        
          #'max_depth': -1,                                    
          'colsample_bytree': 0.9,                    # 0.9
          'subsample': 0.8,                            #initial 0.8
          'learning_rate': 0.25,                        # 0.25        
          "metric": 'mae',                                       
          'reg_alpha': 0.1,                            #initial 0.1       
          'reg_lambda': 0.3                           #initial 0.3                              
         }

In [348]:
def train_secondary(X_short_secondary, X_short_test_secondary, types_iterate):
    """Train one model per type for each secondary target and store its predictions.

    For every target suffix in ``features`` (currently only ``'fc'``) and every
    value ``t`` of ``types_iterate``, the feature list is read from the
    ``good_ft_type{i}_...`` CSV, where ``i`` is the position of ``t`` in
    ``types_iterate``. A model is then fitted with ``train_model_regression``
    on the rows of that type, and the ``'oof'`` / ``'prediction'`` entries of
    its result are written into the ``oof_<target>`` / ``prediction_<target>``
    columns of the two frames.

    Relies on the notebook globals ``train``, ``test``, ``df_molecules``,
    ``params``, ``gkf`` and ``train_model_regression``.

    Parameters
    ----------
    X_short_secondary : pandas.DataFrame
        Frame with ``type``, ``target_<target>`` and ``oof_<target>`` columns;
        modified in place.
    X_short_test_secondary : pandas.DataFrame
        Frame with ``type`` and ``prediction_<target>`` columns; modified in place.
    types_iterate : sequence
        Type values to train on; the order determines which feature file is used.

    Returns
    -------
    None
    """
    features = ['fc',
                #'sd',
                #'pso',
                #'dso'
               ]
    for feature in features:

        print('Training of feature ' + feature)
        label = 'target_' + feature
        train_secondary_feature = 'oof_' + feature
        test_secondary_feature = 'prediction_' + feature

        # The feature-file number is the position of the type in types_iterate.
        for i, t in enumerate(types_iterate):
            print(f'Training of type {t}')
            df_features = pd.read_csv(f'E:/kaggle/Molecular_properties/good_features/good_ft_type{i}_bonds_QM9_inver_neighbor_huber.csv')
            good_features = list(df_features['feature'].values)

            # Row masks on the frames passed in. The target mask must come from
            # X_short_secondary itself, not from the global X_short.
            secondary_rows = X_short_secondary['type'] == t
            secondary_test_rows = X_short_test_secondary['type'] == t

            X_t = train.loc[train['type'] == t, good_features]
            X_test_t = test.loc[test['type'] == t, good_features]
            y_t = X_short_secondary.loc[secondary_rows, label]
            molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']
            result_dict_lgb_oof = train_model_regression(X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
                                                         folds=gkf, model_type='lgb', eval_metric='group_mae', plot_feature_importance=False,
                                                         verbose=1000, early_stopping_rounds=500, n_estimators=10000)

            X_short_secondary.loc[secondary_rows, train_secondary_feature] = result_dict_lgb_oof['oof']
            X_short_test_secondary.loc[secondary_test_rows, test_secondary_feature] = result_dict_lgb_oof['prediction']

In [None]:
# Fills the oof_* / prediction_* columns of X_short / X_short_test in place.
train_secondary(X_short, X_short_test, types_iterate)

In [None]:
# Inspect the last rows of the test-side prediction frame.
X_short_test.tail()

In [350]:
# Attach the stacked 'fc' predictions as a feature column.
# X_short / X_short_test were built row-for-row from train / test but carry
# their own 0..n-1 index, so copy by position (.values) instead of relying on
# index alignment, which would misplace values if train / test are indexed
# differently.
train['oof_fc'] = X_short['oof_fc'].values
test['oof_fc'] = X_short_test['prediction_fc'].values

In [None]:
# Attach the stacked 'sd' predictions as a feature column.
# X_short / X_short_test share row order with train / test but have their own
# 0..n-1 index, so copy by position (.values) rather than by index label.
train['oof_sd'] = X_short['oof_sd'].values
test['oof_sd'] = X_short_test['prediction_sd'].values

In [None]:
# Attach the stacked 'pso' predictions as a feature column.
# X_short / X_short_test share row order with train / test but have their own
# 0..n-1 index, so copy by position (.values) rather than by index label.
train['oof_pso'] = X_short['oof_pso'].values
test['oof_pso'] = X_short_test['prediction_pso'].values

In [None]:
# Attach the stacked 'dso' predictions as a feature column.
# X_short / X_short_test share row order with train / test but have their own
# 0..n-1 index, so copy by position (.values) rather than by index label.
train['oof_dso'] = X_short['oof_dso'].values
test['oof_dso'] = X_short_test['prediction_dso'].values

# Train by type

In [351]:
# Scaffolding frames for the per-type models: one row per train / test row
# (same order, fresh 0..n-1 index), with the original index kept in 'ind'.
# 'oof' and 'prediction' start as float 0.0 because they are later filled with
# float model outputs; integer placeholders would need an implicit upcast on
# assignment, which recent pandas versions deprecate.
X_short = pd.DataFrame({'ind': list(train.index), 'type': train['type'].values, 'oof': 0.0, 'target': y_tr.values})
X_short_test = pd.DataFrame({'ind': list(test.index), 'type': test['type'].values, 'prediction': 0.0})
# Distinct values of the 'type' column, in order of first appearance in train.
types_iterate = train['type'].unique()
len(types_iterate)

8

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: 0.6484
#   10000 step, secondary feature (all): 0.1605
#   10000 step, secondary by type, after feature selection: 0.2411
#   10000 step, secondary from all, after feature selection: 0.2594
#   'mae', w/o fc: 0.1643
#   'huber', w/o fc: -0.3177

t = types_iterate[0]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV.
# 'oof_fc' is not added here (good_features.append('oof_fc') is left disabled).
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type0_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [117]:
# Keep the feature-importance table of the most recent train_model_regression run.
# NOTE(review): the name suggests the type-0 run — confirm cell execution order.
feature_importance_0 = result_dict_lgb['feature_importance']

In [118]:
# Mean importance per feature, sorted with the largest mean first.
mean_importance = feature_importance_0[["feature", "importance"]].groupby("feature").mean()
df = mean_importance.sort_values(by="importance", ascending=False)

In [None]:
# The 15 features with the lowest mean importance (last rows of the descending sort).
mean_importance_sorted = (
    feature_importance_0[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
cols_toDrop = mean_importance_sorted.iloc[-15:].index.tolist()
cols_toDrop

In [120]:
# Save the per-feature mean importance table (index = feature name) as CSV.
# NOTE(review): absolute, machine-specific path.
df.to_csv('E:/kaggle/Molecular_properties/feature_importance_type0_bonds_QM9_regression.csv')

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -0.9384   -1.3241
#   Secondary feature by type, after feature selection: -1.2661
#   10000 step, secondary from all: -1.218
#   'huber', w 'huber' fc: -1.7444

t = types_iterate[1]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type1_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: 0.0191   -0.7083
#   Secondary feature by type, after feature selection: -0.7227
#   10000 step, secondary from all: -0.7321
#   'huber', w 'huber' fc: -1.0626

t = types_iterate[2]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type2_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
# NOTE(review): the following cell rebinds params before any model is trained,
# so this definition has no effect when the notebook is run top to bottom.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [358]:
# Parameter set using the 'huber' objective and the 'mae' metric.
# max_depth is deliberately left out of the dict (it was commented out).
params = dict(
    num_leaves=200,           # initial 200
    min_child_samples=79,     # initial 79
    objective='huber',
    colsample_bytree=0.9,     # 0.9
    subsample=0.8,            # initial 0.8
    eta=0.15,                 # 0.25
    metric='mae',
    reg_alpha=0.1,            # initial 0.1
    reg_lambda=0.3,           # initial 0.3
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -0.7305   -1.1433
#   Secondary feature by type, after feature selection: -1.3263
#   10000 step, secondary from all: -1.2675
#   'huber', w 'huber' fc: -1.8810

t = types_iterate[3]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type3_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -0.2570   -0.5576
#   Secondary feature by type, after feature selection: -0.6112
#   10000 step, secondary from all: -0.5765
#   'huber', w 'huber' fc: -1.2730

t = types_iterate[4]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type4_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -0.6655  -1.0659
#   Secondary feature by type, after feature selection: -1.0359
#   10000 step, secondary from all: -1.0109
#   'huber', w/o fc: -1.8651

t = types_iterate[5]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV.
# 'oof_fc' is not added here (good_features.append('oof_fc') is left disabled).
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type5_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -0.2060  -0.4448
#   Secondary feature by type, after feature selection: -0.4463
#   'huber', w 'huber' fc: -1.2960

t = types_iterate[6]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type6_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [None]:
# Parameter set using the 'regression' objective and the 'mae' metric.
params = dict(
    num_leaves=125,
    min_data_in_leaf=79,
    objective='regression',   # best: 'regression'
    max_depth=-1,             # best: -1
    learning_rate=0.2,
    boosting="gbdt",
    bagging_freq=1,           # best 1
    bagging_fraction=0.9,     # best 0.9
    feature_fraction=1,       # best 1
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,            # best 0.1
    reg_lambda=0.05,          # best 0.05
)

In [None]:
# Recorded scores for this type:
#   Grouped CV score: -1.2157   -1.4679
#   Secondary feature by type, after feature selection: -1.5517
#   'huber', w 'huber' fc: -2.28

t = types_iterate[7]
print(f'Training of type {t}')

# Feature names for this type come from the 'feature' column of the CSV,
# plus the stacked 'oof_fc' column.
df_features = pd.read_csv('E:/kaggle/Molecular_properties/good_features/good_ft_type7_bonds_QM9_inver_neighbor_huber.csv')
good_features = df_features['feature'].tolist()
good_features.append('oof_fc')

# Rows of the current type, restricted to the selected feature columns.
X_t = train.loc[train['type'] == t, good_features]
X_test_t = test.loc[test['type'] == t, good_features]
y_t = X_short.loc[X_short['type'] == t, 'target']
molecules_t = df_molecules.loc[df_molecules['type'] == t, 'molecule_name']

result_dict_lgb = train_model_regression(
    X=X_t, X_test=X_test_t, y=y_t, params=params, molecules=molecules_t,
    folds=gkf, model_type='lgb', eval_metric='group_mae',
    plot_feature_importance=True, verbose=1000,
    early_stopping_rounds=500, n_estimators=20000,
)

# Write this type's 'oof' / 'prediction' results into the shared frames.
X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb['oof']
X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb['prediction']

In [368]:
# Load the sample submission keyed by 'id', then derive a separate frame in
# which 'id' is an ordinary column again (sample_submission is left untouched).
sample_submission = pd.read_csv(
    'E:/kaggle/Molecular_properties/champs-scalar-coupling/sample_submission.csv',
    index_col='id',
)
benchmark = sample_submission.reset_index()

In [369]:
# Fill the submission column with the per-type test predictions and write the file.
# The assignment aligns on the row index of both frames.
# NOTE(review): assumes X_short_test rows are in the same order as the sample submission — confirm.
benchmark['scalar_coupling_constant'] = X_short_test['prediction']
benchmark.to_csv('train_type_feature_type_bonds_QM9_fc.csv', index=False)
benchmark.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,15.275014760451096
1,4658148,196.27048813871608
2,4658149,12.060166492061615
3,4658150,192.9462032727565
4,4658151,13.630952233646903


In [None]:
# Preview the first rows of the sample submission.
sample_submission.head()

# Integer code for each type label, numbered 0..7 in the listed order.
type_dict = {
    type_label: type_code
    for type_code, type_label in enumerate(
        ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
    )
}

In [37]:
# Persist the current train frame (index included) as CSV.
# NOTE(review): absolute, machine-specific path.
train.to_csv('E:/kaggle/Molecular_properties/Data_use/train_20190730_w_SCC.csv')