In [592]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from sklearn.metrics import log_loss, accuracy_score, f1_score, plot_confusion_matrix, confusion_matrix

from imblearn.over_sampling import SMOTE

seed = 1

le = LabelEncoder()

In [593]:
path = 'C:\\Users\\sunil\\Projects\\Machine Hack\\Merchandise Popularity Prediction\\Dataset'

train = pd.read_csv(path + '\\Train.csv')
test = pd.read_csv(path + '\\Test.csv')
sample_sub = pd.read_csv(path + '\\sample_submission.csv')

In [594]:
target = 'popularity'
features = [col for col in train.columns if col not in [target]]

In [595]:
train[target].replace({0:0, 1:1, 3:2, 4:3, 5:4}, inplace = True)

In [596]:
#removing lowest in Score_2

train.drop(15372, axis = 0, inplace = True)

In [597]:
test.loc[ test['Score_2'] == 0, 'Score_2'] = 0.1

In [598]:
df = pd.concat( [train, test], axis = 0).reset_index(drop=True)

In [599]:
df['Score_2'] = np.log(df['Score_2'])

In [600]:
df['time'] = np.log(df['time'])

In [601]:
df.loc[df['Store_Score']<0, 'Store_Score'] = df[df['Store_Score']<0]['Store_Score']*-1
df['Store_Score'] = np.log(df['Store_Score'])

In [602]:
train, test = df[:train.shape[0]].copy(), df[train.shape[0]:].copy()

In [603]:
test = test.reset_index(drop=True)

---
# Predicting For Duplicate Values

In [561]:
test.drop('popularity', axis = 1, inplace=True)

In [562]:
test[target] = None

In [563]:
def Fill_Duplicates(row):
    tmp = train[ (train['Store_Ratio'] == row['Store_Ratio']) &
                 (train['Basket_Ratio'] == row['Basket_Ratio']) &
                 (train['Category_1'] == row['Category_1']) &
                 (train['Store_Score'] == row['Store_Score']) &
                 (train['Category_2'] == row['Category_2']) &
                 (train['Store_Presence'] == row['Store_Presence']) &
                 (train['Score_1'] == row['Score_1']) & 
                 (train['Score_2'] == row['Score_2']) &
                 (train['Score_3'] == row['Score_3']) &
                 (train['Score_4'] == row['Score_4'])]
    if tmp.shape[0] == 0:
        return None
    return tmp[target].mode()[0]

In [564]:
test[target] = test.apply(lambda x: Fill_Duplicates(x), axis = 1)

KeyboardInterrupt: 

In [None]:
filled_test_index = test[~test[target].isna()].index
filled_test_values = test[~test[target].isna()][target].to_list()

---
# Data Preprocessing

In [532]:
trn, val = train_test_split(train, test_size = 0.2, random_state = 1, stratify = train[target])

#### Input for model
X_trn, X_val = trn[features], val[features]

#### Target column
y_trn, y_val = trn[target], val[target]

#### Features for test data that we will be predicting
X_test = test[features]

In [533]:
ext = ExtraTreesClassifier(random_state = 1)
_ = ext.fit(X_trn, y_trn)

preds_val = ext.predict_proba(X_val)

log_loss(y_val, preds_val)

0.4800260762354447

In [450]:
ext = ExtraTreesClassifier(random_state = 1, n_estimators = 2000, max_depth = 35)
_ = ext.fit(X_trn, y_trn)

preds_val = ext.predict_proba(X_val)

log_loss(y_val, preds_val)

0.3383218697845813

---
# Final Step

In [80]:
preds = ext.predict_proba(test[features])

In [81]:
# Filling Probabilities for Manually Predicted Indexes
def Manual_Probability_Filling():
    for i, (index,target_value) in enumerate(zip(preds[filled_test_index], filled_test_values)):
        tmp_ls = []
        tmp_ls = [v for v in range(0,5) if v not in [target_value]]
        preds[filled_test_index[i]][int(target_value)] = 1
        for v in tmp_ls:
            preds[filled_test_index[i]][int(v)] = 0
    return 'Done'

In [82]:
test[target] = test.apply(lambda x: Fill_Duplicates(x), axis = 1)

filled_test_index = test[~test[target].isna()].index
filled_test_values = test[~test[target].isna()][target].to_list()

Manual_Probability_Filling()

'Done'

In [83]:
sample = pd.DataFrame(preds)
sample.to_csv(path+'\\ext_outlier.csv', index=False)

# Feature Engineering

In [604]:
def join(train, test):
    df = pd.concat([train, test], axis = 0).reset_index(drop=True)
    return df

def split(df):
    train_new, test_new = df[:train.shape[0]], df[train.shape[0]:]
    feats = [col for col in train_new.columns if col not in [target]]
    return train_new, test_new, feats

In [605]:
df = join(train,test)

In [606]:
df['mean_presence_per_store'] = df.groupby('Store_Ratio')['Store_Presence'].transform('mean')
df['min_presence_per_store'] = df.groupby('Store_Ratio')['Store_Presence'].transform('min')
df['max_presence_per_store'] = df.groupby('Store_Ratio')['Store_Presence'].transform('max')

In [607]:
train, test, features = split(df)

In [608]:
trn, val = train_test_split(train, test_size = 0.2, random_state = 1, stratify = train[target])

#### Input for model
X_trn, X_val = trn[features], val[features]

#### Target column
y_trn, y_val = trn[target], val[target]

#### Features for test data that we will be predicting
X_test = test[features]

In [609]:
ext = ExtraTreesClassifier(random_state = 1)
_ = ext.fit(X_trn, y_trn)

preds_val = ext.predict_proba(X_val)

log_loss(y_val, preds_val)

0.4132708015231751

In [539]:
df = join(train,test)

In [546]:
df['mean_presence_per_store_score'] = df.groupby('Store_Score')['Store_Presence'].transform('mean')
df['min_presence_per_store_score'] = df.groupby('Store_Score')['Store_Presence'].transform('min')
df['max_presence_per_store_score'] = df.groupby('Store_Score')['Store_Presence'].transform('max')

In [547]:
train, test, features = split(df)

trn, val = train_test_split(train, test_size = 0.2, random_state = 1, stratify = train[target])

#### Input for model
X_trn, X_val = trn[features], val[features]

#### Target column
y_trn, y_val = trn[target], val[target]

#### Features for test data that we will be predicting
X_test = test[features]

In [548]:
ext = ExtraTreesClassifier(random_state = 1)
_ = ext.fit(X_trn, y_trn)

preds_val = ext.predict_proba(X_val)

log_loss(y_val, preds_val)

0.44049723392382617

In [266]:
df

Unnamed: 0,Store_Ratio,Basket_Ratio,Category_1,Store_Score,Category_2,Store_Presence,Score_1,Score_2,Score_3,Score_4,time,popularity,mean_presence_per_store,min_presence_per_store,max_presence_per_store,Store_mean_encoding
0,0.407,0.00380,2,3.579762,1,0.99200,0.944000,-2.314658,0.1100,113.911,12.150163,3.0,0.732370,0.000074,0.995,1.573718
1,0.234,0.10500,0,2.989915,1,0.94400,0.900000,-2.047943,0.0382,76.332,12.136256,3.0,0.743490,0.000003,0.995,1.314413
2,0.668,0.72600,9,2.016502,1,0.03870,0.000000,-2.009915,0.4530,124.075,12.056081,3.0,0.378823,0.000075,0.986,0.886485
3,0.184,0.00561,4,3.536806,1,0.84900,0.931000,-2.198225,0.0641,79.037,12.122691,4.0,0.768343,0.000827,0.995,1.554834
4,0.231,0.13100,6,3.128601,1,0.93700,0.000000,-2.216407,0.0677,109.560,12.136224,2.0,0.759000,0.000002,0.981,1.375381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30342,0.171,0.11800,2,3.233173,1,0.86100,0.871000,-2.373008,0.0539,78.303,12.165454,,0.729218,0.000106,0.986,1.421353
30343,0.744,0.85900,11,1.152469,1,0.02560,0.000063,-2.429283,0.7870,91.977,12.357879,,0.332461,0.002150,0.941,0.506643
30344,0.572,0.85200,6,1.550112,0,0.00105,0.000000,-2.472188,0.3780,129.969,12.284935,,0.355290,0.000071,0.996,0.681453
30345,0.761,0.52500,11,1.931521,1,0.44000,0.000007,-2.384880,0.5310,80.870,12.383403,,0.380058,0.028100,0.922,0.849126


In [None]:
import optuna
def objective(trial):
  param = {'n_estimators': trial.suggest_int('n_estimators', 400, 1500),
           'max_depth': trial.suggest_int('max_depth', 4, 30),
           #'min_samples_split': trial.suggest_int('min_samples_split', 5, 200),
           #'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
           'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
           'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1),
           #'num_leaves' : trial.suggest_uniform('num_leaves', 5, 1000),
           'reg_alpha': trial.suggest_uniform('reg_alpha', 0.1, 200),
           'reg_lambda': trial.suggest_uniform('reg_lambda', 0.1, 200)
          }

  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42) 

  for fold, (tr_ind, val_ind) in enumerate(skf.split(train[features], train[target])):

    X_train, X_val     = train[features].loc[tr_ind], train[features].loc[val_ind]
    y_train, y_val     = train[target][tr_ind], train[target][val_ind]
    
    model              = XGBClassifier(**param,
                                              random_state=42, n_jobs=-1, eval_metric = 'mlogloss', use_label_encoder = False) 
    model.fit(X_train, y_train)
    val_pred           = model.predict_proba(X_val) 
    accuracy           = log_loss(y_val, val_pred)
    #print(f'\nlog loss score for validation set is {accuracy}')

  return accuracy
  
study = optuna.create_study(direction="minimize") 
study.optimize(objective, n_trials=250)

[32m[I 2021-02-07 15:37:56,224][0m A new study created in memory with name: no-name-0a4aad0f-4874-4fae-ad87-0e14cec99d82[0m
[32m[I 2021-02-07 15:38:40,450][0m Trial 0 finished with value: 0.47798236320959087 and parameters: {'n_estimators': 692, 'max_depth': 29, 'learning_rate': 0.8867298976002695, 'colsample_bytree': 0.9457676004613931, 'reg_alpha': 127.02018140473285, 'reg_lambda': 86.82718043513862}. Best is trial 0 with value: 0.47798236320959087.[0m
[32m[I 2021-02-07 15:39:43,550][0m Trial 1 finished with value: 0.46986859049663393 and parameters: {'n_estimators': 1432, 'max_depth': 24, 'learning_rate': 0.5701483271973637, 'colsample_bytree': 0.5058617990223292, 'reg_alpha': 68.601617373017, 'reg_lambda': 170.6828399494092}. Best is trial 1 with value: 0.46986859049663393.[0m
[32m[I 2021-02-07 15:40:31,766][0m Trial 2 finished with value: 0.4792628825494579 and parameters: {'n_estimators': 994, 'max_depth': 13, 'learning_rate': 0.46085627291137693, 'colsample_bytree': 0

[32m[I 2021-02-07 16:14:02,055][0m Trial 24 finished with value: 0.4504117705155572 and parameters: {'n_estimators': 414, 'max_depth': 27, 'learning_rate': 0.10949565391513089, 'colsample_bytree': 0.8025141346888667, 'reg_alpha': 28.034446822484128, 'reg_lambda': 43.149800580853636}. Best is trial 11 with value: 0.37665214462753027.[0m
[32m[I 2021-02-07 16:19:40,062][0m Trial 25 finished with value: 0.38163182338218754 and parameters: {'n_estimators': 661, 'max_depth': 23, 'learning_rate': 0.0897155944285809, 'colsample_bytree': 0.581012166072796, 'reg_alpha': 0.5089267417404582, 'reg_lambda': 83.59003228016928}. Best is trial 11 with value: 0.37665214462753027.[0m
[32m[I 2021-02-07 16:20:31,497][0m Trial 26 finished with value: 0.4632839312190146 and parameters: {'n_estimators': 716, 'max_depth': 24, 'learning_rate': 0.19510130132789089, 'colsample_bytree': 0.5790764829616019, 'reg_alpha': 46.508581836629546, 'reg_lambda': 140.74135650486807}. Best is trial 11 with value: 0.37

[32m[I 2021-02-07 19:25:00,728][0m Trial 49 finished with value: 0.4026052386053464 and parameters: {'n_estimators': 687, 'max_depth': 25, 'learning_rate': 0.2490690162036857, 'colsample_bytree': 0.9912956868710169, 'reg_alpha': 9.510323059695155, 'reg_lambda': 36.65718965628574}. Best is trial 11 with value: 0.37665214462753027.[0m
[32m[I 2021-02-07 19:25:49,690][0m Trial 50 finished with value: 0.48272684348383404 and parameters: {'n_estimators': 404, 'max_depth': 24, 'learning_rate': 0.30792528494057947, 'colsample_bytree': 0.8336076708798796, 'reg_alpha': 164.54899880682137, 'reg_lambda': 76.91599338106357}. Best is trial 11 with value: 0.37665214462753027.[0m
[32m[I 2021-02-07 20:24:05,699][0m Trial 51 finished with value: 0.39670424756551287 and parameters: {'n_estimators': 605, 'max_depth': 30, 'learning_rate': 0.36660563170238564, 'colsample_bytree': 0.698114668965117, 'reg_alpha': 7.518584552871392, 'reg_lambda': 29.829213215077058}. Best is trial 11 with value: 0.3766

[32m[I 2021-02-07 22:41:39,857][0m Trial 74 finished with value: 0.37518409185278234 and parameters: {'n_estimators': 482, 'max_depth': 22, 'learning_rate': 0.044751285509061475, 'colsample_bytree': 0.6100117137361897, 'reg_alpha': 0.10412908424624456, 'reg_lambda': 20.154156504159438}. Best is trial 74 with value: 0.37518409185278234.[0m
[32m[I 2021-02-07 22:43:28,395][0m Trial 75 finished with value: 0.38776719364218937 and parameters: {'n_estimators': 433, 'max_depth': 22, 'learning_rate': 0.16238167052043545, 'colsample_bytree': 0.6092113292704374, 'reg_alpha': 5.586817064417612, 'reg_lambda': 19.2277125755026}. Best is trial 74 with value: 0.37518409185278234.[0m
[32m[I 2021-02-07 22:47:05,053][0m Trial 76 finished with value: 0.4062017558842889 and parameters: {'n_estimators': 430, 'max_depth': 22, 'learning_rate': 0.048813406570809806, 'colsample_bytree': 0.6179869036340861, 'reg_alpha': 0.12902850545923894, 'reg_lambda': 180.0245660345456}. Best is trial 74 with value: 

[32m[I 2021-02-08 04:29:50,484][0m Trial 99 finished with value: 0.40635298626220884 and parameters: {'n_estimators': 402, 'max_depth': 22, 'learning_rate': 0.1506389782945895, 'colsample_bytree': 0.5460006542018415, 'reg_alpha': 9.958786722869824, 'reg_lambda': 30.958353416545897}. Best is trial 74 with value: 0.37518409185278234.[0m
[32m[I 2021-02-08 04:33:03,030][0m Trial 100 finished with value: 0.437265205326209 and parameters: {'n_estimators': 426, 'max_depth': 26, 'learning_rate': 0.053777149008927695, 'colsample_bytree': 0.9138224289605952, 'reg_alpha': 21.35420312732817, 'reg_lambda': 40.320585984745605}. Best is trial 74 with value: 0.37518409185278234.[0m
[32m[I 2021-02-08 04:38:07,241][0m Trial 101 finished with value: 0.3852343656875915 and parameters: {'n_estimators': 587, 'max_depth': 25, 'learning_rate': 0.10382209604983741, 'colsample_bytree': 0.5151009637778122, 'reg_alpha': 4.9012943873035475, 'reg_lambda': 17.379908316612376}. Best is trial 74 with value: 0.