In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
import gc
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, f1_score, classification_report, accuracy_score, confusion_matrix
import lightgbm as lgb
# enable garbage collector to aid in memory 
gc.enable()
# eliminate future warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the train/test feature tables and the raw orders table
# (orders is used further down to map users back to their order ids).
df_train, df_test, orders = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train.csv', 'df_test.csv', "orders.csv")
)

In [None]:
# Move user_id out of the columns and into the index of both feature tables.
df_train = df_train.set_index('user_id')
df_test = df_test.set_index('user_id')

In [None]:
# Target column and feature matrix that the rest of the notebook builds on.
y = df_train['reordered']
X = df_train.drop(columns='reordered')

In [None]:
# The training frame has already been split into X and y, so drop the original
# reference and trigger a collection to release its memory.
del df_train
gc.collect()

# Balancing the target class with SMOTE

In [None]:
# Oversample the minority class with SMOTE so both target classes are equally represented.
# NOTE(review): the resampling happens before the train/test split further down, so
# synthetic rows can end up in the evaluation split and inflate the reported scores.
# Consider splitting first and resampling the training part only.
smote = SMOTE(random_state = 13)
# `fit_resample` is the current name of this method; `fit_sample` was deprecated
# and later removed from imbalanced-learn.
smote_X, smote_y = smote.fit_resample(X, y)
smote_X = pd.DataFrame(smote_X, columns = X.columns)
smote_y = pd.DataFrame(smote_y, columns=['reordered'])

# Check the class counts after resampling (each count is computed once and reused).
n_rows = len(smote_X)
n_not_reordered = int((smote_y['reordered'] == 0).sum())
n_reordered = int((smote_y['reordered'] == 1).sum())
print("length is", n_rows)
print("Number of not reordered", n_not_reordered)
print("Number of reordered", n_reordered)
print("Proportion of not reordered ", n_not_reordered / n_rows)
print("Proportion of reordered", n_reordered / n_rows)
# class is balanced with equal proportions

# LGBM

In [None]:
# Hold out 20% of the balanced (SMOTE-resampled) data for evaluation.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    smote_X,
    smote_y,
    test_size=0.20,
    random_state=13,
)

In [None]:
# LightGBM trains from its own Dataset container. The id columns are declared
# categorical so they are not treated as ordinary numeric features.
categorical_cols = ['product_id', 'aisle_id', 'department_id']
d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols)
# reference=d_train ties the evaluation Dataset to the training Dataset.
d_test = lgb.Dataset(
    X_test,
    label=y_test,
    reference=d_train,
    categorical_feature=categorical_cols,
)

In [None]:
# The train/test splits and LightGBM datasets have been built from these frames;
# drop the references and trigger a collection to release their memory.
del X, y, smote_X, smote_y
gc.collect()

In [None]:
# Parameters for lgb.train. Every value is a plain scalar: the single-element
# lists used previously are a grid-search convention, not a lgb.train one.
# Tuned values: boosting_type, learning_rate, max_depth, num_leaves.
params = {'objective': 'binary',
          'metric': 'auc',
          'boosting_type': 'dart',
          'boost_from_average': False,
          'learning_rate': 0.005,
          'num_rounds': 200,  # LightGBM alias for the number of boosting iterations
          'max_depth': 10,
          'num_leaves': 93,
          'seed': 13}

In [None]:
# Fit the booster on the training Dataset with the parameters defined above.
lgb_model = lgb.train(params=params, train_set=d_train)

In [None]:
# Score the held-out split with the trained model; these raw outputs are
# converted to 0/1 labels by to_binary further down.
lgb_y_pred = lgb_model.predict(X_test)

In [None]:
# assess these predictions 

In [None]:
def tryProbThresholds(clf, X_test, y_test, startProb=0.1, endProb=1.0, incrementProb=0.1):
    """Scan probability cut-offs and report the one with the highest F1 score.

    (Original author: Stefan Fiot)

    Parameters
    ----------
    clf : fitted model
        Its ``predict(X_test)`` must return one score per row for class 1.
    X_test : features passed unchanged to ``clf.predict``.
    y_test : true binary labels for ``X_test``.
    startProb, endProb, incrementProb : float
        Thresholds tried are ``np.arange(startProb, endProb, incrementProb)``,
        so ``endProb`` itself is excluded.

    Returns
    -------
    tuple
        ``(best_threshold, best_f1)``. Ties keep the lowest threshold; the
        result is ``(0, 0)`` when no threshold reaches an F1 above zero.
        Returning the values lets callers reuse the chosen cut-off rather than
        copying it by hand from the printed output.
    """
    # Predict once; only the cut-off changes inside the loop.
    y_hat_probs = np.asarray(clf.predict(X_test))
    best_threshold_prob = 0
    best_f1_score = 0
    for threshold in np.arange(startProb, endProb, incrementProb):
        print("Using {0:.2f} probability threshold for class 1".format(threshold))
        y_hat = (y_hat_probs > threshold).astype(int)
        current_f1_score = f1_score(y_test, y_hat)
        current_accuracy_score = accuracy_score(y_test, y_hat)
        # Strict comparison: on ties the first (lowest) threshold wins.
        if current_f1_score > best_f1_score:
            best_f1_score = current_f1_score
            best_threshold_prob = float(threshold)
        print("F1: {0:.4f} - Acc: {1:.4f}".format(current_f1_score, current_accuracy_score))
    print("Best F1 score: **{0:.4f}** at probability threshold **{1:.2f}**".format(best_f1_score, best_threshold_prob))
    return best_threshold_prob, best_f1_score

In [None]:
# Print F1 and accuracy on the held-out split for each candidate threshold
# (0.1 to 0.9 in steps of 0.1 with the default arguments) and report the best one.
tryProbThresholds(lgb_model, X_test, y_test)

In [None]:
# Convert model scores into 0/1 labels using a probability cut-off.
def to_binary(arr, threshold=.35):
    """Convert model probabilities to 0/1 labels.

    Parameters
    ----------
    arr : iterable of float
        Scores produced by the model (list, NumPy array, pandas Series, ...).
    threshold : float, default 0.35
        A score greater than or equal to this value maps to 1, anything lower
        to 0. The default is the cut-off previously hard-coded here.

    Returns
    -------
    list of int
        One 0/1 label per input score, in input order.
    """
    # Iterate over the values themselves, not arr[i], so any iterable works
    # whatever its index labels are.
    return [1 if prob >= threshold else 0 for prob in arr]

In [None]:
# Threshold the held-out predictions into 0/1 labels for the evaluation cells below.
results = to_binary(lgb_y_pred)

In [None]:
# Sanity check: there should be exactly one predicted label per held-out row.
len(results), len(y_test)

In [None]:
# Confusion matrix of the thresholded predictions against the held-out labels
# (the model was trained on SMOTE-balanced data).
confusion_matrix(y_test, results)

In [None]:
# Text classification report for the same thresholded predictions
# (the model was trained on SMOTE-balanced data).
print(classification_report(y_test, results))

In [None]:
# Bar chart of LightGBM feature importances, most important feature first.
importances = pd.Series(lgb_model.feature_importance(), X_train.columns)
importances.sort_values(ascending=False).plot(kind='bar', title='LightGBM Importance');

# Test data predicted on LGBM model

In [None]:
# Score the final test frame with the trained model and convert the output
# to 0/1 labels with to_binary.
final_results = to_binary(lgb_model.predict(df_test))

In [None]:
# Store the predictions on the test frame for building the submission and for inspection.
# The probabilities are computed on the model's input columns only: once
# 'lgb_predicted' has been added, df_test no longer matches the feature set the
# model was trained on. Dropping both generated columns first also makes this
# cell safe to re-run.
generated_cols = ['lgb_predicted', 'lgb_probability']
test_features = df_test.drop(columns=generated_cols, errors='ignore')
test_probabilities = lgb_model.predict(test_features)

df_test['lgb_predicted'] = final_results
df_test['lgb_probability'] = test_probabilities
df_test.info()

In [None]:
# Narrow the test frame to the columns the submission is built from.
submission_cols = ['product_id', 'user_id.1', 'lgb_predicted']
final = df_test.loc[:, submission_cols]
final.info()

In [None]:
# Reclaim memory, then preview the first rows of the narrowed frame.
gc.collect()
final.head()

In [None]:
# Pull the (user_id, order_id) pairs of the test orders so that order ids can
# be joined back onto the predictions.
is_test_order = orders['eval_set'] == 'test'
orders_test = orders.loc[is_test_order, ['user_id', 'order_id']]
orders_test.head()

In [None]:
# Left join keeps every prediction row and attaches the matching test order id.
final = pd.merge(
    final,
    orders_test,
    how='left',
    left_on='user_id.1',
    right_on='user_id',
)
final.head()

In [None]:
# Both user id columns were only needed for the join; keep order_id,
# product_id and the prediction.
final = final.drop(columns=['user_id', 'user_id.1'])

# Cast product_id to integer.
final['product_id'] = final['product_id'].astype(int)

In [None]:
# Build the submission mapping: order_id -> space-separated ids of the products
# predicted as reordered, or the literal string 'None' when nothing was predicted.
# A groupby replaces the row-by-row loop and its bare `except:`; sort=False keeps
# orders in first-appearance order, matching the insertion order of the loop.
predicted_rows = final.loc[final['lgb_predicted'] == 1]
d = (
    predicted_rows['product_id']
    .astype(str)
    .groupby(predicted_rows['order_id'], sort=False)
    .agg(' '.join)
    .to_dict()
)

# Orders without any positive prediction still need an entry.
for order in final['order_id'].unique().tolist():
    d.setdefault(order, 'None')

gc.collect()
# Preview a handful of entries; echoing the whole dictionary would flood the output.
dict(list(d.items())[:5])

In [None]:
# Convert the dictionary into a DataFrame; with orient='index' the keys
# (order ids) become the index and the product strings the single column.
sub = pd.DataFrame.from_dict(d, orient='index')

In [None]:
# Move the order ids out of the index into a regular column, then give both
# columns their submission names.
sub = sub.reset_index()
sub.columns = ['order_id', 'products']
sub.head()

In [None]:
# NOTE(review): repeats the preview already shown by the previous cell.
sub.head()

In [None]:
# Write the submission file with a header row and without the DataFrame index.
sub.to_csv('lgbfinal_submission.csv', index=False, header=True)