In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
import gc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, f1_score, classification_report, accuracy_score, confusion_matrix
import xgboost as xgb

# Make sure the cyclic garbage collector is switched on; large frames are
# deleted and collected explicitly further down to keep memory usage in check.
gc.enable()
# Silence FutureWarning noise only. A blanket filterwarnings('ignore') would
# also hide actionable warnings (e.g. unused model parameters or undefined
# metrics), so every other category is left visible.
warnings.filterwarnings('ignore', category=FutureWarning)


In [None]:
# Read the prepared train and test feature tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('df_train.csv', 'df_test.csv')
)

In [None]:
# Mean (target) encoding: average reorder rate per aisle and per department.
# NOTE(review): the means are computed on the full training frame, i.e. before
# the train/validation split performed later, so validation rows contribute to
# their own encoding.
aisle_mean = (
    df_train.groupby('aisle_id')['reordered']
    .mean()
    .rename('aisle')
    .reset_index()
)
department_mean = (
    df_train.groupby('department_id')['reordered']
    .mean()
    .rename('department')
    .reset_index()
)

In [None]:
# Attach the aisle / department reorder rates to the test set.
# Left joins keep every test row even if an id has no encoded value.
df_test = (
    df_test
    .merge(aisle_mean, how='left', on='aisle_id')
    .merge(department_mean, how='left', on='department_id')
)

In [None]:
# Attach the same aisle / department reorder rates to the training set.
df_train = (
    df_train
    .merge(aisle_mean, how='left', on='aisle_id')
    .merge(department_mean, how='left', on='department_id')
)

In [None]:
# The raw aisle/department ids have been replaced by their encoded rates, so
# drop them and key both frames by (user_id, product_id).
df_test = (
    df_test
    .drop(columns=['aisle_id', 'department_id'])
    .set_index(['user_id', 'product_id'])
)
df_train = (
    df_train
    .drop(columns=['aisle_id', 'department_id'])
    .set_index(['user_id', 'product_id'])
)

In [None]:
# Sanity check: compare the shapes of both frames (the training frame
# additionally carries the 'reordered' target column).
(df_test.shape, df_train.shape)

In [None]:
# The encoding tables have been merged in; release them and reclaim memory.
del aisle_mean
del department_mean
gc.collect()

# X and y Variables for models

In [None]:
# Feature matrix and target vector used as the base for every model below.
X = df_train.drop(columns='reordered')
y = df_train['reordered']

In [None]:
# X and y now hold everything needed from the training frame; release it.
del df_train
gc.collect()

# XGBoost

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=13,
)

In [None]:
# Hyper-parameters carried over from an earlier grid-search session.
parameters = dict(
    eval_metric='logloss',
    max_depth=5,
    colsample_bytree=0.4,
    subsample=0.8,
    scale_pos_weight=0.6,
)

In [None]:
# Instantiate the XGBoost classifier.
# The tuned hyper-parameters must be unpacked into keyword arguments: passing
# the dict as a single `parameters=` keyword hands the estimator one unknown
# argument and leaves max_depth / subsample / etc. at their defaults.
# `num_boost_round` belongs to the native `xgb.train` API; the scikit-learn
# wrapper controls the number of boosting rounds through `n_estimators`.
xgb1 = xgb.XGBClassifier(objective='binary:logistic',
                         n_estimators=10,
                         **parameters)

In [None]:
# Train the classifier on the training portion of the split.
xgb1.fit(X=X_train, y=y_train)

In [None]:
def tryProbThresholds(clf, X_test, y_test, startProb=0.1, endProb=1.0, incrementProb=0.1):
    """Scan probability thresholds and report the one with the best F1 score.

    Original author: Stefan Fiot.

    For every threshold in ``np.arange(startProb, endProb, incrementProb)`` the
    positive-class probability from ``clf.predict_proba`` is binarised, the F1
    and accuracy scores against ``y_test`` are printed, and the threshold with
    the highest F1 is tracked.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``
    X_test : features passed to ``clf.predict_proba``
    y_test : true binary labels aligned with ``X_test``
    startProb : float, first threshold tried (inclusive)
    endProb : float, upper bound of the scan (exclusive, as in ``np.arange``)
    incrementProb : float, step between consecutive thresholds

    Returns
    -------
    tuple
        ``(best_threshold_prob, best_f1_score)``. Both are 0 when no threshold
        produces an F1 score above zero. Earlier ties win, because a threshold
        only replaces the current best on a strictly higher F1.
    """
    # Score once; only the positive-class column is needed inside the loop.
    positive_probs = clf.predict_proba(X_test)[:, 1]
    best_threshold_prob = 0
    best_f1_score = 0
    for threshold in np.arange(startProb, endProb, incrementProb):
        print("Using {0:.2f} probability threshold for class 1".format(threshold))
        # `>=` matches how the chosen threshold is applied when predicting
        # elsewhere in the notebook.
        y_hat = (positive_probs >= threshold).astype(int)
        current_f1_score = f1_score(y_test, y_hat)
        current_accuracy_score = accuracy_score(y_test, y_hat)
        if current_f1_score > best_f1_score:
            best_f1_score = current_f1_score
            best_threshold_prob = threshold
        print("F1: {0:.4f} - Acc: {1:.4f}".format(current_f1_score, current_accuracy_score))
    print("Best F1 score: **{0:.4f}** at probability threshold **{1:.2f}**".format(best_f1_score, best_threshold_prob))
    # Return the winner so callers can reuse it instead of hard-coding it.
    return best_threshold_prob, best_f1_score



In [None]:
# Scan the default threshold grid on the validation split to find the
# probability cut-off with the best F1 score.
tryProbThresholds(clf=xgb1, X_test=X_test, y_test=y_test)

In [None]:
# Validation predictions: label a row 1 when its positive-class probability
# reaches the 0.20 threshold.
xgb_pred = np.greater_equal(xgb1.predict_proba(X_test)[:, 1], 0.20).astype('int')

In [None]:
# Confusion matrix for the validation predictions
# (rows = actual class, columns = predicted class, label order [0, 1]).
cm3 = confusion_matrix(y_test, xgb_pred, labels=[0, 1])

print(cm3)
print('\n')
# Precision = TP / (TP + FP); recall = TP / (TP + FN).
print("Precision: %0.2f" % (cm3[1, 1] / (cm3[1, 1] + cm3[0, 1])))
print("Recall:    %0.2f" % (cm3[1, 1] / (cm3[1, 1] + cm3[1, 0])))

# Same matrix as cm3; the name is kept so later cells that reference it still work.
cm4 = cm3

# Draw the matrix with seaborn: no `plot_confusion_matrix` helper is imported
# or defined in this notebook, so calling it fails on a fresh kernel.
plt.figure()
sns.heatmap(cm4, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=[0, 1], yticklabels=[0, 1])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix');

In [None]:
# Evaluation on the validation split.
# scikit-learn metrics take (y_true, y_pred) in that order; both calls now
# follow the same convention.
print('F1 Score: {}'.format(f1_score(y_test, xgb_pred)))
print(classification_report(y_test, xgb_pred))

In [None]:
# ROC curve and AUC for the tuned model on the validation split.
# The curve is built from the positive-class probabilities: feeding the
# already-thresholded 0/1 predictions would collapse it to a single
# operating point.
xgb_scores = xgb1.predict_proba(X_test)[:, 1]
fpr, tpr, roc_thresholds = roc_curve(y_test, xgb_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot([0, 1], [0, 1], linestyle='--', color='black', label='Chance')
plt.plot(fpr, tpr, color='green', label='XGBoost (AUC = {:.3f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.gca().set_aspect('equal', adjustable='box')

In [None]:
# Visualise how much each feature contributes to the fitted model.
xgb.plot_importance(booster=xgb1)

In [None]:
# Predict on the test set with the same 0.20 probability threshold.
# Selecting exactly the training columns (instead of dropping 'order_id')
# guarantees the model sees the features it was fitted on, and keeps this cell
# re-runnable after prediction columns have been added to df_test.
y_pred_test = (xgb1.predict_proba(df_test[X_train.columns])[:, 1] >= 0.20).astype('int')

In [None]:
# Store the predicted label and the positive-class probability on df_test for
# building the submission frame.
# The feature view is taken from the training columns so the freshly added
# prediction columns are never fed back into the model, and only column 1 of
# predict_proba (probability of class 1) is kept, since a single DataFrame
# column cannot hold the two-column probability array.
test_features = df_test[X_train.columns]
df_test['xgb_predicted'] = y_pred_test
df_test['xgb_probability'] = xgb1.predict_proba(test_features)[:, 1]
df_test.info()

In [None]:
# Turn the (user_id, product_id) index back into columns and keep only what
# the submission needs.
final = df_test.reset_index().loc[:, ['product_id', 'user_id', 'xgb_predicted']]

gc.collect()
final.head()

In [None]:
# Re-extract the test orders so order_id can be joined back via user_id.
# `orders` is not created anywhere in this notebook, so on a fresh kernel it
# has to be loaded here.
try:
    orders
except NameError:
    # NOTE(review): assumes the raw orders table is available as 'orders.csv'
    # in the working directory - confirm the path.
    orders = pd.read_csv('orders.csv')
orders_test = orders.loc[orders.eval_set == 'test', ['user_id', 'order_id']]
orders_test.head()

In [None]:
# Bring in order_id through user_id, then drop user_id, which is only the join key.
final = (
    final
    .merge(orders_test, how='left', on='user_id')
    .drop(columns='user_id')
)
final.head()

In [None]:
# Cast product_id to an integer dtype.
final['product_id'] = final['product_id'].astype(int)


In [None]:
# Build {order_id: "pid pid ..."} for the submission: the space-separated
# product ids predicted as reordered, or the string 'None' for orders with no
# predicted reorder.
# A single groupby replaces the row-by-row loop, which relied on a bare
# `except:` for control flow; sort=False keeps orders in first-seen order.
predicted_rows = final[final.xgb_predicted == 1]
d = (
    predicted_rows
    .groupby('order_id', sort=False)['product_id']
    .agg(lambda product_ids: ' '.join(str(pid) for pid in product_ids))
    .to_dict()
)

# Orders without any predicted reorder still need a row in the submission.
for order in final.order_id.unique():
    d.setdefault(order, 'None')

gc.collect()
# Preview a handful of entries instead of dumping the whole dictionary.
dict(list(d.items())[:5])

In [None]:
# Turn the {order_id: products} mapping into a two-column frame: the dict keys
# become the index, which is then moved into a regular column.
sub = pd.DataFrame.from_dict(d, orient='index').reset_index()
# Name the columns as required for the submission file.
sub.columns = ['order_id', 'products']

sub.head()

In [None]:
# Write the submission file with a header row and without the row index.
sub.to_csv(path_or_buf='xgbfinal_submission.csv', header=True, index=False)