In [None]:
import pandas as pd
import numpy as np

In [None]:
df=pd.read_csv('Final_df.csv')
df.head()

In [None]:
df.info()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df=df.sort_values(by=['user_id'])

In [None]:
#Feature Engineering

In [None]:
orders_grouped=df.groupby(by=['order_id'],as_index=False).agg({'product_id':'count'})
orders_grouped

In [None]:
orders_grouped.rename(columns={'product_id':'order_size'},inplace=True)
orders_grouped

In [None]:
df=pd.merge(df,orders_grouped,how='left',on=['order_id'])
df

In [None]:
df['cart_priority'] = 1 - df['add_to_cart_order']/(df['order_size']+1)
df

In [None]:
avg_sizes = df[['user_id','order_size']].groupby(['user_id']).mean().reset_index()
avg_sizes.rename(columns={"order_size":"avg_order_size"}, inplace=True)
avg_sizes.head()

In [None]:
df=pd.merge(df,avg_sizes,on=['user_id'],how='left')
df

In [None]:
users=df.groupby(by=['user_id'],as_index=False).agg({'days_since_prior_order':'mean'})
users.rename(columns={'days_since_prior_order':'avg_interval_between_2_orders'},inplace=True)
users

In [None]:
df=pd.merge(df,users,on=['user_id'],how='left')
df

In [None]:
products=df.groupby(by=['product_id'],as_index=False).agg({'reordered':'sum','product_name':'count'})
products

In [None]:
products.rename(columns={'reordered':'product_reordered','product_name':'product_orders'},inplace=True)
products

In [None]:
products['product_reorder_ratio']=products['product_reordered']/products['product_orders']
products

In [None]:
products=products[['product_id','product_reorder_ratio']]
products

In [None]:
df=pd.merge(df,products,on=['product_id'],how='left')
df

In [None]:
df['order_number']=df.groupby(by=['user_id'])['order_id'].rank()
df['order_number']

In [None]:
lastProdOrders = df.groupby(["user_id","product_id"]).agg({'order_number':'max'}).reset_index()
lastProdOrders = lastProdOrders.merge(df, on=['user_id','product_id', 'order_number'], how='left')
print(len(lastProdOrders))
lastProdOrders.head()

In [None]:
lastProdOrders.shape

In [None]:
features=pd.get_dummies(lastProdOrders,columns=['department_id','aisle_id'])
features

In [None]:
features.drop(columns=['aisle','department','eval_set','product_name','reorder_ratio','COUNTER','add_to_cart_order'],inplace=True)

In [None]:
X=features.drop(columns=['reordered','order_id','user_id','product_id'])
X

In [None]:
y=features['reordered']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts; stratify=y keeps the class proportions
# of the target the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Grid search: logistic regression

In [None]:
param_grid = {
     'penalty' : ['l1', 'l2'],
     'C' : [0.001,0.01,0.1,1.0,10,100,1000]}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(LogisticRegression(), param_grid, scoring='accuracy', n_jobs=-1, cv=3,verbose=2)
result = search.fit(x_train, y_train)

print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
from sklearn.metrics import accuracy_score
metrics_df=pd.DataFrame()
log_preds=search.predict(x_test)
log_preds_train=search.predict(x_train)
accuracy_log=accuracy_score(log_preds,y_test)
accuracy_train=accuracy_score(log_preds_train,y_train)
metrics_df['Model']='logistic Regression'
metrics_df['train_accuracy']=accuracy_train
metrics_df['test_accuracy']=accuracy_log

In [None]:
accuracy_log

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the logistic predictions on the test split.
target_names = ['class 0', 'class 1']
report = classification_report(y_test, log_preds, target_names=target_names)
print(report)

## ROC Curve

In [None]:

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [None]:
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = search.predict_proba(x_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

In [None]:
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
!pip install xgboost

In [None]:
# grid search xgb
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [None]:
import xgboost as xgb
xgb = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, nthread=-1)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, n_jobs=4, cv=skf.split(x_train,y_train), verbose=3, random_state=42)

In [None]:
random_search.fit(x_train, y_train)

In [None]:
random_search.best_estimator_

In [None]:
random_search.best_params_

In [None]:
#fitting the model with best params to find feature importance
import xgboost as xgb
xgb1 = xgb.XGBClassifier(n_estimators=600, learning_rate=0.05, nthread=-1,subsample=0.6,min_child_weight=10,gamma=1,colsample_bytree=0.6)

In [None]:
xgb1.fit(x_train,y_train)

In [None]:
# Neither `plt` nor `plot_importance` is imported anywhere above, so import
# what this cell needs here.
from matplotlib import pyplot
from xgboost import plot_importance

# plot_importance creates its own figure unless it is given an Axes, so the
# requested size only applies when the Axes is passed in explicitly.
fig, ax = pyplot.subplots(figsize=(20, 10))
# The result is not bound to `xgb`, which is the xgboost module alias.
plot_importance(xgb1, max_num_features=10, ax=ax)
pyplot.show()

In [None]:
xgb_pred=random_search.predict(x_test)
xgb_train=random_search.predict(x_train)
accuracy_xgb=accuracy_score(xgb_pred,y_test)
accuracy_train=accuracy_score(xgb_train,y_train)

In [None]:
accuracy_xgb

In [None]:

xgb_probs = random_search.predict_proba(x_test)
# keep probabilities for the positive outcome only
xgb_probs = xgb_probs[:, 1]
# calculate scores
lr_auc = roc_auc_score(y_test, xgb_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, lr_probs)

In [None]:
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(xgb_fpr, xgb_tpr, marker='.', label='XGBoost')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()