In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_curve,auc,precision_recall_curve,average_precision_score,mean_absolute_error,mean_squared_error,r2_score
import scipy.stats as stats


In [None]:
 ord=pd.read_csv(
"/content/orders.csv",
dtype={
"order_id":"int32",
"user_id":"int32",
"order_number":"int16",
"order_dow":"int8",
"order_hour_of_day":"int8",
"days_since_prior_order":"float32",
"eval_set":"category"
}
)
ord.head()


In [None]:
# Both order-line files are read with the same column dtypes, so one mapping
# is declared once and passed to both reads.
_op_dtype = {
    "order_id": "int32",
    "product_id": "int32",
    "add_to_cart_order": "int16",
    "reordered": "int8",
}

# Order lines from the "prior" file.
opp = pd.read_csv("/content/order_products__prior.csv", dtype=_op_dtype)

# Order lines from the "train" file.
opt = pd.read_csv("/content/order_products__train.csv", dtype=_op_dtype)

print("Orders loaded")


In [None]:
# Product catalogue plus its two lookup tables (aisles, departments).
prd = pd.read_csv(
    "/content/products.csv",
    dtype={
        "product_id": "int32",
        "aisle_id": "int16",
        "department_id": "int16",
    },
)

# The lookup tables are now read from the same /content directory as every
# other input; the original used bare relative paths, which only resolved when
# the working directory happened to be /content.
# The join keys are read with the same dtype as in `prd` so the merge in the
# next cell joins like-typed columns.
ais = pd.read_csv("/content/aisles.csv", dtype={"aisle_id": "int16"})
dep = pd.read_csv("/content/departments.csv", dtype={"department_id": "int16"})

prd.head()


In [None]:
# Attach the aisle and department lookup columns to every product.
# Left joins keep all product rows even when a lookup key has no match.
prd = (
    prd
    .merge(ais, on="aisle_id", how="left")
    .merge(dep, on="department_id", how="left")
)
prd.head()


In [None]:
# Enrich each prior order line with the user and the timing of its order.
# validate="many_to_one" makes pandas raise a MergeError if `order_id` is not
# unique in the orders table; without it, duplicate keys would silently
# multiply order lines and inflate every count computed from `pri` later on.
pri = opp.merge(
    ord[["order_id", "user_id", "order_dow", "order_hour_of_day"]],
    on="order_id",
    how="left",
    validate="many_to_one",
)
pri.head()


In [None]:
# Add the aisle and department ids of each purchased product.
# validate="many_to_one" makes pandas raise a MergeError if `product_id` is
# duplicated in the product table, instead of silently duplicating order lines.
# NOTE(review): this cell reassigns `pri` from itself, so running it twice
# produces suffixed duplicate columns; re-run the previous cell first.
pri = pri.merge(
    prd[["product_id", "aisle_id", "department_id"]],
    on="product_id",
    how="left",
    validate="many_to_one",
)
pri.head()


In [None]:
# Release the large intermediates that have been folded into `pri`.
# `del name` raises NameError when the cell is re-run after the names are gone;
# popping from the namespace with a default makes the cell safe to run again.
for _name in ("opp", "prd"):
    globals().pop(_name, None)
gc.collect()
print("Memory cleared")


In [None]:
# Attach the user to each order line of the train file.
# validate="many_to_one" makes pandas raise a MergeError if `order_id` is not
# unique in the orders table, rather than silently duplicating training rows.
trn = opt.merge(
    ord[["order_id", "user_id"]],
    on="order_id",
    how="left",
    validate="many_to_one",
)
trn.head()


In [None]:
# Sanity check: row/column counts of both merged frames,
# followed by the column dtypes and memory summary of the prior frame.
print("Prior:", pri.shape)
print("Train:", trn.shape)

pri.info()


In [None]:
# Share of prior order lines per value of the `reordered` flag.
reordered_share = pri["reordered"].value_counts(normalize=True)
reordered_share.plot(kind="bar")
plt.title("Reordered Dist")
plt.show()


In [None]:
# Basket size: number of product rows recorded for each order.
ipo = (
    pri
    .groupby("order_id")["product_id"]
    .count()
)

# Histogram of basket sizes.
ipo.hist(bins=30)
plt.title("Items per Order")
plt.show()


In [None]:
# Each row of `pri` is one order line, so every bar below counts order lines.
# Titles and axis labels are added so each figure can be read on its own,
# matching the titled plots in the neighbouring cells.

# Order lines by hour of day, in hour order.
pri["order_hour_of_day"].value_counts().sort_index().plot(kind="bar")
plt.title("Order Lines by Hour of Day")
plt.xlabel("order_hour_of_day")
plt.ylabel("Order lines")
plt.show()

# Order lines by day of week, in day order.
pri["order_dow"].value_counts().sort_index().plot(kind="bar")
plt.title("Order Lines by Day of Week")
plt.xlabel("order_dow")
plt.ylabel("Order lines")
plt.show()

# The ten departments with the most order lines (value_counts sorts descending).
pri["department_id"].value_counts().head(10).plot(kind="bar")
plt.title("Top 10 Departments")
plt.xlabel("department_id")
plt.ylabel("Order lines")
plt.show()


In [None]:
# Per-user aggregates over the prior order lines:
#   tot_ord  - number of distinct orders
#   tot_prd  - number of order lines
#   reord_rt - mean of the `reordered` flag
usr_f = (
    pri
    .groupby("user_id")
    .agg(
        tot_ord=("order_id", "nunique"),
        tot_prd=("product_id", "count"),
        reord_rt=("reordered", "mean"),
    )
    .reset_index()
)

# Per-product aggregates:
#   prd_ord   - number of order lines containing the product
#   prd_reord - mean of the `reordered` flag
prd_f = (
    pri
    .groupby("product_id")
    .agg(
        prd_ord=("order_id", "count"),
        prd_reord=("reordered", "mean"),
    )
    .reset_index()
)

# Per (user, product) aggregates:
#   tm_buy   - number of order lines for the pair
#   tm_reord - sum of the `reordered` flag for the pair
usr_prd_f = (
    pri
    .groupby(["user_id", "product_id"])
    .agg(
        tm_buy=("order_id", "count"),
        tm_reord=("reordered", "sum"),
    )
    .reset_index()
)


In [None]:
# Assemble the modelling table: train order lines joined with the user,
# product and user-product aggregates. Missing aggregates (e.g. a user-product
# pair absent from `pri`) become 0.
# Written as one chained expression instead of fillna(inplace=True), so the
# cell rebuilds `trn_d` from `trn` each time it runs.
trn_d = (
    trn
    .merge(usr_f, on="user_id", how="left")
    .merge(prd_f, on="product_id", how="left")
    .merge(usr_prd_f, on=["user_id", "product_id"], how="left")
    .fillna(0)
)

# Full-data feature matrix and target.
# `tm_reord` is now dropped here as well, so `X` has the same columns as the
# sampled matrix `Xs` built in the next cell. Previously `X` kept `tm_reord`,
# which meant a model fitted on `Xs` could not score `X`.
X = trn_d.drop(columns=["reordered", "order_id", "tm_reord"])
y = trn_d["reordered"]


In [None]:
# Work on a fixed-size random subsample to keep model fitting fast.
s_sz = 300000

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the request is capped at len(trn_d).
trn_s = trn_d.sample(n=min(s_sz, len(trn_d)), random_state=42)

# NOTE(review): `tm_buy` (order lines for the user/product pair in `pri`, 0 when
# the pair is absent) is still a feature. If `reordered` just means "this user
# bought this product before", then `tm_buy > 0` reproduces the label and the
# scores below will be trivially perfect — confirm against the dataset
# definition. `user_id`, `product_id` and `add_to_cart_order` also remain in
# the matrix as raw numeric features.
Xs = trn_s.drop(columns=["reordered", "order_id", "tm_reord"])
ys = trn_s["reordered"]

print(Xs.shape)
print(ys.shape)


In [None]:
# 80/20 hold-out split of the sample, stratified on the target so both parts
# keep the same class proportions; seeded for reproducibility.
Xtr, Xte, ytr, yte = train_test_split(
    Xs,
    ys,
    test_size=0.2,
    random_state=42,
    stratify=ys,
)
print(Xtr.shape, Xte.shape)


In [None]:
# Baseline: class-weighted logistic regression (fit() returns the estimator).
# NOTE(review): the features are passed unscaled and warnings are silenced
# notebook-wide, so a non-convergence warning would not be visible here.
lr = LogisticRegression(
    max_iter=200,
    class_weight="balanced",
    n_jobs=-1,
).fit(Xtr, ytr)

# Hard labels and positive-class probabilities on the hold-out set.
ypb = lr.predict(Xte)
ypr = lr.predict_proba(Xte)[:, 1]

print(classification_report(yte, ypb))


In [None]:
# Hold-out predictions of the logistic regression:
# hard labels (yp) and positive-class probabilities (ypo).
yp, ypo = lr.predict(Xte), lr.predict_proba(Xte)[:, 1]


In [None]:
# Hold-out accuracy, then per-class precision / recall / F1
# for the logistic regression.
lr_acc = accuracy_score(yte, yp)
print("Acc:", lr_acc)
print(classification_report(yte, yp))


In [None]:
# Confusion matrix of the logistic regression on the hold-out set.
# confusion_matrix puts true labels on rows and predictions on columns; the
# axes are labelled so the heatmap cannot be read the wrong way round.
cm = confusion_matrix(yte, yp)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("CM LR")
plt.show()


In [None]:
# ROC curve of the logistic regression from its positive-class probabilities.
fpr, tpr, _ = roc_curve(yte, ypo)
auc_lr = auc(fpr, tpr)

plt.plot(fpr, tpr, label="AUC=%.3f" % auc_lr)
# Diagonal reference line: where TPR equals FPR.
plt.plot([0, 1], [0, 1], "--")
# Title and axis labels added so the figure can be read on its own.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC LR")
plt.legend()
plt.show()


In [None]:
# Depth-limited decision tree on the same training split
# (fit() returns the estimator, so construction and fitting are chained).
dt = DecisionTreeClassifier(
    max_depth=10,
    random_state=42,
).fit(Xtr, ytr)
print("DT trained")


In [None]:
# Hold-out predictions of the decision tree:
# hard labels (dtp) and positive-class probabilities (dtpo).
dtp, dtpo = dt.predict(Xte), dt.predict_proba(Xte)[:, 1]


In [None]:
# Hold-out accuracy, then per-class precision / recall / F1
# for the decision tree.
dt_acc = accuracy_score(yte, dtp)
print("Acc:", dt_acc)
print(classification_report(yte, dtp))


In [None]:
# Confusion matrix of the decision tree on the hold-out set.
# confusion_matrix puts true labels on rows and predictions on columns; the
# axes are labelled so the heatmap cannot be read the wrong way round.
cm_dt = confusion_matrix(yte, dtp)
sns.heatmap(cm_dt, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("CM DT")
plt.show()
