In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  average_precision_score, precision_recall_curve

In [15]:
customer_level=pd.read_csv('processed/customer_level_data.csv')
customer_level.head()

Unnamed: 0,Customer_ID,Recency_Days,Total_Orders,Total_Spend,Total_Quantity,Total_Discount,Avg_Session_Duration,Avg_Pages_Viewed,Avg_Delivery_Time,Avg_Rating,Churn
0,CUST_00001,111,2,1693.28,6,0.0,14.5,9.5,11.0,4.0,1
1,CUST_00002,283,2,809.9,8,71.05,15.0,10.0,5.0,4.0,1
2,CUST_00003,82,2,3030.81,7,0.0,10.5,8.5,6.0,3.5,0
3,CUST_00004,41,1,383.22,5,97.78,16.0,15.0,4.0,5.0,0
4,CUST_00005,278,3,2422.73,8,0.0,12.666667,9.333333,5.666667,3.666667,1


In [16]:
customer_level.Churn.value_counts()

Churn
1    2548
0    2452
Name: count, dtype: int64

In [17]:
# split Target features and input features
Input_f=customer_level.drop(columns=['Customer_ID','Churn','Recency_Days'])
Target_f=customer_level.iloc[:,-1]


In [18]:
# Split training and test data
X_train,X_test,y_train,y_test=train_test_split(Input_f,Target_f,test_size=0.2,random_state=42,stratify=Target_f)


In [19]:


pipeline_lr = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])


In [20]:
pipeline_lr

In [21]:
pipeline_lr.fit(X_train,y_train)

In [22]:
ypredict=pipeline_lr.predict(X_test)

In [23]:
confusion_matrix(y_test,ypredict)

array([[255, 235],
       [ 88, 422]], dtype=int64)

In [24]:
print(classification_report(y_test,ypredict))

              precision    recall  f1-score   support

           0       0.74      0.52      0.61       490
           1       0.64      0.83      0.72       510

    accuracy                           0.68      1000
   macro avg       0.69      0.67      0.67      1000
weighted avg       0.69      0.68      0.67      1000



In [25]:
y_proba = pipeline_lr.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))



thresholds = [0.3, 0.4, 0.5, 0.6]

for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    
    print(f"\n--- Threshold: {t} ---")
    print(classification_report(y_test, y_pred_t))
    print(confusion_matrix(y_test, y_pred_t))



ROC-AUC: 0.74734293717487

--- Threshold: 0.3 ---
              precision    recall  f1-score   support

           0       0.85      0.26      0.40       490
           1       0.57      0.95      0.72       510

    accuracy                           0.61      1000
   macro avg       0.71      0.61      0.56      1000
weighted avg       0.71      0.61      0.56      1000

[[128 362]
 [ 23 487]]

--- Threshold: 0.4 ---
              precision    recall  f1-score   support

           0       0.80      0.38      0.52       490
           1       0.60      0.91      0.73       510

    accuracy                           0.65      1000
   macro avg       0.70      0.64      0.62      1000
weighted avg       0.70      0.65      0.62      1000

[[186 304]
 [ 46 464]]

--- Threshold: 0.5 ---
              precision    recall  f1-score   support

           0       0.74      0.52      0.61       490
           1       0.64      0.83      0.72       510

    accuracy                          

In [27]:
final_threshold = 0.4
y_pred_final = (y_proba >= final_threshold).astype(int)


print(confusion_matrix(y_test, y_pred_final))
print(classification_report(y_test, y_pred_final))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))




[[186 304]
 [ 46 464]]
              precision    recall  f1-score   support

           0       0.80      0.38      0.52       490
           1       0.60      0.91      0.73       510

    accuracy                           0.65      1000
   macro avg       0.70      0.64      0.62      1000
weighted avg       0.70      0.65      0.62      1000

ROC-AUC: 0.74734293717487


In [28]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
pipline_rf=Pipeline(steps=[
    ('scalar',StandardScaler()),
    ('model',RandomForestClassifier())
])

In [29]:
pipline_rf.fit(X_train,y_train)

In [30]:
y_pred=pipline_rf.predict(X_test)
confusion_matrix(y_test,y_pred)


array([[296, 194],
       [145, 365]], dtype=int64)

In [31]:
print(classification_report(y_test,ypredict))

              precision    recall  f1-score   support

           0       0.74      0.52      0.61       490
           1       0.64      0.83      0.72       510

    accuracy                           0.68      1000
   macro avg       0.69      0.67      0.67      1000
weighted avg       0.69      0.68      0.67      1000



In [32]:

y_proba = pipline_rf.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

ROC-AUC: 0.7156082432973189


# Save the Model 


In [None]:
# # Model
# import pickle as pk
# file_name='churn_model_lr.pkl'
# with open(file_name,'wb') as obj:
#     pk.dump(pipeline_lr,obj)

    
# # final_threshold = 0.4
# file_name='churn_threshold.pkl'
# with open(file_name,'wb') as obj:
#     pk.dump(final_threshold,obj)
