In [50]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


In [20]:
# Re-define paths after reset
zip_path = "mntdataridewiseproject.zip"
extract_dir = "mntdataridewise_project"

In [21]:
# Create extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

In [22]:
# Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [23]:
# List extracted files and directories
extracted_files = []
for root, dirs, files in os.walk(extract_dir):
    for file in files:
        extracted_files.append(os.path.join(root, file))

extracted_files[:20]  # show first 20 files for preview

['mntdataridewise_project\\drivers.csv',
 'mntdataridewise_project\\drivers.parquet',
 'mntdataridewise_project\\promotions.csv',
 'mntdataridewise_project\\promotions.parquet',
 'mntdataridewise_project\\riders.csv',
 'mntdataridewise_project\\riders.parquet',
 'mntdataridewise_project\\sessions.csv',
 'mntdataridewise_project\\sessions.parquet',
 'mntdataridewise_project\\trips.csv',
 'mntdataridewise_project\\trips.parquet',
 'mntdataridewise_project\\trips_sample.csv']

In [24]:
# Handles class imbalance
from imblearn.over_sampling import SMOTE

##### Step 2 - Define all the configuration

In [25]:
DATA_PATH = "mntdataridewise_project\\riders.csv"
MODEL_PATH = "mntdataridewise_project/models/churn_model.pkl"

RANDOM_STATE = 42
TEST_SIZE = 0.2  #this translates to 20%

#### Step 3 - Load and Explore Data

In [26]:
df = pd.read_csv(DATA_PATH)

In [27]:
# check the columns in a list
print(df.columns.tolist())

# checked for the data types within the data
print(df.dtypes)

# checked for the shape of the data
print("Dataset shape", df.shape)

# checked the first 5 columns of the data
df.head(2)

['user_id', 'signup_date', 'loyalty_status', 'age', 'city', 'avg_rating_given', 'churn_prob', 'referred_by']
user_id              object
signup_date          object
loyalty_status       object
age                 float64
city                 object
avg_rating_given    float64
churn_prob          float64
referred_by          object
dtype: object
Dataset shape (10000, 8)


Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,referred_by
0,R00000,2025-01-24,Bronze,34.729629,Nairobi,5.0,0.142431,R00001
1,R00001,2024-09-09,Bronze,34.57102,Nairobi,4.7,0.674161,


In [28]:
# count misisng values per column: identify data quality issues
print(df.isnull().sum())

# converting the churn probablity column to binary (1 if > 0.5 else 0)
churn_binary = ( df['churn_prob'] > 0.5).astype(int)

# show the percentage of customer who churned 0/1
print(f"Churn rate: {churn_binary.mean():.2%}")

# show howw many customers are in each city
print (df['city'].value_counts())

user_id                0
signup_date            0
loyalty_status         0
age                    0
city                   0
avg_rating_given       0
churn_prob             0
referred_by         6947
dtype: int64
Churn rate: 10.63%
city
Cairo      3374
Nairobi    3330
Lagos      3296
Name: count, dtype: int64


##### Observations

- Identify data quality before modelling the data
- Revealed class imblance (10.63% churned  vs 89.36% didnt churn)
- Helps understand data distribution

##### Step 4 : Data Quality Checks

Std(Stndard Deviation): Spread of values with your data

- If std is very small or zero --- feature has not variance - useless for prediction

In [29]:
# statistical diagnostics

# step 4a : Mean-Average/ std value
for col in [ 'age', 'avg_rating_given', 'churn_prob']:
    print( f"Mean: {df[col].mean():4f}")         # check for mean(average)
    print( f"Std:  {df[col].std():4f}")          # check for standard deviation

Mean: 35.154221
Std:  9.545528
Mean: 4.461510
Std:  0.429162
Mean: 0.286168
Std:  0.159029


Measures correlation between features and churn target

- Correlation near (0) ---- feature doesn't predict churn
- positive correlation (1) --- good predictor = more churn
- Negative correlation (~1) -- good predictor = less churn

- true positive 
- true negative
- false positive
- false negative

In [30]:
# step 4b : Correlations
correlations = df[['age', 'avg_rating_given']].corrwith(churn_binary)


# step 4c : Shows average churn rate per city 
print( df.groupby('city')[['churn_prob']].agg(['mean', 'count']))

        churn_prob      
              mean count
city                    
Cairo     0.286248  3374
Lagos     0.284265  3296
Nairobi   0.287971  3330


##### Step 5 : Data Preprocessing (AKA --- Feature engineering)


In [31]:
data = df.copy()   #presever the original

In [32]:
df.head()

Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,referred_by
0,R00000,2025-01-24,Bronze,34.729629,Nairobi,5.0,0.142431,R00001
1,R00001,2024-09-09,Bronze,34.57102,Nairobi,4.7,0.674161,
2,R00002,2024-09-07,Bronze,47.13396,Lagos,4.2,0.510379,
3,R00003,2025-03-17,Bronze,41.658628,Nairobi,4.9,0.244779,
4,R00004,2024-08-20,Silver,40.681709,Lagos,3.9,0.26996,R00002


In [33]:
# step 5a - strip or remove all leading and trailing spaces ("Lagos " --- "Lagos")
data['city'] = data['city'].str.strip().str.title()  # "LAGOS" -- lagos Or "lagos" -- "Lagos" --- Ensure consitency

# fills all the missing values with "Direct"
data['referred_by'] = data['referred_by'].fillna("Direct")

# converts direct to 0 and 'R0001' to 1
data['was_referred'] = ( data['referred_by'] != 'Direct').astype(int)

#### Step 6 : Encode the Remaining Categorical Variables

In [34]:
data.head()

Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,referred_by,was_referred
0,R00000,2025-01-24,Bronze,34.729629,Nairobi,5.0,0.142431,R00001,1
1,R00001,2024-09-09,Bronze,34.57102,Nairobi,4.7,0.674161,Direct,0
2,R00002,2024-09-07,Bronze,47.13396,Lagos,4.2,0.510379,Direct,0
3,R00003,2025-03-17,Bronze,41.658628,Nairobi,4.9,0.244779,Direct,0
4,R00004,2024-08-20,Silver,40.681709,Lagos,3.9,0.26996,R00002,1


In [35]:
 # step 6a - Initiaze the encoder
city_encoder = LabelEncoder()
data['city_encoder'] = city_encoder.fit_transform(data['city'])

# step 6b - initialize the loyalty encoder
loyalty_encoder = LabelEncoder()
data['loyalty_encoder'] = loyalty_encoder.fit_transform(data['loyalty_status'])

In [36]:
# step 6c - store the encoders for API USE
encoders = {
    'city' : city_encoder,
    'loyalty': loyalty_encoder,
    'valid_cities': list(city_encoder.classes_),
    'valid_loyalty_statuses': list(loyalty_encoder.classes_)
}

In [37]:
print("✓ Categorical encoding completed")
print(f"Cities: {encoders['valid_cities']}")
print(f"Loyalty statuses: {encoders['valid_loyalty_statuses']}")

✓ Categorical encoding completed
Cities: ['Cairo', 'Lagos', 'Nairobi']
Loyalty statuses: ['Bronze', 'Gold', 'Platinum', 'Silver']


### TASK 

- Watch For-loops and while loops in python

# Step 7 - Feature Scaling 

- scaling prevents features that have large ranges from dominating the dataset
- capture complex patterns simple algo miss it

In [38]:
data.head()

Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,referred_by,was_referred,city_encoder,loyalty_encoder
0,R00000,2025-01-24,Bronze,34.729629,Nairobi,5.0,0.142431,R00001,1,2,0
1,R00001,2024-09-09,Bronze,34.57102,Nairobi,4.7,0.674161,Direct,0,2,0
2,R00002,2024-09-07,Bronze,47.13396,Lagos,4.2,0.510379,Direct,0,1,0
3,R00003,2025-03-17,Bronze,41.658628,Nairobi,4.9,0.244779,Direct,0,2,0
4,R00004,2024-08-20,Silver,40.681709,Lagos,3.9,0.26996,R00002,1,1,3


In [39]:
# Feature scaling -  transforming features to a similar scale mean=0 and std=1
# why scale? - Logistic regression works better when fearures are on similar scale


#Initialize scaler
scaler = StandardScaler()

# FIX: Scale and SAVE the scaler
data[['age', 'avg_rating_given']] = scaler.fit_transform(data[['age', 'avg_rating_given']])

# Add scaler to encoders dictionary
encoders['scaler'] = scaler

print("✓ Feature scaling completed")

✓ Feature scaling completed


# Step 8 : Prepare Features and Target variable

In [42]:
# Use the actual encoded column names present in `data` and tolerate common variants
# prefer 'city_encoder' / 'loyalty_encoder' but fallback if older names exist
feature_cols = ['age', 'avg_rating_given', 'city_encoder', 'loyalty_encoder', 'was_referred']

# create aliased columns if notebook has alternate names
if 'city_encoded' in data.columns and 'city_encoder' not in data.columns:
	data['city_encoder'] = data['city_encoded']
if 'loyalty_encoded' in data.columns and 'loyalty_encoder' not in data.columns:
	data['loyalty_encoder'] = data['loyalty_encoded']

# validate presence
missing = [c for c in feature_cols if c not in data.columns]
if missing:
	raise KeyError(f"Missing feature columns: {missing}")

X = data[feature_cols].copy()
y = (data['churn_prob'] > 0.5).astype(int)

print(f"✓ Features prepared: {feature_cols}")
print(f"  Shape: {X.shape}")

✓ Features prepared: ['age', 'avg_rating_given', 'city_encoder', 'loyalty_encoder', 'was_referred']
  Shape: (10000, 5)


# Step 9 - TRAIN - TEST SPLIT

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

# Step 10 - HANDLE CLASS IMBALANCE 

-- Apply SMOTE to balance the training - create synthetic churn data

- 10.36% churned 86.72% didnt churn 
- 50% churned 50% didnt churn
- synthetic data ---- both sides

In [44]:
# step 1 - initialize SMOTE object
smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=5)

# step 2 - fit and resmaples the training data
# generate sysnthetic examples of the minority class - 10.63% churn rate
# k_negihbors = 5 -- use 5 nearest neighbors to generate synthetic examples
# After applying SMOTE, the new class distribution will be balanced!!

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

class imblanced

churned - 10.63% --- minority class
didny churn - 89.37%   -- majority class


logistic regression - 

Mathematics of DS

# Step 11- Train Logistic Regression Model

In [45]:
model = LogisticRegression(
    random_state=RANDOM_STATE,
    max_iter=1000,                 # more steps to best coefficients
    class_weight='balanced',       # 10.63% minority class penalize it upon any mistake
    solver='liblinear'            #  support L1 regularization and L2 regularization
)

In [46]:
model.fit(X_train_resampled, y_train_resampled)

In [47]:
# Logistic regression learns from  linear relationship between features and target variable
# coefficients -- weights assigned to each feature during training
# Regularization -- L1 and L2 prevent overfitting (memorizing the training data too well)
# class_weight = 'balanced' + SMOTE + Stratification -- double protection against class imbalance

print(f"Model coefficients: {model.coef_.shape}") # weights of each feature(how much it affecting predictions)
print(f"Intercept: {model.intercept_[0]:4f}")  # baseline prediction before you add feature effects

Model coefficients: (1, 5)
Intercept: -0.025947


# Step 12 - Make Predictions (testing what you've trained)


In [48]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # probability of positive class (churn=1)

# Step 13 - Evaluate Model

In [51]:
# if the model makes 100 predictions , how many are correct?
accuracy =accuracy_score(y_test, y_pred)   # comparing actual vs predicted

# Why we use y_proba instead of y_pred for AUC

A- Area
U - under
R - Curve

- how well can the model sperate churners from non- churners 
test all the possible cases - uses probabilities -
logicttic regresssion --- .predict
                      --- .predict_proba

y_pred = model.predict(X_test) -- BINARY PREDICTIONS
[0,1,0,0,1]
0 - wont churn
1 - will churn

y_proba = model.predict_proba(X_test)
[0.23, 0.787,0.45.....]

0.23 = 23% chance /probablity that this person will chur
0.78 = 78% chance that this person will churn


In [52]:
# Area under the curve - how well can your model seperate churner from non-churners
auc = roc_auc_score(y_test, y_proba)

In [53]:
# classification report-- performance overview of all the class metrics
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.89      0.52      0.66      1787
           1       0.10      0.44      0.16       213

    accuracy                           0.51      2000
   macro avg       0.49      0.48      0.41      2000
weighted avg       0.80      0.51      0.60      2000



1 - Precision - when model says "something will churn" how precise/certain/sure is it? -

- Formula = correct "churn" prediction/ All "churn" prediction

model prediction 200 customers will churn
only 22 customers churned 

precision = 22/200 = 0.11 = 11%
89% of churn alerts are false alarms

2 - Recall --- in all the customers who actually churned . how many did we catch?

formula - caught "churn"/ Total actual churners

213 customers actually churned 
model caught 114 of them
missied 99 of them 

recall = 114/213 = 0.53 = 53%
it was missing 46% of churners 

3 - F1 - score - Average of precision and recall()
formula - (precision  * Recall) / (precision + Recall)
-- balances preciosn and recall into one number
-- precision = 11
--- recall = 12
--- F1 = (11 * 12) / (11 + 12)
 F1 = .......


In [54]:
# A 2X2 tables showing all the prediction outcomes
cm = confusion_matrix(y_test, y_pred)

# displaye confusion matrix using pandas dataframe for better visualization
cm_df = pd.DataFrame(cm, index=['Actual_Non_Churn', 'Actual_Churn'], columns=['Predicted_Non_Churn', 'Predicted_Churn'])
print(cm_df)

                  Predicted_Non_Churn  Predicted_Churn
Actual_Non_Churn                  929              858
Actual_Churn                      119               94


1 - y_test = [1,0,1,0,0,0,1] ---- the original data telling you who churned and who didnt churn

customer 1 : Actually didnt churn (Actual = 0)
customer 2 : Actually did churn (Actual = 1)

we had historical data


2 - Trained our model and all
y_pred = [0,1,0,1,0,0]   ---- this i the data the model predictions

customer 1 : Actually didnt churn (predicted = 0)
customer 2 : Actually did churn (predicted = 1)


Compare Actual vs Predicted
customer    Actual  Predicted  match'   outcome
1            0       0          yes    correct! (True negative)
2            1       1          yes    correct ! (True Positive)


# Step 14 : Feature Importance

In [56]:
#idenitfy features most influencing churn predictions
feature_importance = pd.DataFrame({
    'feature' : feature_cols,
    'coefficient' : model.coef_[0],
    'abs_coefficient' : np.abs(model.coef_[0])
}).sort_values(by='abs_coefficient', ascending=False)

print(feature_importance)

            feature  coefficient  abs_coefficient
2      city_encoder     0.072260         0.072260
1  avg_rating_given     0.046763         0.046763
3   loyalty_encoder    -0.045223         0.045223
4      was_referred    -0.029748         0.029748
0               age    -0.010021         0.010021


#   POSITIVE COEFFICIENT - Higher feature value = Higher churn probability
#  Negave COEFFICIENT - Lower churn probability


# Step 15 : Save Model

In [58]:
# Save the trained model
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
	os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

Saved model to mntdataridewise_project/models/churn_model.pkl


In [61]:
# FIX: Calculate actual metrics (not {...})
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# FIX: Create proper artifacts dictionary
artifacts = {
    'encoders': encoders,  # Contains city, loyalty encoders AND scaler
    'feature_names': feature_cols,
    'feature_importance': feature_importance,
    'metrics': {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    },
    'model_version': '1.0.0',
}

# Save artifacts (use a raw string for Windows path and ensure directory exists)
save_path = r"C:\Users\user\Desktop\NEW PROJECT\mntdataridewise_project\models\churn_model.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(artifacts, save_path)

print("✓ Model and artifacts saved successfully")
print(f"  Model AUC: {auc:.4f}")

✓ Model and artifacts saved successfully
  Model AUC: 0.4971
