<a href="https://colab.research.google.com/github/emailmenojunk/datascience/blob/main/ChurnPredictionMiniProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Install necessary libraries like boto to take care of AWS S3 access
!pip install boto3 pandas s3fs



In [26]:
import pandas as pd
import boto3
from io import StringIO
from google.colab import userdata
import os

# Credentials are stored in the Colab Secrets
# Extract those secrects and use it in teh below code to access S3

try:
    os.environ['AWS_ACCESS_KEY_ID'] = userdata.get('accesskey')
    os.environ['AWS_SECRET_ACCESS_KEY'] = userdata.get('secret')
except KeyError:
    print("ERROR: AWS credentials issue. Please check the Secrects in COLAB")
    raise # Stop execution if there is a credential issue

# --- AWS S3 Bucket Details & Region  ---
S3_BUCKET_NAME = 'amzn-s3-sagemaker-demo-bucket'
# S3 Folder and File Name
S3_KEY = 'SageMakerDemoFolder/churndata.csv'
AWS_REGION = 'us-east-1'

# Initialize S3 client.
s3 = boto3.client('s3', region_name=AWS_REGION)

# Access S3 object and read it directly into a pandas DataFrame
try:
    print(f"Attempting to fetch s3://{S3_BUCKET_NAME}/{S3_KEY}...")
    obj = s3.get_object(Bucket=S3_BUCKET_NAME, Key=S3_KEY)

    # Read the data body and decode it to a string for pandas
    data_body = obj['Body'].read().decode('utf-8')
    data = pd.read_csv(StringIO(data_body))

    print("\n✅ Data successfully loaded from S3.")
    print(f"DF shape: {data.shape}")
    print("\nInitial 5 rows of the data:")
    print(data.head())

except Exception as e:
    print(f"\n❌ An error during S3 data loading: {e}")
    print(f"Is the S3 Key correct ? : '{S3_KEY}'")
    print("Veify AWS permission & region")

Attempting to fetch s3://amzn-s3-sagemaker-demo-bucket/SageMakerDemoFolder/churndata.csv...

✅ Data successfully loaded from S3.
DF shape: (30801, 15)

Initial 5 rows of the data:
   custid  retained   created firstorder lastorder  esent   eopenrate  \
0  6H6T6N         0   9/28/12    8/11/13   8/11/13     29  100.000000   
1  APCENR         1  12/19/10     4/1/11   1/19/14     95   92.631579   
2  7UP6MS         0   10/3/10    12/1/10    7/6/11      0    0.000000   
3  7ZEW8G         0  10/22/10    3/28/11   3/28/11      0    0.000000   
4  8V726M         1  11/27/10   11/29/10   1/28/13     30   90.000000   

   eclickrate  avgorder   ordfreq  paperless  refill  doorstep     favday city  
0    3.448276     14.52  0.000000          0       0         0     Monday  DEL  
1   10.526316     83.69  0.181641          1       1         1     Friday  DEL  
2    0.000000     33.58  0.059908          0       0         0  Wednesday  DEL  
3    0.000000     54.96  0.000000          0       0     

In [27]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Date format %m/%d/%Y) needed
DATE_FORMAT = '%m/%d/%Y'
# Turn any DATE that cannot be parsed into NOT-A-TIME (NaT)
data['created'] = pd.to_datetime(data['created'], format=DATE_FORMAT, errors='coerce')
data['firstorder'] = pd.to_datetime(data['firstorder'], format=DATE_FORMAT, errors='coerce')
data['lastorder'] = pd.to_datetime(data['lastorder'], format=DATE_FORMAT, errors='coerce')

# Define Target Variable and drop column 'retained'
ACTUAL_TARGET_COL_NAME = 'retained'
data['target'] = data[ACTUAL_TARGET_COL_NAME].apply(lambda x: 1 if x == 'Yes' else 0)
data.drop(ACTUAL_TARGET_COL_NAME, axis=1, inplace=True)

print("\nTarget Variable Distribution:")
print(data['target'].value_counts(normalize=True))

# Handle Missing Values in Numerical Columns
data['esent'].fillna(0, inplace=True)
data['eopenrate'].fillna(0, inplace=True)
data['eclickrate'].fillna(0, inplace=True)
data['ordfreq'].fillna(0, inplace=True)

# Handle missing value for 'avgorder') with the median
for col in ['avgorder']:
    data[col].fillna(data[col].median(), inplace=True)

# Feature Engineering
# FEATURE 1: Customer Tenure (Calculate Account Age using account created and today)
data['Customer_Tenure'] = (datetime.today() - data['created']).dt.days
data['Customer_Tenure'].fillna(data['Customer_Tenure'].median(), inplace=True)

# FEATURE 2: Recency (Calculate Recency using Last Order details)
recency_delta = (datetime.today() - data['lastorder']).dt.days
# If Recency is null then fill it with their full Customer_Tenure
data['Recency_Days'] = recency_delta.fillna(data['Customer_Tenure'])

# FEATURE 3: Email Engagement Score (Calculate using Sent * Open/Click rate)
data['Email_Engagement_Score'] = (data['eopenrate'] + data['eclickrate']) * data['esent']

# FEATURE 4: Normalized Order Frequency
# Calculates the average daily order frequency by dividing the order frequency by the customer's adjusted tenure
data['Customer_Tenure_Adj'] = data['Customer_Tenure'].apply(lambda x: x if x > 0 else 1)
data['Avg_Daily_Order_Freq'] = data['ordfreq'] / data['Customer_Tenure_Adj']

#  Clean up / drop unnecessary columns
data.drop(['custid', 'created', 'firstorder', 'lastorder', 'Customer_Tenure_Adj'], axis=1, inplace=True)

print("\n✅ Feature Engineering and Data Cleaning Complete.")
print(f"Final DataFrame Shape: {data.shape}")

# Splitting Features X & Target Y
X = data.drop('target', axis=1)
y = data['target']
# Tranin and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Feature Types
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns
#preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

print("\nData splitting and preprocessing pipelines defined.")


Target Variable Distribution:
target
0    1.0
Name: proportion, dtype: float64

✅ Feature Engineering and Data Cleaning Complete.
Final DataFrame Shape: (30801, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['esent'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['eopenrate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave


Data splitting and preprocessing pipelines defined.


In [28]:
from sklearn.ensemble import RandomForestClassifier

# Define Random Forest Classifier Model
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))

])

print("\n...Starting RF Model Training...")
# Train the model
rf_model.fit(X_train, y_train)

print("✅ Training of the model completed.")


...Starting RF Model Training...


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


✅ Training of the model completed.


In [29]:
from sklearn.metrics import roc_auc_score, roc_curve

# predict probabiity on the test set
proba_output = rf_model.predict_proba(X_test)

# Check the shape of the probability output
if proba_output.shape[1] == 1:
    print("predict_proba returned only one column. Assuming this is negative class of churn")
    # If only one column is returned, it is often the negative class of churn
    y_pred_proba = 1 - proba_output[:, 0]
else:
    # This is the normal case: two columns are returned. We take positive class of retained at index 1
    y_pred_proba = proba_output[:, 1]

# calculate the Area Under the Receiver Operating Characteristic Curve (AUC ROC) score.
auc_roc = roc_auc_score(y_test, y_pred_proba)

print(f"\n--- Model Evaluation (AUC ROC) ---")
print(f"AUC ROC Score on Test Set: **{auc_roc:.4f}**")

predict_proba returned only one column. Assuming this is negative class of churn

--- Model Evaluation (AUC ROC) ---
AUC ROC Score on Test Set: **nan**




In [30]:
import joblib

# Save the trained model
model_filename = 'churn_prediction_rf_model.joblib'

print(f"\n✅ File Used For Saving The Model : {model_filename}")

# Load the model
loaded_model = joblib.load(model_filename)

# Get first fow from test set to validate the model
sample_data = X_test.iloc[[0]]

# Make a prediction using the loaded pipeline on the sample data
test_prediction = loaded_model.predict(sample_data)[0]

# Calculate Predicted Probability
proba_output = loaded_model.predict_proba(sample_data)

if proba_output.shape[1] == 1:
    # If size is 1, it is usually Churn
    test_proba_churn = proba_output[0, 0]

    print("  Note: Using P(Index 0) for Churn Probability.")
else:
    # Normal case: two columns are returned. Churn is index 0.
    test_proba_churn = proba_output[0, 0]
    print("  Note: Using P(Index 0) for Churn Probability.")

print("\n--- Deployment Test ---")
print(f"Sample Input - First Row of Test Data:\n{sample_data.to_string(index=False)}")
print(f"Predicted Status (0=Churn, 1=Retained): {test_prediction}")
print(f"Prediction Probability of CHURN: **{test_proba_churn:.4f}**")


✅ File Used For Saving The Model : churn_prediction_rf_model.joblib
  Note: Using P(Index 0) for Churn Probability.

--- Deployment Test ---
Sample Input - First Row of Test Data:
 esent  eopenrate  eclickrate  avgorder  ordfreq  paperless  refill  doorstep favday city  Customer_Tenure  Recency_Days  Email_Engagement_Score  Avg_Daily_Order_Freq
    30  13.333333         0.0     40.02      0.0          1       0         0 Monday  BOM              NaN           NaN                   400.0                   0.0
Predicted Status (0=Churn, 1=Retained): 0
Prediction Probability of CHURN: **1.0000**
