<a href="https://colab.research.google.com/github/emailmenojunk/datascience/blob/main/ChurnPredictionMiniProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries like boto to take care of AWS S3 access
!pip install boto3 pandas s3fs



In [2]:
import pandas as pd
import boto3
from io import StringIO
from google.colab import userdata
import os

# Retrieve AWS credentials securely from Colab Secrets
# These are needed to access the S3 bucket.
try:
    os.environ['AWS_ACCESS_KEY_ID'] = userdata.get('accesskey')
    os.environ['AWS_SECRET_ACCESS_KEY'] = userdata.get('secret')
except KeyError:
    print("ERROR: AWS credentials issue. Please check the Secrects in COLAB")
    raise # Stop execution if there is a credential issue

# --- AWS S3 Bucket Details & Region  ---
S3_BUCKET_NAME = 'amzn-s3-sagemaker-demo-bucket'
# The specific folder and file path within the S3 bucket
S3_KEY = 'SageMakerDemoFolder/churndata.csv'
AWS_REGION = 'us-east-1'

# Initialize the S3 client to interact with AWS S3
s3 = boto3.client('s3', region_name=AWS_REGION)

# Access the S3 object (the CSV file) and read its content directly into a pandas DataFrame
try:
    print(f"Attempting to fetch s3://{S3_BUCKET_NAME}/{S3_KEY}...")
    obj = s3.get_object(Bucket=S3_BUCKET_NAME, Key=S3_KEY)

    # Read the data from the S3 object's body and decode it from bytes to a string
    data_body = obj['Body'].read().decode('utf-8')
    # Use StringIO to treat the string data as a file for pandas to read
    data = pd.read_csv(StringIO(data_body))

    print("\n Data successfully loaded from S3.")
    print(f"DF shape: {data.shape}")
    print("\nInitial 5 rows of the data:")
    print(data.head())

except Exception as e:
    print(f"\n An error during S3 data loading: {e}")
    print(f"Is the S3 Key correct ? : '{S3_KEY}'")
    print("Veify AWS permission & region")

# --- Target Variable Conversion ---
# Convert the 'retained' column into a numerical 'target' variable (1 for Retained, 0 for Churn)
# This is done here as part of the initial data loading and basic cleaning.
ACTUAL_TARGET_COL_NAME = 'retained'
data.loc[:, 'target'] = data[ACTUAL_TARGET_COL_NAME].astype(int)
print("\n--- Target Variable Distribution ---")
print(data['target'].value_counts())
# Drop the original 'retained' column as it's no longer needed
data.drop(ACTUAL_TARGET_COL_NAME, axis=1, inplace=True)
print("\nOriginal 'retained' column dropped, 'target' column created.")

Attempting to fetch s3://amzn-s3-sagemaker-demo-bucket/SageMakerDemoFolder/churndata.csv...

 Data successfully loaded from S3.
DF shape: (30801, 15)

Initial 5 rows of the data:
   custid  retained   created firstorder lastorder  esent   eopenrate  \
0  6H6T6N         0   9/28/12    8/11/13   8/11/13     29  100.000000   
1  APCENR         1  12/19/10     4/1/11   1/19/14     95   92.631579   
2  7UP6MS         0   10/3/10    12/1/10    7/6/11      0    0.000000   
3  7ZEW8G         0  10/22/10    3/28/11   3/28/11      0    0.000000   
4  8V726M         1  11/27/10   11/29/10   1/28/13     30   90.000000   

   eclickrate  avgorder   ordfreq  paperless  refill  doorstep     favday city  
0    3.448276     14.52  0.000000          0       0         0     Monday  DEL  
1   10.526316     83.69  0.181641          1       1         1     Friday  DEL  
2    0.000000     33.58  0.059908          0       0         0  Wednesday  DEL  
3    0.000000     54.96  0.000000          0       0      

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# This step prevents data corruption from previous cells.
try:
    obj = s3.get_object(Bucket=S3_BUCKET_NAME, Key=S3_KEY)
    data = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))
    print("Data successfully reloaded and ready for feature engineering")
except NameError:
    print("Error - Ensure Cell 1 (S3 Load) has been run to define 's3' and S3 details.")
    raise # Stop execution if there is a credential issue

# Date format mm/dd/yy (Standardized to resolve original error)
DATE_FORMAT = '%m/%d/%y'

#  Date Conversion (Using .loc )
data.loc[:, 'created'] = pd.to_datetime(data['created'], format=DATE_FORMAT, errors='coerce')
data.loc[:, 'firstorder'] = pd.to_datetime(data['firstorder'], format=DATE_FORMAT, errors='coerce')
data.loc[:, 'lastorder'] = pd.to_datetime(data['lastorder'], format=DATE_FORMAT, errors='coerce')

# Target Conversion Logic
ACTUAL_TARGET_COL_NAME = 'retained'
# 1. Create the 'target' column based on 'retained' and convert to integer
data.loc[:, 'target'] = data[ACTUAL_TARGET_COL_NAME].astype(int)
print("\nTarget Variable Distribution")
print(data['target'].value_counts())
# 2. Drop the original 'retained' column
data.drop(ACTUAL_TARGET_COL_NAME, axis=1, inplace=True)


#  Initial Missing Value Handling (Using .loc )
data.loc[:, 'esent'] = data['esent'].fillna(0)
data.loc[:, 'eopenrate'] = data['eopenrate'].fillna(0)
data.loc[:, 'eclickrate'] = data['eclickrate'].fillna(0)
data.loc[:, 'ordfreq'] = data['ordfreq'].fillna(0)

median_avgorder = data['avgorder'].median()
data.loc[:, 'avgorder'] = data['avgorder'].fillna(median_avgorder)


#  Feature Engineering

# FEATURE 1: Customer Tenure (Account Age in Days)
time_delta_tenure = (datetime.today() - data['created']).astype('timedelta64[ns]')
data.loc[:, 'Customer_Tenure'] = time_delta_tenure.dt.days
median_tenure = data['Customer_Tenure'].median()
data.loc[:, 'Customer_Tenure'] = data['Customer_Tenure'].fillna(median_tenure)


# FEATURE 2: Recency (Days Since Last Order)
recency_delta = (datetime.today() - data['lastorder']).astype('timedelta64[ns]')
data.loc[:, 'Recency_Days'] = recency_delta.dt.days
data.loc[:, 'Recency_Days'] = data['Recency_Days'].fillna(data['Customer_Tenure'])
median_recency = data['Recency_Days'].median()
data.loc[:, 'Recency_Days'] = data['Recency_Days'].fillna(median_recency)


# FEATURE 3: Email Engagement Score (Composite Feature)
data.loc[:, 'Email_Engagement_Score'] = (data['eopenrate'] + data['eclickrate']) * data['esent']
median_ees = data['Email_Engagement_Score'].median()
data.loc[:, 'Email_Engagement_Score'] = data['Email_Engagement_Score'].fillna(median_ees)


# FEATURE 4: Normalized Order Frequency
data.loc[:, 'Customer_Tenure_Adj'] = data['Customer_Tenure'].apply(lambda x: x if x > 0 else 1)
data.loc[:, 'Avg_Daily_Order_Freq'] = data['ordfreq'] / data['Customer_Tenure_Adj']
median_avg_freq = data['Avg_Daily_Order_Freq'].median()
data.loc[:, 'Avg_Daily_Order_Freq'] = data['Avg_Daily_Order_Freq'].fillna(median_avg_freq)


# Clean up / drop unnecessary columns
data.drop(['custid', 'created', 'firstorder', 'lastorder', 'Customer_Tenure_Adj'], axis=1, inplace=True)

print("\n Feature Engineering and Data Cleaning Complete.")
print(f"Final DataFrame Shape: {data.shape}")

# Define Features and Target
X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3, random_state=123, stratify=y
)
print(f"y_test class counts after re-split: \n{y_test.value_counts()}")

# Define the preprocessing steps
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)
print("\nData splitting and preprocessing pipelines defined.")

Data successfully reloaded and ready for feature engineering

Target Variable Distribution
target
1    24472
0     6329
Name: count, dtype: int64

 Feature Engineering and Data Cleaning Complete.
Final DataFrame Shape: (30801, 15)
y_test class counts after re-split: 
target
1    7342
0    1899
Name: count, dtype: int64

Data splitting and preprocessing pipelines defined.


In [4]:
print("\n Vaidate the First 10 Rows of Engineered Features ")

# Display the first 10 rows of the complete feature set (X)
# Display Days columns
print(X.head(10)[['Customer_Tenure', 'Recency_Days', 'Email_Engagement_Score', 'Avg_Daily_Order_Freq', 'city']].to_string())


 Vaidate the First 10 Rows of Engineered Features 
   Customer_Tenure  Recency_Days  Email_Engagement_Score  Avg_Daily_Order_Freq city
0           4789.0        4472.0                  3000.0              0.000000  DEL
1           5438.0        4311.0                  9800.0              0.000033  DEL
2           5515.0        5239.0                     0.0              0.000011  DEL
3           5496.0        5339.0                     0.0              0.000000  BOM
4           5460.0        4667.0                  3100.0              0.000002  BOM
5           6200.0        4316.0                  4400.0              0.000023  DEL
6           5402.0        4314.0                  3000.0              0.000023  BOM
7           5818.0        4314.0                  2800.0              0.000007  DEL
8           5530.0        5478.0                     0.0              0.000000  DEL
9           5428.0        4351.0                  3500.0              0.000025  DEL


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define Random Forest Classifier Model
# Define Pipeline to combine preprocessing and the classifier
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor), # Apply the preprocessing steps
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')) # Define the Random Forest classifier
])


print("\n...Starting RF Model Training...")
# Train the model using the training data
rf_model.fit(X_train, y_train)

print("Training of the model completed.")


...Starting RF Model Training...
Training of the model completed.


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

# 1. Define the F1 scorer
f1_scorer = make_scorer(f1_score, pos_label=1)

print("\n Model Evaluation - 5Fold CV - F1 Score ")

# 2. Perform Cross-Validation on the ENTIRE dataset (X, y)
f1_scores = cross_val_score(
    rf_model, # pipeline
    X, #  feature set
    y, # target set
    cv=5, #  5 folds
    scoring=f1_scorer,
    n_jobs=-1 #  all cores
)

# 3. Calculate the average and standard deviation of the scores
mean_f1 = f1_scores.mean()
std_f1 = f1_scores.std()

print(f"\nIndividual Fold F1 Scores: {f1_scores}")
print(f"\nAverage F1 Score (5-Fold CV): **{mean_f1:.4f}**")
print(f"Standard Deviation: (+/- {std_f1:.4f})")


 Model Evaluation - 5Fold CV - F1 Score 

Individual Fold F1 Scores: [0.98536388 0.99406224 0.89931401 0.90630256 0.95333658]

Average F1 Score (5-Fold CV): **0.9477**
Standard Deviation: (+/- 0.0391)


In [7]:
import joblib
import numpy as np


# Define the filename for the model
model_filename = 'churn_prediction_rf_model.joblib'

# Save the trained model to the file
try:
    joblib.dump(rf_model, model_filename)
    print(f"\n Trained model saved successfully as {model_filename}")
except NameError:
    print("\nError - 'rf_model' not found. Please ensure the model training cell was run.")
except Exception as e:
    print(f"\n An error occurred while saving the model: {e}")
# Load the trained model from the file
model_filename = 'churn_prediction_rf_model.joblib'
try:
    loaded_model = joblib.load(model_filename)
    print(f"\n Model loaded from {model_filename}")
except FileNotFoundError:
    print(f"\n Error: Model file '{model_filename}' not found. Please run previous cells to train and save the model.")
    loaded_model = None # Set to None to avoid errors later


if loaded_model is not None:
    # Get sample data for row 1 from the test set (iloc for row index)
    if len(X_test) > 1:
        sample_data_row_1 = X_test.iloc[[1]]
        original_index_row_1 = X_test.index[1] # Get the original index label
        print(f"\n Successfully selected sample data for row 1 (original index {original_index_row_1}).")

        # Make a prediction using the loaded pipeline on the sample data
        test_prediction_row_1 = loaded_model.predict(sample_data_row_1)[0]

        # Calculate Predicted Probability
        proba_output_row_1 = loaded_model.predict_proba(sample_data_row_1)

        # Handle the case where predict_proba returns only one column
        if proba_output_row_1.shape[1] == 1:
            # If only one column, it's typically the probability of the negative class (Churn)
            test_proba_churn_row_1 = proba_output_row_1[0, 0]
            test_proba_retained_row_1 = 1 - test_proba_churn_row_1 # Probability of Retained is 1 - Churn probability
            print("  Note: predict_proba returned 1 column. Assuming it's Churn probability (Index 0).")
        else:
            # Normal case: two columns are returned. Churn is index 0, Retained is index 1.
            test_proba_churn_row_1 = proba_output_row_1[0, 0]
            test_proba_retained_row_1 = proba_output_row_1[0, 1]
            print("  Note: predict_proba returned 2 columns. Using Index 0 for Churn, Index 1 for Retained.")


        print("\n--- Deployment Test for Sample at Row 1 of Test Data ---")
        print(f"Sample Input (Original Index {original_index_row_1}):\n{sample_data_row_1.to_string(index=False)}")
        print(f"Predicted Status (0=Churn, 1=Retained): {test_prediction_row_1}")
        print(f"Prediction Probability of CHURN: {test_proba_churn_row_1:.4f}")
        print(f"Prediction Probability of RETAINED: **{test_proba_retained_row_1:.4f}**")

    else:
        print("\n Test set does not have a row at index 1 (it might be empty or too small).")

else:
    print("\nCannot perform deployment test as model was not loaded.")


 Trained model saved successfully as churn_prediction_rf_model.joblib

 Model loaded from churn_prediction_rf_model.joblib

 Successfully selected sample data for row 1 (original index 18898).
  Note: predict_proba returned 2 columns. Using Index 0 for Churn, Index 1 for Retained.

--- Deployment Test for Sample at Row 1 of Test Data ---
Sample Input (Original Index 18898):
 esent  eopenrate  eclickrate  avgorder  ordfreq  paperless  refill  doorstep   favday city  Customer_Tenure  Recency_Days  Email_Engagement_Score  Avg_Daily_Order_Freq
    31   6.451613    3.225806     40.02      0.0          1       1         0 Thursday  BOM           4395.0        4395.0                   300.0                   0.0
Predicted Status (0=Churn, 1=Retained): 1
Prediction Probability of CHURN: 0.0000
Prediction Probability of RETAINED: **1.0000**
