<a href="https://colab.research.google.com/github/fxs2596/ECommerce/blob/main/ECommerce_ML_Train_Test_store.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# --- Complete Google Colab Notebook Code for Churn Prediction ---

# --- Step 0: Import Necessary Libraries ---
import pandas as pd
import numpy as np # Import numpy for np.inf, np.nan
# Removed: import plotly.express as px # Removed visualization library
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import datetime # Needed for date calculations


print("Libraries imported successfully.")

# --- Step 1: Load the Prepared ML Dataset ---

# Read the prepared ML dataset CSV from the current working directory.
# 'customer_churn_ml_dataset.csv' must already be uploaded to the Colab environment.
file_name = 'customer_churn_ml_dataset.csv'

# Keep the try body limited to the read itself so that only a missing file
# is handled here; any other failure surfaces with its own traceback.
try:
    df_ml = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"\nError: File '{file_name}' not found. Please upload the CSV file to your Colab environment.")
    # In Colab, you might need to upload it using the file browser on the left
    # or mount Google Drive if the file is there.
    # Example for mounting Google Drive:
    # from google.colab import drive
    # drive.mount('/content/drive')
    # file_name = '/content/drive/My Drive/path/to/your/customer_churn_ml_dataset.csv'
    #
    # Re-raise rather than calling exit(): inside a notebook kernel exit() asks
    # the kernel itself to shut down (discarding all state), and in a plain
    # script it would terminate with a success status. Re-raising stops the
    # cell with a normal traceback and leaves the session usable.
    raise

print(f"\nML dataset '{file_name}' loaded successfully! Shape: {df_ml.shape}")
print("Columns:", df_ml.columns.tolist())
print("\nFirst 5 rows of the ML dataset:")
# NOTE: the preview includes every column of the CSV (customer identifiers
# included); clear the cell output before sharing if the data is not synthetic.
print(df_ml.head())

# --- Step 2: Data Preparation and Feature Engineering (if not already in CSV) ---
# Note: Our CSV already contains the engineered features (RFM, Tenure, AOV)
# and the churn label, so this step is primarily for verification or adding new features.
# If you were starting from raw orders data, this section would be more extensive.

print("\nVerifying/Preparing data structure...")

# Fail fast (with an exception, not exit(), so the notebook kernel survives)
# when any column the pipeline needs is absent from the CSV.
required_cols = ['Recency', 'Frequency', 'Monetary', 'DaysSinceFirstOrder', 'AverageOrderValue', 'churn']
missing_cols = [col for col in required_cols if col not in df_ml.columns]
if missing_cols:
    raise ValueError(f"Essential columns for ML are missing in the CSV. Missing columns: {missing_cols}")

# Force features and label to numeric dtypes; anything unparseable becomes NaN.
# Every column below is guaranteed to exist by the check above.
numerical_features = ['Recency', 'Frequency', 'Monetary', 'DaysSinceFirstOrder', 'AverageOrderValue'] # Define numerical features list
for col in numerical_features + ['churn']:
    df_ml[col] = pd.to_numeric(df_ml[col], errors='coerce')

# Treat +/-inf like a missing value (fillna alone leaves infinities in place and
# the estimator rejects them), then fill feature gaps with 0.
# For simplicity we impute with 0 (consider more sophisticated imputation for real projects).
df_ml[numerical_features] = (
    df_ml[numerical_features]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# A row whose label could not be parsed cannot be used for supervised training,
# and a NaN in the target would break the stratified split / model fit later.
# Drop such rows explicitly and report how many were removed.
n_bad_labels = int(df_ml['churn'].isna().sum())
if n_bad_labels:
    print(f"Dropping {n_bad_labels} row(s) with a missing or non-numeric 'churn' label.")
    df_ml = df_ml.dropna(subset=['churn']).reset_index(drop=True)


# --- Step 3: Separate Features (X) and Target (y) ---

print("\nSeparating features (X) and target (y)...")

# Name of the label column the classifier is trained to predict.
target = 'churn'

# Input columns fed to the model, one per line for easy editing.
ml_features = [
    'Recency',
    'Frequency',
    'Monetary',
    'DaysSinceFirstOrder',
    'AverageOrderValue',
]

# Slice the label Series and the feature DataFrame out of the prepared dataset.
y = df_ml[target]
X = df_ml[ml_features]

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("\nFeatures used:", ml_features)


# --- Step 4: Split Data into Training and Testing Sets ---
# The same df_ml-derived X / y are divided into a training part and a held-out part.

print("\nSplitting data into training (80%) and testing (20%) sets...")

# stratify=y keeps the churn / non-churn proportions the same in both parts;
# random_state pins the shuffle so the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTrain set shape (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Test set shape (X_test, y_test): {X_test.shape}, {y_test.shape}")


# --- Step 5: Select and Train the Logistic Regression Model ---

print("\nInitializing and training Logistic Regression model...")

# Build the classifier and fit it on the training split in one expression
# (fit() returns the estimator itself).
#   solver='liblinear' - often a good choice for small datasets and binary classification
#   random_state=42    - fixed seed for reproducibility
log_reg_model = LogisticRegression(
    random_state=42,
    solver='liblinear',
).fit(X_train, y_train)

print("\nLogistic Regression model training complete.")


# --- Step 6: Make Predictions and Evaluate the Model ---

print("\nMaking predictions on the test set...")

# Hard class labels (0 or 1) for the held-out rows.
y_pred = log_reg_model.predict(X_test)

# Probability of the positive class (churn=1): second column of predict_proba.
y_prob = log_reg_model.predict_proba(X_test)[:, 1]

print("\nEvaluating the model...")

# Label-based metrics compare y_pred with the true labels; ROC AUC is computed
# from the predicted probabilities and is a good metric for imbalanced data.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print("\n--- Logistic Regression Model Evaluation on Test Set ---")
# Report every metric with the same four-decimal formatting.
for _metric_label, _metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{_metric_label}: {_metric_value:.4f}")

print("\nModel training and evaluation complete.")

# At this point the notebook holds a fitted Logistic Regression model
# (log_reg_model) together with its test-set metrics.
# The notebook intentionally covers only the core ML pipeline steps.
# A natural follow-up is to score *current* customers with this model (given a
# separate dataset of current customers that has no churn label) and, if
# useful, surface those churn probabilities in your dashboard.


Libraries imported successfully.

ML dataset 'customer_churn_ml_dataset.csv' loaded successfully! Shape: (432, 8)
Columns: ['customer_id', 'name', 'Recency', 'Frequency', 'Monetary', 'DaysSinceFirstOrder', 'AverageOrderValue', 'churn']

First 5 rows of the ML dataset:
   customer_id              name  Recency  Frequency  Monetary  \
0            1      Norma Fisher      543          1    227.14   
1            2   Steven Robinson      102          3    555.44   
2            3  Theodore Mcgrath      628          1    308.34   
3            4    Brian Hamilton       72          1     43.37   
4            5       Thomas Moon      165          1    384.37   

   DaysSinceFirstOrder  AverageOrderValue  churn  
0                  543         227.140000      1  
1                  668         185.146667      0  
2                  628         308.340000      1  
3                   72          43.370000      0  
4                  165         384.370000      0  

Verifying/Preparing data st