In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
 
# Read the cleaned transaction table from disk
merged_df = pd.read_csv("cleanedData.csv")

# Hold out the rows tagged '2024-04'; every other row is training material
test_data_with_date = merged_df.loc[merged_df["trxdate"].eq('2024-04')]
train_data_with_date = merged_df.loc[merged_df["trxdate"].ne('2024-04')]

# The date tag has served its purpose, so remove it from both frames
test_data_april = test_data_with_date.drop("trxdate", axis=1)
train_data_april = train_data_with_date.drop("trxdate", axis=1)

# Feature matrix and label vector for the training rows
X = train_data_april.drop("Churn", axis=1)
y = train_data_april["Churn"]
 
# Helper plus entry point for the churn-ratio / forest-size experiments
def _print_split_metrics(title, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one labelled split.

    Args:
        title: Heading line printed before the metrics.
        y_true: Ground-truth labels for the split.
        y_pred: Labels predicted by the model for the same rows.
    """
    print(title)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print()


def evaluate_model_for_ratios_and_estimators(churn_ratio, n_estimators):
    """Train and report a random forest on a re-balanced training sample.

    Every churn row (``Churn == 1``) of the module-level ``train_data_april``
    frame is kept, and non-churn rows (``Churn == 0``) are down-sampled so
    that churn rows make up roughly ``churn_ratio`` of the combined data.
    That data is split 70/15/15 into train/validation/test, a
    ``RandomForestClassifier`` is fitted, and metrics are printed for the
    validation split, the test split and the module-level ``test_data_april``
    hold-out frame.

    Args:
        churn_ratio: Desired fraction of churn rows in the balanced data;
            must satisfy ``0 < churn_ratio < 1``.
        n_estimators: Number of trees in the forest.

    Returns:
        A DataFrame with one row per hold-out record holding the columns
        ``Customer_id``, ``True_Churn``, ``Predicted_Churn`` and
        ``Predicted_Churn_Probability``.

    Raises:
        ValueError: If ``churn_ratio`` is outside the open interval (0, 1),
            or if the balanced data would not contain both classes.
    """
    # 0 would divide by zero below; >= 1 leaves no non-churn rows, which
    # makes ``predict_proba(...)[:, 1]`` fail after the model is trained.
    if not 0 < churn_ratio < 1:
        raise ValueError(
            f"churn_ratio must be strictly between 0 and 1, got {churn_ratio!r}"
        )

    churn_data = train_data_april[train_data_april["Churn"] == 1]
    non_churn_data = train_data_april[train_data_april["Churn"] == 0]

    # Number of non-churn rows needed for the requested ratio, capped at
    # what is actually available.
    non_churn_sample_size = min(
        int(len(churn_data) * (1 - churn_ratio) / churn_ratio),
        len(non_churn_data),
    )

    if churn_data.empty or non_churn_sample_size == 0:
        raise ValueError(
            "balanced training data must contain both churn and non-churn rows"
        )

    non_churn_sample = non_churn_data.sample(non_churn_sample_size, random_state=42)

    # Combine churn and non-churn data
    balanced_data = pd.concat([churn_data, non_churn_sample])
    # NOTE(review): only "Churn" is dropped, so an identifier column such as
    # "Customer_id" appears to remain a model feature - confirm this is
    # intended, since an ID carries no generalisable signal.
    X_balanced = balanced_data.drop(columns=["Churn"])
    y_balanced = balanced_data["Churn"]

    # 70% train, then the remaining 30% is halved into validation and test
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_balanced, y_balanced, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    # Train Random Forest model
    model_rf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=20, random_state=42
    )
    model_rf.fit(X_train, y_train)

    y_val_pred = model_rf.predict(X_val)
    y_test_pred = model_rf.predict(X_test)

    print(f"Churn Ratio: {churn_ratio * 100}% | n_estimators: {n_estimators}")
    _print_split_metrics("Validation Metrics:", y_val, y_val_pred)
    _print_split_metrics("Test Metrics:", y_test, y_test_pred)

    # Predict and evaluate on the hold-out (April 2024) data, which keeps
    # its original, un-balanced class distribution.
    april_features = test_data_april.drop(columns=["Churn"])
    april_prediction = model_rf.predict(april_features)
    # Column 1 of predict_proba is the second class in model_rf.classes_
    april_prediction_proba = model_rf.predict_proba(april_features)[:, 1]

    comparison_df = pd.DataFrame({
        "Customer_id": test_data_april["Customer_id"],
        "True_Churn": test_data_april["Churn"],
        "Predicted_Churn": april_prediction,
        "Predicted_Churn_Probability": april_prediction_proba,
    })

    print("Classification Report for April 2024:")
    print(classification_report(test_data_april["Churn"], april_prediction))
    print("Confusion Matrix for April 2024:")
    print(confusion_matrix(test_data_april["Churn"], april_prediction))
    print()

    # Previously built and discarded; returned so callers can inspect
    # per-customer predictions.
    return comparison_df
 
 
# Sweep the full grid: every churn ratio paired with every forest size,
# ratios varying slowest so the printed output is grouped by ratio.
ratios = [0.6, 0.5, 0.4, 0.3, 0.2]
n_estimators_values = [50, 100, 200]
for ratio, n_estimators in [(r, n) for r in ratios for n in n_estimators_values]:
    evaluate_model_for_ratios_and_estimators(
        churn_ratio=ratio,
        n_estimators=n_estimators,
    )

Churn Ratio: 60.0% | n_estimators: 50
Validation Metrics:
Accuracy: 0.8907853296759118
Precision: 0.8797970187123375
Recall: 0.9459505541346973
F1 Score: 0.9116752937309999

Test Metrics:
Accuracy: 0.8924217797643235
Precision: 0.8801894238358327
Recall: 0.9489448604492853
F1 Score: 0.9132749160592908

Classification Report for April 2024:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     45852
           1       0.51      0.88      0.64      5317

    accuracy                           0.90     51169
   macro avg       0.75      0.89      0.79     51169
weighted avg       0.94      0.90      0.91     51169

Confusion Matrix for April 2024:
[[41320  4532]
 [  638  4679]]

Churn Ratio: 60.0% | n_estimators: 100
Validation Metrics:
Accuracy: 0.8882454536218632
Precision: 0.8749016522423289
Recall: 0.947996589940324
F1 Score: 0.9099836333878887

Test Metrics:
Accuracy: 0.8890694839496139
Precision: 0.877763739734681
Recall: 0.9458815520