In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
 
# Load the data
merged_df = pd.read_csv("cleanedData.csv")
 
# Filter data for April 2024 and other dates
test_data_with_date = merged_df[merged_df["trxdate"] == '2024-04']
train_data_with_date = merged_df[merged_df["trxdate"] != '2024-04']
 
# Drop trxdate column
test_data_april = test_data_with_date.drop(columns=["trxdate"])
train_data_april = train_data_with_date.drop(columns=["trxdate"])
 
# Separate features and target variable
X = train_data_april.drop(columns=["Churn"])
y = train_data_april["Churn"]
 
# Function to create different scenarios and evaluate model
def evaluate_model_for_ratios(churn_ratio):
    churn_data = train_data_april[train_data_april["Churn"] == 1]
    non_churn_data = train_data_april[train_data_april["Churn"] == 0]
 
    # Number of non-churn samples to match the desired ratio
    non_churn_sample_size = int(len(churn_data) * (1 - churn_ratio) / churn_ratio)
 
    if non_churn_sample_size > len(non_churn_data):
        non_churn_sample_size = len(non_churn_data)
 
    non_churn_sample = non_churn_data.sample(non_churn_sample_size, random_state=42)
 
    # Combine churn and non-churn data
    balanced_data = pd.concat([churn_data, non_churn_sample])
    X_balanced = balanced_data.drop(columns=["Churn"])
    y_balanced = balanced_data["Churn"]
 
    # Split data into training and validation sets
    X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
 
    # Train Logistic Regression model
    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    model_lr.fit(X_train, y_train)
 
    # Evaluate model on validation set
    y_val_pred = model_lr.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred)
    val_recall = recall_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
 
    # Evaluate model on test set
    y_test_pred = model_lr.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
 
    print(f"Churn Ratio: {churn_ratio * 100}%")
    print("Validation Metrics:")
    print("Accuracy:", val_accuracy)
    print("Precision:", val_precision)
    print("Recall:", val_recall)
    print("F1 Score:", val_f1)
    print()
    print("Test Metrics:")
    print("Accuracy:", test_accuracy)
    print("Precision:", test_precision)
    print("Recall:", test_recall)
    print("F1 Score:", test_f1)
    print()
 
    # Predict and evaluate on April 2024 data
    test_data_april_without_churn = test_data_april.drop(columns=["Churn"])
    april_prediction = model_lr.predict(test_data_april_without_churn)
    april_prediction_proba = model_lr.predict_proba(test_data_april_without_churn)[:, 1]
 
    comparison_df = pd.DataFrame({
        "Customer_id": test_data_april["Customer_id"],
        "True_Churn": test_data_april["Churn"],
        "Predicted_Churn": april_prediction,
        "Predicted_Churn_Probability": april_prediction_proba
    })
 
    print("Classification Report for April 2024:")
    print(classification_report(test_data_april["Churn"], april_prediction))
    print("Confusion Matrix for April 2024:")
    print(confusion_matrix(test_data_april["Churn"], april_prediction))
    print()
 
# Evaluate for different churn ratios
ratios = [0.5, 0.4, 0.3, 0.2]
for ratio in ratios:
    evaluate_model_for_ratios(ratio)

Churn Ratio: 50.0%
Validation Metrics:
Accuracy: 0.6352014900101591
Precision: 0.5923319582625576
Recall: 0.8372491853884411
F1 Score: 0.693810843459106

Test Metrics:
Accuracy: 0.6380259036654533
Precision: 0.6014187808103884
Recall: 0.8388395103136005
F1 Score: 0.7005602240896358

Classification Report for April 2024:
              precision    recall  f1-score   support

           0       0.96      0.58      0.72     45852
           1       0.17      0.77      0.28      5317

    accuracy                           0.60     51169
   macro avg       0.57      0.67      0.50     51169
weighted avg       0.87      0.60      0.67     51169

Confusion Matrix for April 2024:
[[26484 19368]
 [ 1218  4099]]

Churn Ratio: 40.0%
Validation Metrics:
Accuracy: 0.6009481882831019
Precision: 0.6386554621848739
Recall: 0.03805708562844266
F1 Score: 0.0718336483931947

Test Metrics:
Accuracy: 0.6099146688338074
Precision: 0.5896656534954408
Recall: 0.03333906169444922
F1 Score: 0.0631099544567339


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Churn Ratio: 20.0%
Validation Metrics:
Accuracy: 0.8002031832035218
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Test Metrics:
Accuracy: 0.8010903796010972
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Classification Report for April 2024:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     45852
           1       0.00      0.00      0.00      5317

    accuracy                           0.90     51169
   macro avg       0.45      0.50      0.47     51169
weighted avg       0.80      0.90      0.85     51169

Confusion Matrix for April 2024:
[[45852     0]
 [ 5317     0]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
