## Imports and Data Loading

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from joblib import dump

In [12]:
# Load the raw training table. It is kept unmodified under this name: later
# cells work on a copy (df_num) and come back to train_df to look up a
# single customer by CustomerID.
train_df = pd.read_csv('data/train.csv')

## Data preprocessing

**Dummy encoding**

In [13]:
# Working copy that will receive the dummy-encoded columns, so the raw
# train_df is left untouched.
df_num = train_df.copy()

In [33]:
# Export the CustomerID / Churn pairs of the held-out 20% of rows so the test
# customers can be identified outside the notebook.
#
# Only the two required columns are split, and the results are bound to names
# no other cell uses. The previous version re-assigned X, y, X_train, X_test,
# y_train and y_test with raw, unencoded data, which silently replaced the
# modelling matrices whenever this cell was executed after the training cells.
#
# Without stratification, train_test_split selects rows from the number of
# rows and random_state only, so using the same test_size and random_state as
# the modelling split yields the same test rows.
train_CustomerID, test_CustomerID, train_Churn, test_Churn = train_test_split(
    df_num['CustomerID'], df_num['Churn'], test_size=0.2, random_state=42
)

# Both Series carry the same row index, so they line up row by row.
test_df = pd.DataFrame({'CustomerID': test_CustomerID, 'Churn': test_Churn})

# Save the test set 'CustomerID' and 'Churn' to a CSV file (index omitted).
test_df.to_csv('test_CustomerID_Churn.csv', index=False)

In [14]:
# Columns stored with dtype "object" are treated as the categorical ones.
cat_cols = df_num.select_dtypes(include='object').columns

# Print how many distinct values each of them holds.
for name in cat_cols:
    n_unique = df_num[name].nunique()
    print(name, ":", n_unique)

SubscriptionType : 3
PaymentMethod : 4
PaperlessBilling : 2
ContentType : 3
MultiDeviceAccess : 2
DeviceRegistered : 4
GenrePreference : 5
Gender : 2
ParentalControl : 2
SubtitlesEnabled : 2
CustomerID : 243787


In [15]:
# CustomerID has one distinct value per row (see the counts printed above),
# so it is an identifier rather than a feature and must not be dummy-encoded.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the label has already been removed from cat_cols.
cat_cols = cat_cols.drop('CustomerID', errors='ignore')

In [16]:
# One-hot encode every categorical column in a single call.
# The encoding starts from the untouched train_df (df_num is only a copy of
# it up to this point) instead of from df_num itself, so re-running this cell
# rebuilds the same frame rather than failing because the source columns were
# already replaced by their dummies. Non-encoded columns keep their position
# and the dummy columns follow in the order of cat_cols, as with the former
# column-by-column loop.
df_num = pd.get_dummies(train_df, columns=list(cat_cols))

In [17]:
# Modelling table: the encoded frame without the identifier column.
df_processed = df_num.drop(columns='CustomerID')

## Training

In [18]:
# Target vector first, then the feature matrix (everything except the target).
y = df_processed['Churn']
X = df_processed.drop(columns='Churn')

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Let's try oversampling: rebalance the classes with SMOTE.
# Only the training split is resampled; X_test / y_test are left as they are.
# X_train and y_train are overwritten with their resampled versions.
# NOTE(review): resampling runs before the features are standardised, so
# SMOTE works on the unscaled dummy/numeric columns - confirm this ordering
# is intended.

oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [21]:
# Feature scaler shared by training, evaluation and the single-customer
# check; it is also saved to disk at the end of the notebook.
scaler = StandardScaler()

In [22]:
# Learn the scaling parameters from the (oversampled) training split only;
# the test split is not seen here.
scaler.fit(X_train)

In [23]:
# Apply the training-fitted scaling to both splits.
# NOTE: both names are overwritten with their scaled versions, so this cell
# must be executed exactly once after the split; running it again would
# scale data that is already scaled.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
# Build a 500-tree random forest with a fixed seed and fit it on the
# resampled, scaled training split (fit returns the estimator itself).
rf = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)

# Class probabilities for the held-out rows; the second column is kept as
# the score for the positive class (Churn == 1).
test_proba = rf.predict_proba(X_test)
y_pred_rf_prob = test_proba[:, 1]

## Evaluation

In [25]:
# ROC AUC of the forest on the held-out split, from the churn probabilities.
roc_auc_rf = roc_auc_score(y_true=y_test, y_score=y_pred_rf_prob)
print("ROC AUC Score for Random Forest: ", roc_auc_rf)

ROC AUC Score for Random Forest:  0.7352299942912236


In [26]:
# Hard class predictions for the held-out split and the per-class
# precision / recall / F1 summary.
y_pred = rf.predict(X_test)
test_report = classification_report(y_test, y_pred)
print("Classification Report: \n", test_report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.99      0.90     39968
           1       0.59      0.06      0.12      8790

    accuracy                           0.82     48758
   macro avg       0.71      0.53      0.51     48758
weighted avg       0.79      0.82      0.76     48758



In [27]:
# Share of actual churners (Churn == 1) that the model labels as churners.
# Every row scored below has true class 1, so this "accuracy" is the recall
# of class 1.
#
# NOTE: the rows come from the whole of df_processed, which includes the 80%
# of rows used for training. The figure is therefore far higher than the
# class-1 recall in the held-out classification report and is not an
# estimate of performance on unseen customers.
features_processed = df_processed.drop('Churn', axis=1)
target_processed = df_processed['Churn']

# Scale with the scaler fitted on the training split, then predict every
# row once; the next cell reuses target_processed and predictions_processed.
features_processed_scaled = scaler.transform(features_processed)
predictions_processed = rf.predict(features_processed_scaled)

# Boolean mask of the rows whose true class is 1. It is converted to a plain
# array so that it selects from the NumPy prediction array by position.
# (The former full copy of the positive feature rows was never used and has
# been dropped.)
is_positive = (target_processed == 1).to_numpy()
predictions_processed_pos = predictions_processed[is_positive]

# get the accuracy for class 1 predictions
accuracy = accuracy_score(target_processed[is_positive], predictions_processed_pos)

print("The accuracy of the classifier on positive class is : ", accuracy)

The accuracy of the classifier on positive class is :  0.8137929473541261


In [28]:
# Share of actual non-churners (Churn == 0) that the model labels as
# non-churners, i.e. the recall of class 0.
# Uses target_processed and predictions_processed computed in the cell above,
# so the rows again include the data the model was trained on.

# Boolean mask of the rows whose true class is 0, as a plain array so that it
# selects from the NumPy prediction array by position. (The former full copy
# of the negative feature rows was never used and has been dropped.)
is_negative = (target_processed == 0).to_numpy()
predictions_processed_neg = predictions_processed[is_negative]

# Get the accuracy for class 0 predictions
accuracy = accuracy_score(target_processed[is_negative], predictions_processed_neg)

print("The accuracy of the classifier on negative class is : ", accuracy)

The accuracy of the classifier on negative class is :  0.9980461411287292


## Customer ID checking

In [29]:
# choose a customer with a specific id
customer_id = 'GE07ASO3L7'
customer = train_df[train_df['CustomerID'] == customer_id]

# Fail early with a readable message; otherwise an unknown id only surfaces
# later as an error about an empty input deep inside the scaler / model.
if customer.empty:
    raise ValueError(f"CustomerID {customer_id!r} not found in train_df")

# One-hot encode the selected row(s). Only the categories present in this
# selection get a dummy column, so the frame is then aligned to the training
# layout: reindex adds the absent dummy columns filled with 0, discards any
# column the training table does not have, and puts the columns in the order
# of df_processed.
customer_preprocessed = (
    pd.get_dummies(customer.drop('CustomerID', axis=1))
    .reindex(columns=df_processed.columns, fill_value=0)
)

# separate features from the target, drop target from features
customer_features = customer_preprocessed.drop('Churn', axis=1)
customer_target = customer_preprocessed['Churn']

# Scale the customer_features with the training-fitted scaler before prediction
customer_features_scaled = scaler.transform(customer_features)

# predict the churn for the specific customer
customer_prediction = rf.predict(customer_features_scaled)

# print the prediction and true value (first matching row)
print(f"Predicted Churn: {customer_prediction[0]}")
print(f"True Churn: {customer_target.values[0]}")

Predicted Churn: 1
True Churn: 1


In [30]:
# Score every row of df_processed. This covers the rows used for training as
# well as the held-out ones, so the report below is not a held-out estimate.
full_target = df_processed['Churn']
full_features = df_processed.drop(columns='Churn')

# Same scaler as for training, then hard class predictions for all rows.
full_features_scaled = scaler.transform(full_features)
full_predictions = rf.predict(full_features_scaled)

# Per-class precision / recall / F1 over all rows.
full_classification_report = classification_report(full_target, full_predictions)
print("Full Classification Report: \n", full_classification_report)

Full Classification Report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98    199605
           1       0.99      0.81      0.89     44182

    accuracy                           0.96    243787
   macro avg       0.97      0.91      0.94    243787
weighted avg       0.97      0.96      0.96    243787



In [31]:
# Churn probabilities (second column of predict_proba) for every row scaled
# in the previous cell.
full_proba = rf.predict_proba(full_features_scaled)
full_predictions_proba = full_proba[:, 1]

# ROC AUC over all rows, training rows included.
full_roc_auc = roc_auc_score(y_true=full_target, y_score=full_predictions_proba)
print("Full ROC AUC: ", full_roc_auc)

Full ROC AUC:  0.9844711313875733


## Saving the model

In [61]:
# Persist the fitted random forest with joblib (file written to the current
# working directory).
# NOTE(review): in the cells shown here only the model and the scaler are
# saved, not the feature column layout; code that loads them must rebuild
# the same dummy columns in the same order - confirm how that is handled.
dump(rf, 'model.joblib')

['model.joblib']

In [62]:
# Save the scaler fitted on the training split; inputs must be passed through
# it before they are given to the saved model, as done throughout this notebook.

dump(scaler, 'scaler.joblib')

['scaler.joblib']