### Random Forest - Churn Prediction

In [1]:
import pandas as pd
import numpy as np

In [2]:
model_data = pd.read_csv('C:/Users/Lenovo/Documents/zoppy/hackathon/churn-prediction/docs/model_data.csv')

In [3]:
model_data.head()

Unnamed: 0,numLogin,csm,numTickets,daysEmail,emailIntegration,daysEcommerce,ecommerceIntegration,daysWhatsAppApi,whatsAppApiIntegration,daysGiftback,...,firstCampain,numCampains,receitaInflu,receitaTotal,receitaGiftback,receitacampanha,receitaPainelvendedor,receitaFluxo,ROI,churn
0,110,1,0,8.0,1,0.0,1,8.0,1,-1.0,...,1,10,6616.71,1563709.28,0.0,179.69,0.0,6437.02,0.0,0
1,15,1,0,3.0,1,2.0,1,4.0,1,-1.0,...,1,9,15077.13,360196.97,0.0,15077.13,0.0,0.0,0.0,1
2,12,1,10,2.0,1,1.0,1,2.0,1,-1.0,...,1,7,75121.56,2666049.13,0.0,73289.13,0.0,1832.43,0.0,0
3,38,1,6,121.0,1,2.0,1,8.0,1,21.0,...,1,5,4467.0,450992.92,0.0,4467.0,0.0,0.0,0.0,0
4,74,1,0,1.0,1,1.0,1,3.0,1,16.0,...,1,1,239347.11,3621210.25,32928.24,71214.33,0.0,0.0,1.0,0


In [4]:
# Splitting the data into X and y
x = model_data.drop('churn', axis=1)
y = model_data['churn']

In [5]:
# Splitting the data into churn and non-churn
churn_indices = y[y == 1].index
non_churn_indices = y[y == 0].index

In [6]:
# Randomly selecting the non-churn indices
non_churn_sample = np.random.choice(non_churn_indices, size=len(churn_indices), replace=False)
undersample_indices = np.concatenate([churn_indices, non_churn_sample])

In [7]:
# Getting the number of churn and non-churn
y = y.reset_index(drop=True)
x = x.reset_index(drop=True)

In [8]:
# Balancing the data
x_bal = x.loc[undersample_indices]
y_bal = y.loc[undersample_indices]

In [9]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size=0.2, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Creating the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

from sklearn.model_selection import cross_val_score
# Cross validation
scores = cross_val_score(model, x_bal, y_bal, cv=5)

# Results
print('Accuracy:', scores.mean())
print('Standard deviation:', scores.std())

Accuracy: 0.8411330049261083
Standard deviation: 0.13874233601485383


In [11]:
# Fitting the model
model.fit(x_train, y_train)

In [12]:
# Predicting the test data
y_pred = model.predict(x_test)

# Accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.896551724137931


In [13]:
# Feature importance
importances = model.feature_importances_
importances = pd.DataFrame(importances, index=x.columns, columns=['importance'])
importances = importances.sort_values(by='importance', ascending=False)
importances

Unnamed: 0,importance
numLogin,0.193135
daysEmail,0.138589
daysGiftback,0.098154
giftbackIntegration,0.088486
daysWhatsAppApi,0.086116
receitaTotal,0.078397
emailIntegration,0.051747
receitaInflu,0.049712
daysEcommerce,0.046994
daysFirstCampain,0.037334
