In [66]:
## Imports:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [67]:
def read_check(X):
    df = pd.read_csv(X)
    print('Total of null records   :', df.isnull().sum())
    print('Total of NaN records   :', df.isna().sum())
    print('\n\nDataset description   :\n\n', df.describe())
    print('\n\nDataset shape   :\n', df.shape)
    
    #for i in df.columns:    
        #display('Unique values of {:10}:'.format(i), df[i].value_counts())

    return df

In [68]:
dataset = read_check('customer_churn.csv')


Total of null records   : Unnamed: 0                      0
customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category              8376
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types         447
medium_of_operation             0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
feedback                        0
churn_risk_score            19919
dtype: int64
Total of NaN records   : Unnamed: 0                      0
customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category              8376
membership_categor

In [69]:
dataset = dataset.dropna()

In [70]:
dataset.columns

Index(['Unnamed: 0', 'customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'last_visit_time', 'days_since_last_login',
       'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [71]:
dataset

Unnamed: 0.1,Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,last_visit_time,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,feedback,churn_risk_score
0,0,fffe43004900440034003800350031003500,Aisha Redner,53,F,B1U8QHV,City,No Membership,2016-02-27,No,xxxxxxxx,Without Offers,Both,00:45:49,16,229.060000,34976.54,16.0,Poor Product Quality,4.0
2,2,fffe43004900440031003600380031003900,Dean Campo,60,M,T4RTSE9,City,Gold Membership,2017-12-15,?,CID28283,Without Offers,Desktop,10:41:38,16,30.760000,45521.15,14.0,Too many ads,3.0
3,3,fffe4300490044003500380038003200,Raina Seth,11,M,WAV2SWM,City,Silver Membership,2016-10-08,No,xxxxxxxx,Gift Vouchers/Coupons,Both,06:38:26,18,391.090000,21774.00,18.0,Poor Product Quality,4.0
4,4,fffe43004900440032003300340037003300,Genaro Bemis,42,M,CCCB3HF,City,Gold Membership,2016-10-05,Yes,CID49640,Without Offers,Smartphone,14:29:00,17,118.010000,3745.20,9.0,Poor Customer Service,3.0
5,5,fffe43004900440033003700300032003200,Azucena Sun,29,F,ILWXP40,Town,No Membership,2017-12-03,No,xxxxxxxx,Without Offers,?,17:22:04,15,338.230000,6648.41,33.460205347560084,Too many ads,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56902,56902,fffe43004900440035003100350037003200,Nikki Scroggins,51,F,A8C0KVV,Village,Premium Membership,2016-01-15,Yes,CID50981,Credit/Debit Card Offers,Smartphone,05:51:23,9,-758.185333,27867.16,8.0,Too many ads,3.0
56905,56905,fffe43004900440033003900350034003400,Kristeen Hochmuth,45,F,NBHTMUA,City,Platinum Membership,2015-10-16,No,xxxxxxxx,Credit/Debit Card Offers,Desktop,17:01:03,6,398.470000,27680.30,17.0,Poor Product Quality,3.0
56908,56908,fffe43004900440036003100300031003600,Bulah Cocco,45,M,MEDGTHH,Town,Premium Membership,2017-03-29,No,xxxxxxxx,Without Offers,Smartphone,19:03:30,11,107.030000,28282.06,Error,No reason specified,3.0
56909,56909,fffe43004900440034003900370033003700,Shaquana Koga,49,F,Q134X6L,City,No Membership,2017-10-02,Yes,CID14909,Credit/Debit Card Offers,?,15:55:44,-999,33.250000,24127.95,8.0,Poor Website,4.0


In [72]:
## Colunas que não serão levadas em conta: 'Unnamed: 0', 'Name', 'customer_id'

In [73]:
dataset = dataset.drop(['Unnamed: 0', 'Name', 'customer_id', 'security_no', 'referral_id', 'medium_of_operation'], axis=1)

In [74]:
# Vamos deletar a coluna churn_risk_score, pois ela será nossa target label.
x_data = dataset.drop(['churn_risk_score'], axis=1)
y_data = dataset['churn_risk_score']




In [75]:
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data, test_size = 0.25)


In [76]:
def get_categorical_data(data):
    numerical = data._get_numeric_data().columns
    categorical = list(set(data.columns) - set(numerical))

    return categorical


In [77]:
categorical = get_categorical_data(dataset)

In [78]:
categorical

['joining_date',
 'membership_category',
 'joined_through_referral',
 'feedback',
 'preferred_offer_types',
 'avg_frequency_login_days',
 'gender',
 'last_visit_time',
 'region_category']

In [79]:
## Colunas categóricas serão transformadas em ordinais:
encoder = ce.OrdinalEncoder(categorical)


x_train = encoder.fit_transform(x_train)

x_test = encoder.transform(x_test)

In [80]:
## Nesse código vamos descobrir a melhor quantidade de árvores para nosso dataset, ou seja, 
## vamos rodar um gráfico acurácia vs. n-estimators (quantd. de árvores)
## e vamos selecionar a quantidade com maior acurácia que não seja 1. 

scores =[]
for k in range(1, 200):
    rfc = RandomForestClassifier(n_estimators=k, max_features='log2')
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    scores.append(accuracy_score(y_test, y_pred))

# Porque o index da lista vai de 0 a n. Logo, somo 1.
n_estimators = scores.index(max(scores))+1



In [None]:
rfc = RandomForestClassifier(n_estimators=n_estimators, max_features='log2')
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)


In [None]:
accuracy

0.5916751787538305

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO 
import pydotplus
from IPython.display import Image

i_tree = 0
dot_data = StringIO()
for tree_in_forest in rfc.estimators_: #rfc random forest classifier
    if (i_tree == 1):        
        export_graphviz(tree_in_forest, out_file=dot_data)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())        
    i_tree = i_tree + 1
Image(graph.create_png())