In [64]:
import pandas as pd
import numpy as np
from utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# carregando os dados
data_root = pd.read_csv('/home/usuario/Pessoal/alura_challenge_ds/dados/data_cleaned.csv', index_col = [0])

# shape
print(f'Formato do arquivo: {data_root.shape[0]} linhas e {data_root.shape[1]} colunas')

# checking dataset
data_root.head()

Formato do arquivo: 7043 linhas e 21 colunas


Unnamed: 0,id,churn,genero,idoso,parceiro,dependente,permanencia,servico_telefonico,multiplas_linhas,servico_internet,...,backup_online,protecao_dispositivo,suporte_tecnico,tv_streaming,filme_streaming,contrato,conta_papel,metodo_pagamento,mensal,total
0,0002-ORFBO,No,Female,No,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,No,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,No,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,Yes,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,Yes,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


In [40]:
data = data_root.copy()

In [41]:
# columns analysis
columns_descriptions = []

for column in data.columns:
    column_description = column_analysis(data, column)
    columns_descriptions.append(column_description)

pd.DataFrame(columns_descriptions, columns = ['nome', 'cardinalidade', 'dados_unicos', 'dados_nulos', 'tipo'])

Unnamed: 0,nome,cardinalidade,dados_unicos,dados_nulos,tipo
0,id,7043,['0002-ORFBO' '0003-MKNFE' '0004-TLHLJ' ... '9...,0,object
1,churn,2,['No' 'Yes'],0,object
2,genero,2,['Female' 'Male'],0,object
3,idoso,2,['No' 'Yes'],0,object
4,parceiro,2,['Yes' 'No'],0,object
5,dependente,2,['Yes' 'No'],0,object
6,permanencia,73,[ 9 4 13 3 71 63 7 65 54 72 5 56 34 1 45 ...,0,int64
7,servico_telefonico,2,['Yes' 'No'],0,object
8,multiplas_linhas,3,['No' 'Yes' 'No phone service'],0,object
9,servico_internet,3,['DSL' 'Fiber optic' 'No'],0,object


In [42]:
servico_internet_dummies = pd.get_dummies(data.servico_internet, prefix = 'servico_internet', prefix_sep = '_') #, drop_first = True
contrato_dummies = pd.get_dummies(data.contrato, prefix = 'contrato', prefix_sep = '_')
metodo_pagamento_dummies = pd.get_dummies(data.metodo_pagamento, prefix = 'metodo_pagamento')

In [43]:
data.drop(['servico_internet', 'contrato', 'metodo_pagamento'], axis = 1, inplace = True)

data.head()

Unnamed: 0,id,churn,genero,idoso,parceiro,dependente,permanencia,servico_telefonico,multiplas_linhas,seguranca_online,backup_online,protecao_dispositivo,suporte_tecnico,tv_streaming,filme_streaming,conta_papel,mensal,total
0,0002-ORFBO,No,Female,No,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3
1,0003-MKNFE,No,Male,No,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4
2,0004-TLHLJ,Yes,Male,No,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85
3,0011-IGKFF,Yes,Male,Yes,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85
4,0013-EXCHZ,Yes,Female,Yes,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4


In [44]:
data_complete = pd.concat([data, servico_internet_dummies, contrato_dummies, metodo_pagamento_dummies], axis = 1)

data_complete.head()

Unnamed: 0,id,churn,genero,idoso,parceiro,dependente,permanencia,servico_telefonico,multiplas_linhas,seguranca_online,...,servico_internet_DSL,servico_internet_Fiber optic,servico_internet_No,contrato_Month-to-month,contrato_One year,contrato_Two year,metodo_pagamento_Bank transfer (automatic),metodo_pagamento_Credit card (automatic),metodo_pagamento_Electronic check,metodo_pagamento_Mailed check
0,0002-ORFBO,No,Female,No,Yes,Yes,9,Yes,No,No,...,1,0,0,0,1,0,0,0,0,1
1,0003-MKNFE,No,Male,No,No,No,9,Yes,Yes,No,...,1,0,0,1,0,0,0,0,0,1
2,0004-TLHLJ,Yes,Male,No,No,No,4,Yes,No,No,...,0,1,0,1,0,0,0,0,1,0
3,0011-IGKFF,Yes,Male,Yes,Yes,No,13,Yes,No,No,...,0,1,0,1,0,0,0,0,1,0
4,0013-EXCHZ,Yes,Female,Yes,Yes,No,3,Yes,No,No,...,0,1,0,1,0,0,0,0,0,1


In [45]:
data_complete.replace({
    'No phone service': 0,
    'No internet service': 0,
    'No': 0,
    'Yes': 1,
    'Male': 0,
    'Female': 1
}, inplace = True)

In [46]:
data_complete

Unnamed: 0,id,churn,genero,idoso,parceiro,dependente,permanencia,servico_telefonico,multiplas_linhas,seguranca_online,...,servico_internet_DSL,servico_internet_Fiber optic,servico_internet_No,contrato_Month-to-month,contrato_One year,contrato_Two year,metodo_pagamento_Bank transfer (automatic),metodo_pagamento_Credit card (automatic),metodo_pagamento_Electronic check,metodo_pagamento_Mailed check
0,0002-ORFBO,0,1,0,1,1,9,1,0,0,...,1,0,0,0,1,0,0,0,0,1
1,0003-MKNFE,0,0,0,0,0,9,1,1,0,...,1,0,0,1,0,0,0,0,0,1
2,0004-TLHLJ,1,0,0,0,0,4,1,0,0,...,0,1,0,1,0,0,0,0,1,0
3,0011-IGKFF,1,0,1,1,0,13,1,0,0,...,0,1,0,1,0,0,0,0,1,0
4,0013-EXCHZ,1,1,1,1,0,3,1,0,0,...,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7262,9987-LUTYD,0,1,0,0,0,13,1,0,1,...,1,0,0,0,1,0,0,0,0,1
7263,9992-RRAMN,1,0,0,1,0,22,1,1,0,...,0,1,0,1,0,0,0,0,1,0
7264,9992-UJOEL,0,0,0,0,0,2,1,0,0,...,1,0,0,1,0,0,0,0,0,1
7265,9993-LHIEB,0,0,0,1,1,67,1,0,1,...,1,0,0,0,0,1,0,0,0,1


In [267]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing


In [265]:
target_column = ['churn']
dichotomic_columns = ['genero', 'idoso', 'parceiro', 'dependente', 'servico_telefonico', 'conta_papel']
polichotomic_columns = ['multiplas_linhas', 'servico_internet', 'seguranca_online', 'backup_online', 'protecao_dispositivo', 'suporte_tecnico', 'tv_streaming', 'filme_streaming', 'contrato', 'metodo_pagamento']
numerical_columns = ['permanencia', 'mensal', 'total']

In [264]:
X, y = data_complete.iloc[:, 2:], data_complete['churn']

In [None]:
normalized = preprocessing.normalize([x_array])
print(normalized)

Unnamed: 0,permanencia,mensal,total
0,9,65.60,593.30
1,9,59.90,542.40
2,4,73.90,280.85
3,13,98.00,1237.85
4,3,83.90,267.40
...,...,...,...
7262,13,55.15,742.90
7263,22,85.10,1873.70
7264,2,50.30,92.75
7265,67,67.85,4627.65


In [261]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

In [262]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [263]:
pd.DataFrame(x_train)

Unnamed: 0,genero,idoso,parceiro,dependente,permanencia,servico_telefonico,multiplas_linhas,seguranca_online,backup_online,protecao_dispositivo,...,servico_internet_DSL,servico_internet_Fiber optic,servico_internet_No,contrato_Month-to-month,contrato_One year,contrato_Two year,metodo_pagamento_Bank transfer (automatic),metodo_pagamento_Credit card (automatic),metodo_pagamento_Electronic check,metodo_pagamento_Mailed check
1314,1,0,1,1,64,1,1,1,1,1,...,1,0,0,0,0,1,0,0,0,1
6005,0,0,1,1,12,1,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
5127,1,1,0,0,20,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
5610,0,0,0,0,1,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
330,1,1,0,0,45,1,1,0,0,1,...,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6119,0,0,1,0,68,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
1043,0,0,0,0,4,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
27,0,0,1,1,54,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
3935,1,0,1,1,71,1,1,1,1,1,...,1,0,0,0,0,1,0,0,0,1


In [254]:
model = LogisticRegression()
# model = DecisionTreeClassifier()

In [255]:
model.fit(x_train, y_train)

In [256]:
y_predict = model.predict(x_test)

In [257]:
# y_predict = np.random.randint(0, high = 2, size = len(y_test))

In [258]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1579
           1       0.63      0.59      0.61       534

    accuracy                           0.81      2113
   macro avg       0.75      0.74      0.74      2113
weighted avg       0.81      0.81      0.81      2113

