## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import colors

from sklearn.model_selection import train_test_split, KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import (classification_report, accuracy_score, precision_score, 
                             recall_score, f1_score)
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Telco Churn Dataset
### Read

In [2]:
# Load the Telco customer-churn table from the local CSV file.
telco_churn = pd.read_csv(
    filepath_or_buffer='telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

In [3]:
# Preview the first rows to check that the CSV loaded with the expected columns.
telco_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Feature Engineering

Uma limitação do LDA é a necessidade de entradas numéricas; nesse sentido, é necessário converter todas as variáveis categóricas em variáveis numéricas.

Além disso, existem variáveis que não são desejáveis, como o customerID, que precisam ser removidas.

In [4]:
# List each column's dtype to see which columns are not numeric yet.
telco_churn.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# Show every column name in the loaded table.
telco_churn.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
# Drop unnecessary columns
# customerID is not wanted as a model input. Rebinding (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
telco_churn = telco_churn.drop(columns=['customerID'], errors='ignore')

In [7]:
# Boolean
# Two-valued columns that are encoded as 0/1 numbers.
BOOLEAN_COLUMNS = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                   'PhoneService', 'PaperlessBilling', 'Churn']

# Explicit label -> number mapping (same values as before: No/Male -> 0.0,
# Yes/Female -> 1.0). SeniorCitizen is already numeric and is left untouched.
BOOLEAN_ENCODING = {'No': 0.0, 'Yes': 1.0, 'Male': 0.0, 'Female': 1.0}

# `replace` on object columns used to downcast the result to float64 silently;
# pandas has deprecated that implicit downcast, so request the dtype inference
# explicitly. Without it the columns (including the Churn target) could stay
# `object` on newer pandas versions.
telco_churn[BOOLEAN_COLUMNS] = (
    telco_churn[BOOLEAN_COLUMNS]
    .replace(BOOLEAN_ENCODING)
    .infer_objects()
)

In [8]:
# Categorical
# Multi-level text columns; each one is expanded into one 0/1 indicator column
# per level.
CATEGORICAL_COLUMNS = ['MultipleLines', 'InternetService',
                       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies', 'Contract','PaymentMethod']

# Keep the encoder on its default (sparse) output and densify explicitly: the
# dense-output flag was renamed from `sparse` to `sparse_output` between
# scikit-learn releases, whereas `.toarray()` works on all of them.
enc = OneHotEncoder()
out = enc.fit_transform(telco_churn[CATEGORICAL_COLUMNS]).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; prefer the new
# method and fall back to the old one on older scikit-learn installs.
_feature_names_fn = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Preview the encoded indicator columns.
pd.DataFrame(out, columns=_feature_names_fn(CATEGORICAL_COLUMNS)).head()

Unnamed: 0,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Swap the original text columns for their one-hot indicator columns.
# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn; use whichever the installed version provides, and compute the
# names only once.
_feature_names_fn = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
encoded_columns = list(_feature_names_fn(CATEGORICAL_COLUMNS))

telco_churn.drop(CATEGORICAL_COLUMNS, axis=1, inplace=True)

# Build the encoded frame on telco_churn's own index so the column assignment
# aligns row by row regardless of what that index looks like.
telco_churn[encoded_columns] = pd.DataFrame(out, columns=encoded_columns, index=telco_churn.index)

In [10]:
# Re-check the dtypes after encoding; in the visible part of the output,
# TotalCharges is the only column still typed as object.
telco_churn.dtypes

gender                                     float64
SeniorCitizen                                int64
Partner                                    float64
Dependents                                 float64
tenure                                       int64
PhoneService                               float64
PaperlessBilling                           float64
MonthlyCharges                             float64
TotalCharges                                object
Churn                                      float64
MultipleLines_No                           float64
MultipleLines_No phone service             float64
MultipleLines_Yes                          float64
InternetService_DSL                        float64
InternetService_Fiber optic                float64
InternetService_No                         float64
OnlineSecurity_No                          float64
OnlineSecurity_No internet service         float64
OnlineSecurity_Yes                         float64
OnlineBackup_No                

In [11]:
# Row and column counts after the one-hot encoding step.
telco_churn.shape

(7043, 41)

In [12]:
# Total Charges as float64
# The column is loaded as text; parse it as numbers and let errors='coerce'
# turn any entry that cannot be parsed into NaN.
telco_churn['TotalCharges'] = telco_churn['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [13]:
# Spot-check a slice after the conversion: entries that could not be parsed
# appear as NaN (row 488 in this slice).
telco_churn['TotalCharges'].iloc[480:490]

480     225.75
481    2145.00
482    1671.60
483    8003.80
484     680.05
485    6130.85
486    1415.00
487    6201.95
488        NaN
489      74.35
Name: TotalCharges, dtype: float64

In [14]:
# Remove every row that has at least one missing value (the NaNs produced by
# the TotalCharges conversion); the table goes from 7043 to 7032 rows.
telco_churn.dropna(axis='index', how='any', inplace=True)

In [15]:
# Confirm the shape after dropping the rows with missing values.
telco_churn.shape

(7032, 41)

In [16]:
# Separate the target (Churn) from the predictor columns.
y = telco_churn['Churn']
X = telco_churn.drop('Churn', axis=1)
# Class labels in the order of the 0/1 encoding of Churn (No -> 0, Yes -> 1).
target_names = ['No', 'Yes']

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize data
# Fit the scaler on the training rows only and reuse it for the test rows, so
# no statistics from the test set influence the training data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Exercício

Dada a base de dados de Churn da Telco, é pedido que seja desenvolvido um modelo de classificação de churn utilizando o LDA, semelhante ao exercício mostrado em aula.

Os passos para o exercício são:
1. Carregar a base de dados (como é mostrado nesse notebook);
2. Limpar a base de dados (como é mostrado nesse notebook);
3. Treinar o modelo utilizando o LinearDiscriminantAnalysis (como mostrado no exercício em sala de aula);
4. Exibir um relatório do desempenho do modelo (como mostrado no exercício em sala de aula);
5. Treinar o modelo com validação cruzada utilizando o código na célula abaixo;
6. Plotar o desempenho do modelo ao longo da validação cruzada;

In [1]:
# Cross-validated LDA training.
# NOTE: this cell needs the imports cell and the data-preparation cells (which
# define X and y) to have been executed in the current kernel; running it on a
# fresh kernel fails with a NameError.

# 5 shuffled folds; the fixed random_state keeps the fold assignment, and
# therefore the reported metrics, reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results. `acc` receives one scalar per fold; `prec`, `rec` and
# `f1_aux` receive one per-class array per fold (average=None).
acc = []
prec = []
rec = []
f1_aux = []
for train_index, validation_index in kf.split(X):
    y_fold_train = y.iloc[train_index]
    y_fold_validation = y.iloc[validation_index]

    # Standardize data
    # Fold-specific names are used on purpose so the scaler (`sc`) and the
    # `X_train` matrix of the hold-out split are not overwritten by this loop.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_validation = fold_scaler.transform(X.iloc[validation_index])

    lda_clf = LinearDiscriminantAnalysis(n_components=1)
    lda_clf.fit(X_fold_train, y_fold_train)

    # Predicted labels for the validation rows of this fold.
    y_validation_pred = lda_clf.predict(X_validation)

    acc.append(accuracy_score(y_fold_validation, y_validation_pred))
    prec.append(precision_score(y_fold_validation, y_validation_pred, average=None))
    rec.append(recall_score(y_fold_validation, y_validation_pred, average=None))
    f1_aux.append(f1_score(y_fold_validation, y_validation_pred, average=None))


NameError: name 'KFold' is not defined