# Machine learning for marketing basics
---

**Aprendizado Supervisionado**
- Classificação - Prever se um cliente irá se desligar (Churned/Not-churned)
- Regressão - Prever compras de clientes para o próximo mês

**Aprendizado Não-supervisionado**
- Segmentação de clientes com base em seu histórico de compra: Podemos detectar padrões e clusters que são homogêneos?


In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw Telco customer table from the local datasets folder.
# NOTE(review): the saved head() output lists an 'Unnamed: 0' column while the saved
# dtypes/nunique/info outputs do not — confirm whether index_col=0 is needed here.
telco_raw = pd.read_csv('datasets/telco.csv')

In [3]:
# Preview the first rows of the raw data.
telco_raw.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Inspect the dtype pandas inferred for each column
# (the saved output reports TotalCharges as object, i.e. not parsed as a number).
telco_raw.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# Count distinct values per column; these counts are what the notebook later uses
# to separate categorical columns from numerical ones.
telco_raw.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

In [6]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
telco_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Separando variáveis categóricas e numéricas

In [7]:
# Keep the identifier column and the target column in their own lists so they can
# be excluded from the feature lists built below.
custid = ['customerID']
target = ['Churn']

In [None]:
# Treat a column as categorical when it has fewer than 5 distinct values.
# The distinct counts are computed once and reused for both the mask and the lookup.
unique_counts = telco_raw.nunique()
categorical = unique_counts[unique_counts < 5].keys().tolist()

# The target passes the same cardinality test but is not a feature: remove it.
categorical.remove(target[0])

# Every remaining column that is not the ID, the target or categorical is numerical.
numerical = [col for col in telco_raw.columns
                     if col not in custid + target + categorical]

In [None]:
# Show which columns were classified as categorical.
categorical

In [None]:
# Show which columns were classified as numerical.
numerical

### Codificando variáveis categóricas e padronizando variáveis numéricas

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each column so the dummies are not redundant.
# NOTE(review): this overwrites telco_raw, so the original categorical columns are
# gone afterwards — re-running this cell without reloading the data is expected to fail.
telco_raw = pd.get_dummies(data=telco_raw, columns=categorical, drop_first=True)

In [8]:
# Re-check distinct counts per column after the encoding step.
# NOTE(review): the saved output below still lists the pre-encoding columns, so it
# appears to come from a run in which the encoding cell had not been executed.
telco_raw.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

In [None]:
# TotalCharges was read as text, so convert the whole column to numbers in one
# vectorised call. errors='coerce' turns unparseable entries into NaN instead of raising.
total_charges = pd.to_numeric(telco_raw['TotalCharges'], errors='coerce')

# NaN would flow through the scaler and make the estimators fitted later reject the
# feature matrix, so fill the gaps here.
# NOTE(review): 0 assumes an unparseable TotalCharges means "nothing billed yet" —
# TODO confirm against the data; drop those rows instead if that does not hold.
telco_raw['TotalCharges'] = total_charges.fillna(0)

In [None]:
# Standardise the numerical columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row before the train/test split that
# happens later, so test-set statistics influence the scaling.
scaler = StandardScaler()

# fit_transform returns a bare array, so keep it under its own name.
scaled_values = scaler.fit_transform(telco_raw[numerical])

# Rebuild a DataFrame carrying the original column names AND the original row index;
# the next step joins on the index, so the rows must stay aligned with telco_raw
# even if its index is not a plain 0..n-1 range.
scaled_numerical = pd.DataFrame(scaled_values, columns=numerical, index=telco_raw.index)

In [None]:
# Swap the unscaled numerical columns for their scaled versions.
# The drop is done on a temporary copy instead of reassigning telco_raw, so this cell
# can be re-run safely; `columns=` alone selects the axis, so no `axis=` is needed.
non_numerical = telco_raw.drop(columns=numerical)

# Index-on-index left join: every original row keeps its own scaled values.
telco = non_numerical.merge(right=scaled_numerical, how='left', left_index=True, right_index=True)

### Explorando churn e separando dados

In [9]:
# Share of rows in each Churn class, expressed as a percentage of all rows.
telco.groupby(['Churn']).size().div(telco.shape[0]).mul(100)

NameError: name 'telco' is not defined

In [None]:
# Hold out 25% of the rows for testing.
# random_state pins the shuffle so every Restart & Run All yields the same split
# and therefore the same metrics in the cells below.
train, test = train_test_split(telco, test_size = .25, random_state=42)

In [None]:
custid = ['customerID']
target = ['Churn']

# Feature columns: everything in telco except the customer ID and the target,
# in the frame's original column order.
cols = telco.columns[~telco.columns.isin(custid + target)].tolist()

In [None]:
# Build the feature matrices and target vectors for training and testing.
train_X = train[cols]
test_X = test[cols]

# Select the target by its name (target[0]) rather than by the one-element list:
# that yields a 1-D Series instead of a one-column DataFrame, which is the shape
# the estimators and metric functions expect, and matches how Y is built.
train_Y = train[target[0]]
test_Y = test[target[0]]

# Full (unsplit) features and target.
X = telco[cols]
Y = telco[target[0]]

In [None]:
# Fraction of all rows that ended up in the training set (expected to be 75%).
print(len(train_X) / len(X))

# Fraction of all rows that ended up in the test set (expected to be 25%).
print(len(test_X) / len(Y))

In [None]:
# Summary statistics of the prepared dataset.
telco.describe()

### Previsão do churn com árvores de decisão

In [None]:
# Sweep max_depth from 2 to 14 and record test-set accuracy, precision and recall.
depth_list = list(range(2, 15))

# One row per depth; columns are: depth, accuracy, precision, recall.
depth_tuning = np.zeros((len(depth_list), 4))
depth_tuning[:, 0] = depth_list

for index, depth in enumerate(depth_list):
    mytree = tree.DecisionTreeClassifier(max_depth=depth)
    mytree.fit(train_X, train_Y)

    # Predict on the held-out test set.
    pred_test_Y = mytree.predict(test_X)

    # Score the model. The target holds the strings 'Yes'/'No', so the positive class
    # must be named explicitly: the default pos_label=1 is not one of the labels and
    # makes precision_score / recall_score raise.
    depth_tuning[index, 1] = accuracy_score(test_Y, pred_test_Y)
    depth_tuning[index, 2] = precision_score(test_Y, pred_test_Y, pos_label='Yes')
    depth_tuning[index, 3] = recall_score(test_Y, pred_test_Y, pos_label='Yes')

col_names = ['Max_Depth','Acurácia','Precisão','Recall']
print(pd.DataFrame(depth_tuning, columns=col_names))

In [None]:
# Precision and recall of the decision tree on the training and test sets.
# The predictions are computed here from `mytree` (the tree fitted last by the
# depth sweep) so the cell does not depend on a `pred_train_Y` that is only
# created further down, in the logistic-regression section.
pred_train_Y = mytree.predict(train_X)
pred_test_Y = mytree.predict(test_X)

train_precision = round(precision_score(train_Y, pred_train_Y, pos_label='Yes'), 4)
test_precision = round(precision_score(test_Y, pred_test_Y, pos_label='Yes'), 4)
train_recall = round(recall_score(train_Y, pred_train_Y, pos_label='Yes'), 4)
test_recall = round(recall_score(test_Y, pred_test_Y, pos_label='Yes'), 4)

print('Precisão do Treinamento: {}\nRecall Treinamento: {}'.format(train_precision, train_recall))
print('Precisão do Test: {}\nRecall Test: {}'.format(test_precision, test_recall))

### Identificando e interpretando os drivers de churn

In [None]:
# Export the trained decision tree as Graphviz DOT source; with out_file=None the
# DOT text is returned instead of being written to a file.
# NOTE(review): `graphviz` and `display_image` are not imported or defined in the
# visible cells — confirm where they come from, otherwise this cell raises NameError.
# NOTE(review): `graph` is built but never displayed; the cell shows a pre-rendered
# PNG from disk instead of the tree that was just exported.
exported = tree.export_graphviz(decision_tree=mytree, 
            out_file=None, feature_names=train_X.columns, 
            precision=1, class_names=['Not churn','Churn'], filled = True)

graph = graphviz.Source(exported)

display_image("datasets/decision_tree_rules.png")

### Previsão de churn com regressão logística

In [None]:
# Fit a logistic regression with default settings (fit returns the estimator itself).
logreg = LogisticRegression().fit(train_X, train_Y)

# Predict both splits, then compare against the true labels.
pred_train_Y, pred_test_Y = logreg.predict(train_X), logreg.predict(test_X)

train_accuracy, test_accuracy = (
    accuracy_score(train_Y, pred_train_Y),
    accuracy_score(test_Y, pred_test_Y),
)

print('Acurácia de Treinamento:', round(train_accuracy, 4))
print('Acurácia de Teste:', round(test_accuracy, 4))

In [None]:
# L1-regularised logistic regression: sweep the C parameter and record, for each
# value, how many coefficients stay non-zero and how the model scores on the test set.

C = [1, .5, .25, .1, .05, .025, .01, .005, .003, 0.0025]

# One row per C value; columns are: C, non-zero coefficients, accuracy, precision, recall.
l1_metrics = np.zeros((len(C), 5))
l1_metrics[:, 0] = C

for row, c_value in enumerate(C):
    logreg = LogisticRegression(penalty='l1', C=c_value, solver='liblinear')
    logreg.fit(train_X, train_Y)

    pred_test_Y = logreg.predict(test_X)

    # Fill the four metric columns of this row in one assignment.
    l1_metrics[row, 1:] = [
        np.count_nonzero(logreg.coef_),
        accuracy_score(test_Y, pred_test_Y),
        precision_score(test_Y, pred_test_Y, pos_label='Yes', zero_division=1),
        recall_score(test_Y, pred_test_Y, pos_label='Yes'),
    ]

col_names = ['C','Non-Zero Coeffs','Accuracy','Precision','Recall']
pd.DataFrame(l1_metrics, columns=col_names)

In [None]:
# Precision and recall of the current logistic regression on both splits.
# Both prediction vectors are recomputed from the same `logreg`: the sweep above
# refreshes only pred_test_Y, so reusing the older pred_train_Y would compare
# training predictions from one model with test predictions from another.
pred_train_Y = logreg.predict(train_X)
pred_test_Y = logreg.predict(test_X)

train_precision = round(precision_score(train_Y, pred_train_Y, pos_label='Yes'), 4)
test_precision = round(precision_score(test_Y, pred_test_Y, pos_label='Yes'), 4)
train_recall = round(recall_score(train_Y, pred_train_Y, pos_label='Yes'), 4)
test_recall = round(recall_score(test_Y, pred_test_Y, pos_label='Yes'), 4)

print('Precisão do Treinamento: {}\nRecall Treinamento: {}'.format(train_precision, train_recall))
print('Precisão do Test: {}\nRecall Test: {}'.format(test_precision, test_recall))

### Coeficientes da regressão logística

In [None]:
# Show the fitted coefficients of the current logistic regression model.
logreg.coef_

In [None]:
# Pair each feature name with its fitted coefficient. The model in this notebook
# is `logreg`; the previous reference to `logit` named a variable that does not exist.
coefficients = pd.concat([pd.DataFrame(train_X.columns),
                          pd.DataFrame(np.transpose(logreg.coef_))],
                          axis = 1)

coefficients.columns = ['Feature', 'Coefficient']

# exp(coefficient) is the multiplicative change in the odds per unit of the feature.
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])

# Keep only features whose coefficient is not exactly zero.
coefficients = coefficients[coefficients['Coefficient']!=0]

print(coefficients.sort_values(by=['Coefficient']))

In [None]:
# Combine feature names and coefficients into one DataFrame.
# (A leading concat that referenced the undefined name `logit` was removed: its
# result was overwritten below, so it only served to raise NameError.)
feature_names = pd.DataFrame(train_X.columns, columns = ['Feature'])
log_coef = pd.DataFrame(np.transpose(logreg.coef_), columns = ['Coefficient'])
coefficients = pd.concat([feature_names, log_coef], axis = 1)

# exp(coefficient) is the multiplicative change in the odds per unit of the feature.
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])

# Remove coefficients that are exactly zero.
coefficients = coefficients[coefficients['Coefficient']!=0]

# Print the rows sorted by the raw coefficient, ascending.
print(coefficients.sort_values(by=['Coefficient']))

### Customer Lifetime Value (CLV) basics