In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
import visualizer as viz
import utils

In [None]:
path = './data/'
features_raw = pd.read_csv(path + 'orange_small_train.data', sep = '\t')
features_raw.shape

In [None]:
numerical_features = features_raw.iloc[:, 0:190].copy()
categorical_features = features_raw.iloc[:, 190:].copy()

print('Shape of numerical features: {}\nShape of categorical features: {}'.format(numerical_features.shape, categorical_features.shape))

In [None]:
# Preview the first rows of the raw table.
features_raw.head()

In [None]:
# Summary statistics of the raw table.
features_raw.describe()

In [None]:
# Copy columns 170-199 (by position) so the plots below work on a small slice.
# NOTE(review): plot_missing_matrix / plot_missing_bar are project helpers not shown
# here; presumably they visualise missing values - confirm in visualizer.py.
sample_df = features_raw.iloc[:, 170:200].copy()
viz.plot_missing_matrix(sample_df)

In [None]:
viz.plot_missing_bar(sample_df)

# Feature Importance
![features](https://media.giphy.com/media/c6J4HiJ8aNRSBrMYfH/giphy.gif)
Nem toda feature tem o mesmo impacto no modelo, por isso, vamos tentar descobrir quais features têm o maior impacto na predição.
Vamos fazer isso em 2 passos:

1. Remover variáveis com base no número de valores distintos
2. Permutações de importância

O fato de uma variável estar ausente pode ser preditivo, mas vamos ignorar isso por enquanto e preencher os valores vazios com a média da coluna para as features numéricas e com a string 'missing' para as features categóricas (nas categóricas, isso já é suficiente para que o modelo use a ausência como sinal).

In [None]:
# Elimina as features que possuem o mesmo valor para todas as intâncias ou são exclusivamente nulas
categorical_features = utils.drop_min_unique_features(categorical_features, 1)

print(categorical_features.shape)
viz.plot_missing_matrix(categorical_features)

In [None]:
# Substitui vazio por 'missing'
categorical_features.fillna('missing', inplace=True)

categorical_features.astype('category', copy=False)

In [None]:
categorical_features.describe()

In [None]:
# Elimina as features que possuem o mesmo valor para todas as intâncias ou são exclusivamente nulas
numerical_features = utils.drop_min_unique_features(numerical_features, 1)

numerical_features.shape

In [None]:
# Substitui vazio pela média
numerical_features.fillna(numerical_features.mean(), inplace=True)

numerical_features.astype('float', copy=False)

In [None]:
numerical_features.describe()

In [None]:
# Plot produced by the project's viz helper for the categorical columns (before filtering).
viz.plot_categories_per_feature(categorical_features)

In [None]:
# NOTE(review): 6000 is the threshold handed to utils.drop_max_unique_features;
# presumably columns with more distinct values than this are dropped - confirm in utils.
categorical_features = utils.drop_max_unique_features(categorical_features, 6000)
viz.plot_categories_per_feature(categorical_features)

In [None]:
# Put the numerical and categorical columns side by side in a single table.
features = pd.concat((numerical_features, categorical_features), axis=1)

Cria uma cópia do dataset para a predição de churn.
Calcula a importância das features (quais features realmente impactam a predição).

In [None]:
# One-hot encode the combined feature table for the churn task
# (pd.get_dummies expands the non-numeric columns into indicator columns).
churn_features = pd.get_dummies(features)

In [None]:
# Churn labels; header=None tells pandas the file has no header row.
churn_labels = pd.read_csv(path + 'orange_small_train_churn.labels', header = None)

### Permutações
Vamos "embaralhar" os valores de uma coluna de cada vez e avaliar quanto aquela coluna (isoladamente) afeta nossas predições.
Os valores que tiverem o maior peso representam as features mais importantes e os de menos peso, as menos importantes.

É como se, em um jogo de paciência, embaralhássemos cada fila de cartas e medíssemos como isso afeta nosso resultado final; depois voltássemos ao estado original e embaralhássemos a próxima fila, e assim por diante, até termos embaralhado e medido todas as filas. (Para diminuir a influência do acaso, fazemos esse processo várias vezes.)
![solitaire](https://www.hajapaciencia.com.br/static/main/thumbs/paciencia-canadense-1.df4d06d88fe1.png)

In [None]:
# NOTE(review): utils.permutation_importance is not shown here; its result replaces
# churn_features, so it presumably returns a reduced feature table - confirm.
# Re-running this cell feeds the already-reduced table back in.
churn_features = utils.permutation_importance(churn_features, churn_labels)

In [None]:
churn_features.shape

In [None]:
# Get a small subset (20%) just to check the classifiers
# Only the 2nd and 4th values returned by utils.split_dataset are kept;
# the 1st and 3rd are discarded into `_`.
_, X_train, _, y_train = utils.split_dataset(churn_features, churn_labels)

In [None]:
# Candidate models supplied by the project helper.
models = utils.get_models()

In [None]:
utils.train_and_report(models, X_train, y_train)

Agora que sabemos o melhor modelo para a tarefa, vamos otimizar alguns parâmetros. Vamos utilizar grid search para otimizá-los.

In [None]:
models = np.array([])
opt_params = dict()
scores = np.array([])

In [None]:
learning_rate = 0.1
n_estimators = None
max_depth = 8
min_samples_split = 250
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'n_estimators': range(50, 151, 10)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
n_estimators = opt_params['n_estimators']
max_depth = None
min_samples_split = None
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'max_depth': range(3, 12, 2), 'min_samples_split': range(150, 401, 50)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_depth = opt_params['max_depth']
min_samples_split = opt_params['min_samples_split']
min_samples_leaf = None
max_features = 'sqrt'
subsample = 0.8
params = {'min_samples_leaf': range(25, 61, 5)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
min_samples_leaf = opt_params['min_samples_leaf']
max_features = None
subsample = 0.8
params = {'max_features': range(21, 31, 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_features = opt_params['max_features']
subsample = None
params = {'subsample': np.append(np.arange(0.6, 1, 0.05), 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
subsample = opt_params['subsample']
learning_rate = 0.1
n_estimators = opt_params['n_estimators']

models, scores = utils.gbc_lr_optimizer(n_estimators = n_estimators,
                                        learning_rate = learning_rate,
                                        min_samples_split = min_samples_split,
                                        min_samples_leaf = min_samples_leaf,
                                        max_depth = max_depth,
                                        max_features = max_features,
                                        subsample = subsample,
                                        params = params)

In [None]:
models = np.append(models, models)
scores = np.append(scores, scores)

In [None]:
print('Best performance: {} | Model: {}'.format(max(scores), scores.argmax()))

In [None]:
X_train, X_test, y_train, y_test = split_dataset(churn_features, churn_labels)

In [None]:
churn_model = models[scores.argmax()]
churn_model.fit(X_train, y_train)

In [None]:
utils.save_model(churn_model, 'final_churn_model.sav')

In [None]:
churn_predictions = churn_model.predict_proba(X_test)
roc_auc_score(y_test, churn_predictions[:, 1])

# Appetency

In [None]:
# One-hot encode the combined feature table for the appetency task
# (pd.get_dummies expands the non-numeric columns into indicator columns).
appetency_features = pd.get_dummies(features)

In [None]:
# Appetency labels; header=None tells pandas the file has no header row.
appetency_labels = pd.read_csv(path + 'orange_small_train_appetency.labels', header = None)

In [None]:
# NOTE(review): utils.permutation_importance is not shown here; its result replaces
# appetency_features, so it presumably returns a reduced feature table - confirm.
# Re-running this cell feeds the already-reduced table back in.
appetency_features = utils.permutation_importance(appetency_features, appetency_labels)

In [None]:
appetency_features.shape

In [None]:
# Get a small subset (20%) just to check the classifiers
# Only the 2nd and 4th values returned by utils.split_dataset are kept;
# the 1st and 3rd are discarded into `_`.
_, X_train, _, y_train = utils.split_dataset(appetency_features, appetency_labels)

In [None]:
# Candidate models supplied by the project helper.
models = utils.get_models()

In [None]:
utils.train_and_report(models, X_train, y_train)

Agora que sabemos o melhor modelo para a tarefa, vamos otimizar alguns parâmetros. Vamos utilizar grid search para otimizá-los.

In [None]:
models = np.array([])
opt_params = dict()
scores = np.array([])

In [None]:
learning_rate = 0.1
n_estimators = None
max_depth = 8
min_samples_split = 250
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'n_estimators': range(50, 151, 10)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
n_estimators = opt_params['n_estimators']
max_depth = None
min_samples_split = None
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'max_depth': range(3, 12, 2), 'min_samples_split': range(150, 401, 50)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_depth = opt_params['max_depth']
min_samples_split = opt_params['min_samples_split']
min_samples_leaf = None
max_features = 'sqrt'
subsample = 0.8
params = {'min_samples_leaf': range(25, 61, 5)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
min_samples_leaf = opt_params['min_samples_leaf']
max_features = None
subsample = 0.8
params = {'max_features': range(21, 31, 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_features = opt_params['max_features']
subsample = None
params = {'subsample': np.append(np.arange(0.6, 1, 0.05), 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
subsample = opt_params['subsample']
learning_rate = 0.1
n_estimators = opt_params['n_estimators']

models, scores = utils.gbc_lr_optimizer(n_estimators = n_estimators,
                                        learning_rate = learning_rate,
                                        min_samples_split = min_samples_split,
                                        min_samples_leaf = min_samples_leaf,
                                        max_depth = max_depth,
                                        max_features = max_features,
                                        subsample = subsample,
                                        params = params)

In [None]:
models = np.append(models, models)
scores = np.append(scores, scores)

In [None]:
print('Best performance: {} | Model: {}'.format(max(scores), scores.argmax()))

In [None]:
X_train, X_test, y_train, y_test = split_dataset(appetency_features, appetency_labels)

In [None]:
appetency_model = models[scores.argmax()]
appetency_model.fit(X_train, y_train)

In [None]:
utils.save_model(appetency_model, 'final_appetency_model.sav')

In [None]:
appetency_predictions = appetency_model.predict_proba(X_test)
roc_auc_score(y_test, appetency_predictions[:, 1])

# Upselling

In [None]:
# One-hot encode the combined feature table for the upselling task
# (pd.get_dummies expands the non-numeric columns into indicator columns).
upselling_features = pd.get_dummies(features)

In [None]:
# Upselling labels; header=None tells pandas the file has no header row.
upselling_labels = pd.read_csv(path + 'orange_small_train_upselling.labels', header = None)

In [None]:
# NOTE(review): utils.permutation_importance is not shown here; its result replaces
# upselling_features, so it presumably returns a reduced feature table - confirm.
# Re-running this cell feeds the already-reduced table back in.
upselling_features = utils.permutation_importance(upselling_features, upselling_labels)

In [None]:
upselling_features.shape

In [None]:
# Get a small subset (20%) just to check the classifiers
# Only the 2nd and 4th values returned by utils.split_dataset are kept;
# the 1st and 3rd are discarded into `_`.
_, X_train, _, y_train = utils.split_dataset(upselling_features, upselling_labels)

In [None]:
# Candidate models supplied by the project helper.
models = utils.get_models()

In [None]:
utils.train_and_report(models, X_train, y_train)

Agora que sabemos o melhor modelo para a tarefa, vamos otimizar alguns parâmetros. Vamos utilizar grid search para otimizá-los.

In [None]:
models = np.array([])
opt_params = dict()
scores = np.array([])

In [None]:
learning_rate = 0.1
n_estimators = None
max_depth = 8
min_samples_split = 250
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'n_estimators': range(50, 151, 10)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
n_estimators = opt_params['n_estimators']
max_depth = None
min_samples_split = None
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8
params = {'max_depth': range(3, 12, 2), 'min_samples_split': range(150, 401, 50)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_depth = opt_params['max_depth']
min_samples_split = opt_params['min_samples_split']
min_samples_leaf = None
max_features = 'sqrt'
subsample = 0.8
params = {'min_samples_leaf': range(25, 61, 5)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
min_samples_leaf = opt_params['min_samples_leaf']
max_features = None
subsample = 0.8
params = {'max_features': range(21, 31, 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
max_features = opt_params['max_features']
subsample = None
params = {'subsample': np.append(np.arange(0.6, 1, 0.05), 1)}

model, opt_param, score = utils.gbc_params_optimizer(n_estimators = n_estimators,
                                               learning_rate = learning_rate,
                                               min_samples_split = min_samples_split,
                                               min_samples_leaf = min_samples_leaf,
                                               max_depth = max_depth,
                                               max_features = max_features,
                                               subsample = subsample,
                                               params = params)

In [None]:
models = np.append(models, model)
opt_params = {**opt_params, **opt_param}
scores = np.append(scores, score)

In [None]:
subsample = opt_params['subsample']
learning_rate = 0.1
n_estimators = opt_params['n_estimators']

models, scores = utils.gbc_lr_optimizer(n_estimators = n_estimators,
                                        learning_rate = learning_rate,
                                        min_samples_split = min_samples_split,
                                        min_samples_leaf = min_samples_leaf,
                                        max_depth = max_depth,
                                        max_features = max_features,
                                        subsample = subsample,
                                        params = params)

In [None]:
models = np.append(models, models)
scores = np.append(scores, scores)

In [None]:
print('Best performance: {} | Model: {}'.format(max(scores), scores.argmax()))

In [None]:
X_train, X_test, y_train, y_test = split_dataset(upselling_features, upselling_labels)

In [None]:
upselling_model = models[scores.argmax()]
upselling_model.fit(X_train, y_train)

In [None]:
utils.save_model(upselling_model, 'final_upselling_model.sav')

In [None]:
upselling_predictions = upselling_model.predict_proba(X_test)
roc_auc_score(y_test, upselling_predictions[:, 1])

# Conclusão
**TODO:** preencher com os resultados finais. Obtivemos uma predição bla bla, teríamos ficado em bla.

### O que poderia ter sido melhor?
- O dataset é anonimizado e as variáveis são renomeadas para `var_n`, portanto não sabemos a que se refere cada variável. Poderíamos obter um melhor resultado se soubéssemos do que se trata cada uma, além de nos permitir ter uma visão de negócios sobre o algoritmo que estamos usando. Por exemplo, poderíamos calcular os _SHAP values_ de diferentes variáveis para entender como cada uma afeta a experiência do cliente e sua chance de churnar, fazer um upsell ou comprar novos produtos.
    - Digamos que a variável BLA se refere ao NPS (Net Promoter Score, uma forma de medir a chance de um cliente te recomendar a um amigo), plotando o gráfico dos valores SHAP vemos que claramente um valor baixo de NPS está relacionado com churn. Podemos então fazer um estudo mais aprofundado para o melhoramento do nosso atendimento ao cliente, aumentando o NPS e, por consequência, diminuindo o churn.