# Importamos dataset

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Ask for the dataset through google.colab's files.upload() and keep its return value.
uploaded = files.upload()

KeyboardInterrupt: ignored

In [13]:
# Load the churn dataset from the CSV file into a DataFrame

churn = pd.read_csv("/content/Churn_Modelling.csv")

In [23]:
# Preview the first rows of the DataFrame
churn.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [18]:
# Remove the identifier-like columns, which are not used as features below.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone would otherwise raise a KeyError.
churn.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    inplace=True,
    errors='ignore',
)

In [19]:
# Preview the DataFrame again to confirm which columns remain
churn.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


# Preparamos la data

In [20]:
# Target vector: the 'Exited' column as a NumPy array.
y = churn['Exited'].to_numpy()
# Feature matrix: every remaining column, also as a NumPy array.
X = churn.drop(columns='Exited').to_numpy()

In [None]:
# importamos la libreria para poder realizar el One Hot Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# The categorical values live in the second and third columns of X
# (Geography and Gender), so only positions 1 and 2 are one-hot encoded;
# every other column is passed through untouched.
cl = ColumnTransformer(
    transformers=[
        ('OHE', OneHotEncoder(drop='first'), [1, 2]),
    ],
    remainder='passthrough',
)

X = cl.fit_transform(X)

In [None]:
# Names of the columns produced by the fitted ColumnTransformer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so the
# result is converted to a list to keep list slicing and `+` concatenation
# working for code that consumes `ohe_columns`.
ohe_columns = list(cl.get_feature_names_out())
ohe_columns

In [None]:
# Columns that were not one-hot encoded and are not the target, in their
# original DataFrame order.
churn_columns = [
    name
    for name in churn.columns
    if name not in {'Geography', 'Gender', 'Exited'}
]
churn_columns

In [None]:
# Column labels for the transformed matrix: the first three names reported by
# the transformer (the one-hot columns, which come first because 'OHE' is the
# first transformer) followed by the untouched columns.
# Both operands are coerced to lists so `+` always concatenates, even when
# `ohe_columns` is a NumPy array (where `+` would not mean concatenation).
final_columns = list(ohe_columns[:3]) + list(churn_columns)
final_columns

In [None]:
# Wrap the transformed matrix in a DataFrame labelled with the assembled column names
X = pd.DataFrame(X, columns=final_columns)
X

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions similar in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
    stratify=y,
)

# Construimos el modelo

In [None]:
# importamos las librerias necesarias para el modelo

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Define the tree hyperparameters.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and every metric derived from it below, is reproducible across runs.

dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    min_samples_leaf=0.08,  # a float is read as a fraction of the training samples
    random_state=25,
)

# Train the model the usual scikit-learn way
dt.fit(X_train, y_train)

In [None]:
# Use the `tree` module to draw the splits of the fitted tree.
# - `dt` is plotted as already fitted instead of calling fit() again inside the
#   plot call: re-fitting is wasted work and, without a fixed seed, may produce
#   a tree different from the one being evaluated.
# - feature_names is passed as a plain list, which plot_tree accepts across
#   scikit-learn versions (a pandas Index is not always accepted).
# - a 20x8 inch canvas is ample for a depth-3 tree; 500x180 inches produces an
#   enormous image that is slow or impossible to render.

plt.figure(figsize=(20, 8))

tree.plot_tree(dt, feature_names=list(X.columns))

In [None]:
# Inspect some properties of the fitted tree: its depth and its number of leaves

print(f"La profundidad del arbol entrenado es {dt.get_depth()}")
print(f"La cantidad de nodos terminales es {dt.get_n_leaves()}")

In [None]:
# Show the hyperparameters the estimator was configured with
dt.get_params(deep=True)

In [None]:
# Print the tree's decision rules as text, labelled with the feature names
print(tree.export_text(dt, feature_names=final_columns))

# Prediccion

In [None]:
# importamos las librerias para obtener metricas de los resultados

from sklearn.metrics import confusion_matrix, accuracy_score

## Umbral = 0.5

### Prediccion umbral

In [None]:
# By default, predict() classifies the probability using a threshold of 0.5

y_pred = dt.predict(X_test)

In [None]:
# Compute the model's accuracy on the test set

accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the test-set predictions (raw counts)
confusion_matrix(y_test, y_pred)

In [None]:
# Unpack the confusion matrix into its four possible outcomes:
# true negatives, false positives, false negatives and true positives

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [None]:
# Cross-check the accuracy from the confusion-matrix counts:
# correct predictions (TP + TN) over all predictions.

(tp + tn) / (tp + fn + fp + tn)

In [None]:
# Share of the test set that falls into each confusion-matrix outcome
# (each count divided by the total number of predictions).
p_tp, p_fn, p_tn, p_fp = (
    count / (tn + fp + fn + tp) for count in (tp, fn, tn, fp)
)

In [None]:
# Proportion of true positives
p_tp

In [None]:
# Proportion of false negatives
p_fn

In [None]:
# Proportion of true negatives
p_tn

In [None]:
# Proportion of false positives
p_fp

### Matriz de beneficio 

In [None]:
# Benefit (positive) or cost (negative) attached to each of the four outcomes

b_tp, b_tn = 750, 100      # benefits: true positive, true negative
c_fp, c_fn = -200, -950    # costs: false positive, false negative

In [None]:
# Get the share of each class with the pandas method .value_counts();
# normalize=True returns proportions instead of raw counts.
# NOTE(review): `proporcion` is not referenced by the expected-value cells visible in this notebook.

proporcion = churn['Exited'].value_counts(normalize = True)
proporcion

In [None]:
# Expected value per prediction: each outcome's proportion weighted by its
# benefit or cost, summed over the four outcomes.
valor_esperado = (
    p_tp * b_tp
    + p_fn * c_fn
    + p_tn * b_tn
    + p_fp * c_fp
)
valor_esperado

## Umbral = 0.2

### Prediccion umbral

In [None]:
# Apply a custom decision threshold: flag the positive class whenever the
# predicted probability in column 1 of predict_proba is above 0.2.

y_pred = np.greater(dt.predict_proba(X_test)[:, 1], 0.2)

In [None]:
# Compute the model's accuracy with the thresholded predictions

accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix expressed as proportions of the test set.
# Dividing by len(y_test) rather than a hard-coded 2000 keeps the result
# correct if the dataset size or the test_size fraction changes.
confusion_matrix(y_test, y_pred) / len(y_test)

In [None]:
# Unpack the confusion matrix into its four possible outcomes:
# true negatives, false positives, false negatives and true positives

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [None]:
# Share of the test set that falls into each confusion-matrix outcome
# (each count divided by the total number of predictions).
p_tp, p_fn, p_tn, p_fp = (
    count / (tn + fp + fn + tp) for count in (tp, fn, tn, fp)
)

### Matriz de beneficio

In [None]:
# The benefit/cost values are already defined, so only the expected value
# needs to be recomputed with the new outcome proportions.

valor_esperado = (
    p_tp * b_tp
    + p_fn * c_fn
    + p_tn * b_tn
    + p_fp * c_fp
)
valor_esperado

## Umbral = 0.1

### Prediccion

In [None]:
# Apply a custom decision threshold: flag the positive class whenever the
# predicted probability in column 1 of predict_proba is above 0.1.

y_pred = np.greater(dt.predict_proba(X_test)[:, 1], 0.1)

In [None]:
# Compute the model's accuracy with the thresholded predictions

accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the thresholded predictions (raw counts)
confusion_matrix(y_test, y_pred)

In [None]:
# Unpack the confusion matrix into its four possible outcomes:
# true negatives, false positives, false negatives and true positives

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [None]:
# Share of the test set that falls into each confusion-matrix outcome
# (each count divided by the total number of predictions).
p_tp, p_fn, p_tn, p_fp = (
    count / (tn + fp + fn + tp) for count in (tp, fn, tn, fp)
)

### Matriz de beneficio

In [None]:
# The benefit/cost values are already defined, so only the expected value
# needs to be recomputed with the new outcome proportions.

valor_esperado = (
    p_tp * b_tp
    + p_fn * c_fn
    + p_tn * b_tn
    + p_fp * c_fp
)
valor_esperado

## ROC CURVE

In [None]:
# Build the ROC curve: roc_curve returns the false-positive rates, the
# true-positive rates and the thresholds at which they were evaluated.

from sklearn.metrics import roc_curve

y_pred_prob = dt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference
# The model being evaluated is the decision tree `dt`, so the curve is
# labelled accordingly (it was previously labelled "Logistic Regression").
plt.plot(fpr, tpr, label='Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree ROC Curve')
plt.legend()  # without a legend the curve label is never displayed
plt.show()

In [None]:
# Thresholds returned by roc_curve
thresholds

In [None]:
# False-positive rates returned by roc_curve
fpr

In [None]:
# True-positive rates returned by roc_curve
tpr

### AUC

In [None]:
# Another important metric is the area under the ROC curve (AUC); roc_auc_score computes it
from sklearn.metrics import roc_auc_score

y_pred_prob = dt.predict_proba(X_test)[:,1]
print(f'El area debajo de la curva ROC es: {roc_auc_score(y_test, y_pred_prob):.3f}')

# Actividades propuestas

1- Probar diferentes combinaciones de hiperparámetros y encontrar la que mejor se ajusta a este problema.

2- Determinar cuáles son las principales variables que están influyendo en la decisión del árbol.

3- Generar la predicción tanto para el test set como para el train set, buscando visualizar la relación de los hiperparámetros con los conceptos de overfitting y underfitting.