In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as ss
from scipy.stats import chi2_contingency

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the two churn CSV files (named "-80" and "-20") from the mounted drive.
# data80 and data20 are kept as separate frames because the modelling cells
# use them as the training and test sets respectively.
DATA_DIR = '/content/drive/MyDrive/docs'
data80 = pd.read_csv(f'{DATA_DIR}/churn-bigml-80.csv')
data20 = pd.read_csv(f'{DATA_DIR}/churn-bigml-20.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of data20 repeat the first labels of data80.
data = pd.concat([data80, data20], ignore_index=True)

In [None]:
# Preview the first 10 rows of the combined frame.
data.head(10)

In [None]:
# Summary of columns, dtypes and non-null counts for the combined frame.
data.info()

In [None]:
#data['Churn'] = data['Churn'].astype(object)

Dispersión

In [None]:
def data_dispersion(data):
  """Print the coefficient of variation of each column and name the largest.

  For every column, the population standard deviation (``ddof=0``) is divided
  by the column mean and printed. The column with the highest ratio is
  reported on the last line.

  Args:
    data: DataFrame whose columns all support ``.std()`` and ``.mean()``.

  Returns:
    None. The results are only printed.
  """
  print("Dispersion de las variables:")
  # Start from -inf rather than -1 so that ratios below -1 (possible when a
  # column has a negative mean) can still be selected. The tracker is also
  # not called ``max``, which would shadow the builtin inside the function.
  highest_cv = float("-inf")
  highest_col = ""
  for col in data.columns:
    res = data[col].std(ddof=0)/data[col].mean()
    print(col,": ", res)
    # ``<=`` keeps the original tie-breaking: on equal ratios the later column wins.
    if(highest_cv<=res):
      highest_cv = res
      highest_col = col
  print("La variable con mayor dispersion es:", highest_col)

In [None]:
# Coefficient of variation for the columns at positions 5 up to, but not
# including, the last column of the combined frame.
data_dispersion(data.iloc[:,5:-1])

Dispersion de las variables:
Number vmail messages :  1.689874679146379
Total day minutes :  0.30292970727479
Total day calls :  0.19979035933482467
Total day charge :  0.3029236413908617
Total eve minutes :  0.25229449801158227
Total eve calls :  0.19896891955124832
Total eve charge :  0.25229085075256713
Total night minutes :  0.2517336923181814
Total night calls :  0.1954462187794541
Total night charge :  0.25173687349157997
Total intl minutes :  0.2726717404977856
Total intl calls :  0.5493634604337725
Total intl charge :  0.27261252323018564
Customer service calls :  0.8415960563782503
La variable con mayor dispersion es: Number vmail messages


Distribución normal

In [None]:
def data_shapiro(data, alpha=0.05):
  """Run the Shapiro-Wilk test on each column and split columns by the result.

  The p-value of every column is printed together with a verdict line.

  Args:
    data: DataFrame whose columns are passed one at a time to ``ss.shapiro``.
    alpha: Significance level. A column is listed as approximately normal
      when its p-value is strictly greater than ``alpha``. Defaults to 0.05,
      the threshold that was previously hard-coded.

  Returns:
    A tuple ``(normal_dist, not_normal_dist)`` of two lists of column names,
    each in the order the columns appear in ``data``.
  """
  print("Distribución Normal de las variables:")
  normal_dist = []
  not_normal_dist = []
  for col in data.columns:
    # Element 1 of the Shapiro result is the p-value.
    p_value = ss.shapiro(data[col])[1]
    print(col," pvalor: ", p_value)
    if(p_value > alpha):
      print("SI se aproxima a una distribución normal")
      normal_dist.append(col)
    else:
      print("NO se aproxima a una distribución normal")
      not_normal_dist.append(col)
    print("")
  return normal_dist, not_normal_dist

In [None]:
# Classify the columns at positions 5 up to (not including) the last one;
# the two resulting lists choose the correlation method in later cells.
normal_dist, not_normal_dist = data_shapiro(data.iloc[:,5:-1])
print("Normal:",normal_dist)
print("No Normal",not_normal_dist)

Distribución Normal de las variables:
Number vmail messages  pvalor:  0.0
NO se aproxima a una distribución normal

Total day minutes  pvalor:  0.6374387741088867
SI se aproxima a una distribución normal

Total day calls  pvalor:  0.0002773392479866743
NO se aproxima a una distribución normal

Total day charge  pvalor:  0.641385018825531
SI se aproxima a una distribución normal

Total eve minutes  pvalor:  0.711155354976654
SI se aproxima a una distribución normal

Total eve calls  pvalor:  0.00886891596019268
NO se aproxima a una distribución normal

Total eve charge  pvalor:  0.7044191360473633
SI se aproxima a una distribución normal

Total night minutes  pvalor:  0.6227363348007202
SI se aproxima a una distribución normal

Total night calls  pvalor:  0.25032323598861694
SI se aproxima a una distribución normal

Total night charge  pvalor:  0.6202784180641174
SI se aproxima a una distribución normal

Total intl minutes  pvalor:  8.136633727895415e-11
NO se aproxima a una distribució

Correlación

In [None]:
# Variables that approximate a normal distribution => Pearson
# Variables that do not approximate a normal distribution => Spearman
def data_correlation_matrix(data, m="spearman"):
  """Return the pairwise correlation matrix of the columns of ``data``.

  Args:
    data: DataFrame to correlate.
    m: Method name forwarded to ``DataFrame.corr`` (default ``"spearman"``).

  Returns:
    The DataFrame produced by ``data.corr(method=m)``.
  """
  corr_matrix = data.corr(method=m)
  return corr_matrix

#ss.pearsonr(data['Age'],data['EstimatedSalary'])
#ss.spearmanr(data['Age'],data['EstimatedSalary'])

def data_correlation_test(data, m="spearman"):
  """Print the correlation test result for every unordered pair of columns.

  Each pair is visited once, with the earlier column first.

  Args:
    data: DataFrame whose columns are tested pairwise.
    m: ``"spearman"`` selects ``ss.spearmanr``; any other value selects
      ``ss.pearsonr``.

  Returns:
    None. One line per pair is printed.
  """
  corr_fn = ss.spearmanr if m == "spearman" else ss.pearsonr
  names = list(data.columns)
  for pos, coli in enumerate(names):
    # Only pair with columns that come later, so no pair is repeated.
    for colj in names[pos+1:]:
      print(f"Correlacion de {coli} con {colj}:\t", corr_fn(data[coli],data[colj]))

# If the p-value < 5% => H0 is rejected; at 95% confidence, the variables are correlated
# If the p-value > 5% => H0 is not rejected; at 95% confidence, the variables are not correlated

In [None]:
# Spearman correlation matrix for the columns that data_shapiro listed as not normal.
data_correlation_matrix(data[not_normal_dist], "spearman")

Unnamed: 0,Number vmail messages,Total day calls,Total eve calls,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
Number vmail messages,1.0,-0.012396,-0.007148,-0.001565,0.006248,-0.001565,-0.019639
Total day calls,-0.012396,1.0,0.013807,0.015139,0.004148,0.015139,-0.020957
Total eve calls,-0.007148,0.013807,1.0,0.000213,0.014862,0.000213,0.002697
Total intl minutes,-0.001565,0.015139,0.000213,1.0,0.0179,1.0,-0.017374
Total intl calls,0.006248,0.004148,0.014862,0.0179,1.0,0.0179,-0.000598
Total intl charge,-0.001565,0.015139,0.000213,1.0,0.0179,1.0,-0.017374
Customer service calls,-0.019639,-0.020957,0.002697,-0.017374,-0.000598,-0.017374,1.0


In [None]:
#sns.heatmap(data_correlation_matrix(data[not_normal_dist], "spearman"))
#plt.show()

In [None]:
# Pearson correlation matrix for the columns that data_shapiro listed as normal.
data_correlation_matrix(data[normal_dist], "pearson")

In [None]:
#sns.heatmap(data_correlation_matrix(data[normal_dist], "pearson"))
#plt.show()

In [None]:
# Pairwise Spearman tests (statistic and p-value) for the not-normal columns.
data_correlation_test(data[not_normal_dist], "spearman")

Correlacion de Number vmail messages con Total day calls:	 SpearmanrResult(correlation=-0.012396183682162191, pvalue=0.4743512543301165)
Correlacion de Number vmail messages con Total eve calls:	 SpearmanrResult(correlation=-0.007148498961164186, pvalue=0.6799373590877378)
Correlacion de Number vmail messages con Total intl minutes:	 SpearmanrResult(correlation=-0.0015647670421516729, pvalue=0.9280460233194087)
Correlacion de Number vmail messages con Total intl calls:	 SpearmanrResult(correlation=0.006247535512933033, pvalue=0.7184345113989625)
Correlacion de Number vmail messages con Total intl charge:	 SpearmanrResult(correlation=-0.0015647670421516729, pvalue=0.9280460233194087)
Correlacion de Number vmail messages con Customer service calls:	 SpearmanrResult(correlation=-0.019639487465458686, pvalue=0.25699860952874515)
Correlacion de Total day calls con Total eve calls:	 SpearmanrResult(correlation=0.013807165107475983, pvalue=0.425534568256245)
Correlacion de Total day calls con

In [None]:
# Pairwise Pearson tests (statistic and p-value) for the normal columns.
data_correlation_test(data[normal_dist], "pearson")

Chi-Cuadrado

In [None]:
# Chi-square: qualitative variables
# If p-value < 5% => H0 is rejected; at 95% confidence, the variables are dependent
# If p-value > 5% => H0 is not rejected; at 95% confidence, the variables are not dependent

def data_chi_cuadrado(data, target):
  """Print a chi-square independence test of each column against ``target``.

  Args:
    data: DataFrame whose columns are each cross-tabulated with ``target``.
    target: Series to test against; its ``name`` is used in the messages.

  Returns:
    None. For every column the p-value and a verdict at the 0.05 level are
    printed.
  """
  for col in data.columns:
    # Contingency table with target values as rows and column values as columns.
    table = pd.crosstab(target, data[col])
    _, p, _, _ = chi2_contingency(table)
    print(f"{col}:\t p-value of Chi2 test: {p}")
    if p < 0.05:
      print(f"SI existe dependencia entre la variable {col} y el target {target.name}")
    else:
      print(f"NO existe dependencia entre la variable {col} y el target {target.name}")
    print("")

In [None]:
# Chi-square test of three categorical columns against the Churn target.
data_chi_cuadrado(data[['State','International plan','Voice mail plan']], data['Churn'])

State:	 p-value of Chi2 test: 0.002296221552011188
SI existe dependencia entre la variable State y el target Churn

International plan:	 p-value of Chi2 test: 2.4931077033159556e-50
SI existe dependencia entre la variable International plan y el target Churn

Voice mail plan:	 p-value of Chi2 test: 5.15063965903898e-09
SI existe dependencia entre la variable Voice mail plan y el target Churn



Violin plot

In [None]:
# Violin plots: target vs. quantitative columns, one figure per column.
violin_columns = data.columns[5:-1]
for col in violin_columns:
  sns.violinplot(x='Churn', y=col, data=data)
  plt.show()

MODELOS ML

In [None]:
# The two source files act as the split: data80 is the training set and
# data20 the test set. 'Churn' is the target; every other column is a feature.
X_train, y_train = data80.drop(columns='Churn'), data80['Churn']
X_test, y_test = data20.drop(columns='Churn'), data20['Churn']

Naive Bayes

In [None]:
# Quantitative features: the columns from position 5 onward.
nb_train_features = X_train.iloc[:,5:]
nb_test_features = X_test.iloc[:,5:]

classifier = GaussianNB()
classifier.fit(nb_train_features, y_train)
y_pred = classifier.predict(nb_test_features)
y_pred_prob = classifier.predict_proba(nb_test_features)

# ROC curve built from column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_prob[:,1])
# Numbered copies keep this model's curve, since later cells reuse the generic names.
false_positive_rate1, true_positive_rate1 = false_positive_rate, true_positive_rate

roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc1 = roc_auc

print("AUC: ",roc_auc)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead - confirm the
# target environment.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(
        classifier,
        nb_test_features,
        y_test,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

Decision Tree

In [None]:
# Quantitative features: the columns from position 5 onward.
# 5-fold grid search over tree depth, minimum leaf size and split criterion.
param_grid = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
# A fixed random_state makes the tree construction, and therefore the selected
# hyperparameters, reproducible across runs.
grid = GridSearchCV(tree.DecisionTreeClassifier(random_state=42),param_grid,refit=True,verbose=0,cv=5)
grid.fit(X_train.iloc[:,5:],y_train)
print(grid.best_estimator_)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=10)


In [None]:
# Reuse the estimator selected by the grid search instead of re-typing its
# hyperparameters by hand, so this cell cannot drift from the search result.
# With refit=True the best estimator is already fitted on the full training
# features passed to grid.fit.
classifier = grid.best_estimator_
plt.figure(figsize = (20,8))
tree.plot_tree(classifier)
plt.show()

In [None]:
# Evaluate the fitted decision tree on the test features (columns from position 5 onward).
dt_test_features = X_test.iloc[:,5:]
y_pred = classifier.predict(dt_test_features)
y_pred_prob = classifier.predict_proba(dt_test_features)

# ROC curve built from column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_prob[:,1])
# Numbered copies keep this model's curve, since later cells reuse the generic names.
false_positive_rate2, true_positive_rate2 = false_positive_rate, true_positive_rate

roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc2 = roc_auc

print("AUC: ",roc_auc)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead - confirm the
# target environment.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(
        classifier,
        dt_test_features,
        y_test,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

KNN

clásico

In [None]:
# Quantitative features: the columns from position 5 onward.
# Fit the scaler on the training features only and reuse it for the test
# features, so both sets are standardized with the same mean and variance.
# (Fitting a second scaler on the test set puts the two sets on different scales.)
scaler = StandardScaler().fit(X_train.iloc[:,5:])
X_trainK = scaler.transform(X_train.iloc[:,5:])
X_testK = scaler.transform(X_test.iloc[:,5:])

# Tuning: 5-fold cross-validated accuracy for k = 1..30.
k_range = range(1, 31)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Cross-validate on the standardized features, the same representation
    # the final model is trained on.
    scores = cross_val_score(knn, X_trainK, y_train, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())

best_Acc = max(k_scores)
# Look k up in k_range rather than assuming the range starts at 1.
best_k = k_range[k_scores.index(best_Acc)]
print(f"Best k: {best_k}, Accuracy: {best_Acc}")
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

In [None]:
# Train KNN with the selected k on the standardized training features and
# evaluate it on the standardized test features.
classifier = KNeighborsClassifier(n_neighbors=best_k)
classifier.fit(X_trainK, y_train)
y_pred = classifier.predict(X_testK)
y_pred_prob = classifier.predict_proba(X_testK)

# ROC curve built from column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_prob[:,1])
# Numbered copies keep this model's curve, since later cells reuse the generic names.
false_positive_rate3, true_positive_rate3 = false_positive_rate, true_positive_rate

roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc3 = roc_auc

print("AUC: ",roc_auc)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead - confirm the
# target environment.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(
        classifier,
        X_testK,
        y_test,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

mejor k basado en: AUC vs k

In [None]:
# Fit the scaler on the training features only and reuse it for the test
# features, so both sets are standardized with the same mean and variance.
scaler = StandardScaler().fit(X_train.iloc[:,5:])
X_trainK = scaler.transform(X_train.iloc[:,5:])
X_testK = scaler.transform(X_test.iloc[:,5:])

k_range = range(1, 100)
k_scores = []

for k in k_range:
  knn = KNeighborsClassifier(n_neighbors=k)
  # Choose k by cross-validated AUC on the training set. Scoring each k on
  # the test set would tune the model on the data later used to report its
  # performance, which makes the reported test AUC optimistic.
  fold_auc = cross_val_score(knn, X_trainK, y_train, cv=5, scoring='roc_auc')
  k_scores.append(fold_auc.mean())

best_AUC = max(k_scores)
# Look k up in k_range rather than assuming the range starts at 1.
best_k = k_range[k_scores.index(best_AUC)]
print(f"Best k: {best_k}, AUC: {best_AUC}")
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('AUC')
plt.show()

In [None]:
# Train KNN with the AUC-selected k on the standardized training features
# and evaluate it on the standardized test features.
classifier = KNeighborsClassifier(n_neighbors=best_k)
classifier.fit(X_trainK, y_train)
y_pred = classifier.predict(X_testK)
y_pred_prob = classifier.predict_proba(X_testK)

# ROC curve built from column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_prob[:,1])
# NOTE(review): the suffix 3 is the same one used by the earlier KNN
# evaluation cell, so these assignments overwrite its stored curve and AUC -
# confirm that is intended.
false_positive_rate3, true_positive_rate3 = false_positive_rate, true_positive_rate

roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc3 = roc_auc

print("AUC: ",roc_auc)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead - confirm the
# target environment.
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", 'true'),
]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(
        classifier,
        X_testK,
        y_test,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

Neural Network

In [None]:
# Tuning: 5-fold grid search over MLP architecture, activation, solver,
# L2 penalty (alpha) and learning-rate schedule.
param_grid = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# A fixed random_state makes weight initialization and shuffling repeatable,
# so the selected configuration is reproducible across runs.
# NOTE(review): the features are passed unscaled here, unlike the KNN cells;
# MLPs are usually sensitive to feature scale - consider standardizing, and
# confirm how later cells feed the test data before changing this.
grid = GridSearchCV(MLPClassifier(max_iter=500, random_state=42),param_grid,refit=True,verbose=0,cv=5)
grid.fit(X_train.iloc[:,5:],y_train)
print(grid.best_estimator_)

MLPClassifier(alpha=0.05, hidden_layer_sizes=(10, 30, 10), max_iter=500)
