In [19]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, cross_validate

In [5]:
# Read the churn CSV, then split it into the 'Churn' target column (y)
# and every remaining column as features (X).
dataset = pd.read_csv("../data/train/telecom_churn_train.csv")
y = dataset['Churn']
X = dataset.drop(columns=['Churn'])

In [6]:
# Cast every feature to float before resampling.
X = X.astype(float)

# Oversample only the minority class with SMOTE. The seed is fixed so the
# synthetic rows (and every metric computed from them) are identical on a
# Restart & Run All.
# NOTE(review): resampling is done here, before any cross-validation split.
# When X_smote is cross-validated later, synthetic points interpolated from
# rows in a validation fold can land in the training folds, so the reported
# scores are likely optimistic. Consider putting SMOTE inside an
# imblearn.pipeline.Pipeline so it is fit on the training folds only.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Shapes of the resampled features and target.
X_smote.shape, y_smote.shape

((5700, 10), (5700,))

In [7]:
# Repeated 10-fold cross-validation of a random forest on the resampled data.
# Each of the 30 repetitions reshuffles the folds with its own seed and records
# the mean accuracy and mean recall across the 10 folds.
random_forest_results = {}
forest_acuracy, forest_recall = [], []
for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    # Seed the forest as well as the splitter so the whole run is reproducible
    # while still varying from one repetition to the next.
    random_forest = RandomForestClassifier(random_state=i)
    # A single cross_validate call scores both metrics on the same fitted
    # models, instead of re-running the full 10-fold fit once per metric.
    cv_results = cross_validate(
        random_forest,
        X_smote,
        y_smote,
        cv=kfold,
        scoring=('accuracy', 'recall'),
    )
    # Per-fold score arrays, kept under their original names.
    random_forest_scores = cv_results['test_accuracy']
    random_forest_recalls = cv_results['test_recall']
    forest_acuracy.append(random_forest_scores.mean())
    forest_recall.append(random_forest_recalls.mean())

In [15]:
# File the per-repetition score lists under their metric names.
random_forest_results.update(accuracy=forest_acuracy, recall=forest_recall)

In [16]:
# Descriptive statistics of the scores stored in random_forest_results.
# Each statement unpacks the (accuracy, recall) pair for one statistic.

# Mean
acuracia_media_forest, recall_media_forest = (
    np.mean(random_forest_results[metric]) for metric in ('accuracy', 'recall')
)

# Median
acuracia_mediana_forest, recall_mediana_forest = (
    np.median(random_forest_results[metric]) for metric in ('accuracy', 'recall')
)

# Mode (scipy returns a ModeResult holding the most frequent value and its count)
acuracia_moda_forest, recall_moda_forest = (
    stats.mode(random_forest_results[metric]) for metric in ('accuracy', 'recall')
)

# Standard deviation (NumPy default, i.e. ddof=0)
acuracia_desvio_forest, recall_desvio_forest = (
    np.std(random_forest_results[metric]) for metric in ('accuracy', 'recall')
)

# Coefficient of variation: standard deviation relative to the mean
acuracia_cv_forest = acuracia_desvio_forest / acuracia_media_forest
recall_cv_forest = recall_desvio_forest / recall_media_forest

In [18]:
# Display the mean and the median recall side by side.
(recall_media_forest, recall_mediana_forest)

(np.float64(0.9451565906977372), np.float64(0.945483855642542))

In [20]:
# Out-of-fold predicted probabilities for every row of X_smote, taken from the
# second predict_proba column (the class that sorts last, i.e. label 1 for a
# 0/1 target).
# The estimator and the splitter are created here with fixed seeds, so this
# cell does not depend on a variable leaked from an earlier loop and gives the
# same probabilities on every run.
proba_forest = RandomForestClassifier(random_state=42)
y_proba = cross_val_predict(
    proba_forest,
    X_smote,
    y_smote,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    method="predict_proba",
)[:, 1]

In [21]:
# Inspect the out-of-fold probabilities (NumPy truncates the repr of long arrays).
y_proba

array([0.01, 0.05, 0.04, ..., 1.  , 0.52, 0.93], shape=(5700,))

In [31]:
# Turn the out-of-fold probabilities into hard labels with a custom decision
# threshold and score them against the resampled target.
# confusion_matrix, recall_score and precision_score come from the notebook's
# top import cell.

# A threshold below 0.5 labels more rows as positive; presumably chosen to
# trade precision for recall (confirm with the analysis goals).
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)

# NOTE(review): these scores are computed against y_smote, i.e. the
# SMOTE-balanced data, so precision in particular may not carry over to the
# original class distribution.
cm = confusion_matrix(y_smote, y_pred)
recall = recall_score(y_smote, y_pred)
precision = precision_score(y_smote, y_pred)
recall, precision, cm

(0.988421052631579,
 0.8659698739624961,
 array([[2414,  436],
        [  33, 2817]]))