# Supervised Learning (Multilabel)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from scipy import interp
from itertools import cycle

In [None]:
# Load the feature table (X) and the target table (y) from CSV, then preview
# the first rows of each.
X, y = (pd.read_csv(csv_path) for csv_path in ('X.csv', 'y.csv'))
display(X.head(), y.head())

In [None]:
# First correlation matrix: pairwise correlations between the columns of X,
# drawn as an annotated heatmap (figsize 15 x 13, "Oranges" colormap).
corr = X.corr()
figure = (15, 13)
fig, ax = pyplot.subplots(figsize=figure)
# NOTE(review): sns.set() runs after the figure/axes already exist - confirm
# the seaborn theme is meant to affect this plot and not only later ones.
sns.set()
ax = sns.heatmap(corr,annot=True,cmap="Oranges")

In [None]:
# Remove redundant features: a column is discarded when its absolute
# correlation with any column positioned before it in `corr` is >= 0.9.
is_high = np.abs(corr.values) >= 0.9
# k=1 keeps only the strict upper triangle, i.e. pairs (i, j) with i < j,
# so the earlier column of each highly correlated pair is never the one dropped
# by that pair.
columns = ~np.triu(is_high, k=1).any(axis=0)
selected_columns = X.columns[columns]
X = X[selected_columns]

In [None]:
# Second correlation matrix: recomputed on X after the >= 0.9 correlation
# filter, drawn as an annotated heatmap (figsize 15 x 13, "Greens" colormap).
corr2 = X.corr()
figure = (15, 13)
fig, ax = pyplot.subplots(figsize=figure)
# NOTE(review): sns.set() is called after the axes are created - confirm intent.
sns.set()
ax = sns.heatmap(corr2,annot=True,cmap="Greens")

In [None]:
# Additional columns removed by hand after visually inspecting the second
# correlation matrix.
manually_dropped = ['yearID','stint','Games','Strike Out','Double Plays Induced','Home Runs']
X = X.drop(columns=manually_dropped)

In [None]:
# Third correlation matrix: recomputed on X after the manual column removal,
# drawn as an annotated heatmap (figsize 15 x 13, "Reds" colormap).
corr3 = X.corr()
figure = (15, 13)
fig, ax = pyplot.subplots(figsize=figure)
# NOTE(review): sns.set() is called after the axes are created - confirm intent.
sns.set()
ax = sns.heatmap(corr3,annot=True,cmap="Reds")

# Evaluación de Modelos

In [None]:
#Logistic Regression multilabel model function with crossvalidation
def LRs(X,y,ns):
    """Cross-validate a one-vs-rest logistic regression and print its score.

    Args:
        X: Feature table passed to ``cross_val_score``.
        y: Target table passed to ``cross_val_score``.
        ns: Value forwarded as ``cv`` (number of folds or a CV splitter).

    Returns:
        None. The mean fold score and twice the standard deviation of the
        fold scores are printed, using ``cross_val_score``'s default scoring.
    """
    ovr_model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
    fold_scores = cross_val_score(ovr_model, X, y, cv=ns)
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("Accuracy: %0.4f (+/- %0.4f)" % (mean_score, spread))

#Solver options: ‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’

In [None]:
%%time
#Evaluación del modelo de regresión logística.
LRs(X,y,10)

In [None]:
#Random Forest multilabel model function with crossvalidation
def RFs(X,y,n,ns,random_state=None):
    """Cross-validate a one-vs-rest random forest and print its score.

    Args:
        X: Feature table passed to ``cross_val_score``.
        y: Target table passed to ``cross_val_score``.
        n: Number of trees (``n_estimators``) in each forest.
        ns: Value forwarded as ``cv`` (number of folds or a CV splitter).
        random_state: Optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` keeps the previous unseeded behaviour; pass
            an int to make the printed score repeatable between runs.

    Returns:
        None. The mean fold score and twice the standard deviation of the
        fold scores are printed, using ``cross_val_score``'s default scoring.
    """
    forest = RandomForestClassifier(n_estimators=n, random_state=random_state)
    cls = OneVsRestClassifier(forest)
    scores = cross_val_score(cls, X, y, cv=ns)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

In [None]:
%%time
#Evaluación del modelo Random Forest (n_estimators = 20).
RFs(X,y,20,5)

In [None]:
#K Neighbors multilabel model function with crossvalidation
def KNNs(X,y,k,ns):
    """Cross-validate a one-vs-rest k-nearest-neighbours model and print its score.

    Args:
        X: Feature table passed to ``cross_val_score``.
        y: Target table passed to ``cross_val_score``.
        k: Number of neighbours (``n_neighbors``).
        ns: Value forwarded as ``cv`` (number of folds or a CV splitter).

    Returns:
        None. The mean fold score and twice the standard deviation of the
        fold scores are printed, using ``cross_val_score``'s default scoring.
    """
    ovr_model = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=k))
    fold_scores = cross_val_score(ovr_model, X, y, cv=ns)
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("Accuracy: %0.4f (+/- %0.4f)" % (mean_score, spread))

In [None]:
%%time
#Evaluación del modelo Random Forest (n_neighbors = 10).
KNNs(X,y,10,5)

# Entrenamiento de Modelos

In [None]:
# Build the training and test tables: 10% of the rows are held out, with a
# fixed seed so the split is repeatable. The four shapes are then displayed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42,
)
display(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Train the one-vs-rest random forest (20 trees per forest) on the training split.
cls_RandomForestClassifier = OneVsRestClassifier(
    estimator=RandomForestClassifier(n_estimators=20),
)
cls_RandomForestClassifier.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows and show the result as the cell output.
y_pred = cls_RandomForestClassifier.predict(X=X_test)
y_pred

In [None]:
# Inputs for the metrics: reduce every row to the index of its largest column,
# for the true targets and for the random-forest predictions alike.
act_val_rf = np.argmax(y_test.values, axis=1)
pred_rf = np.argmax(cls_RandomForestClassifier.predict(X_test), axis=1)

In [None]:
#Precision, Recall, F1-Score
def metrics_df(actual_values,predictions):
    """Return per-class precision, recall and F1 as a labelled DataFrame.

    Args:
        actual_values: True class ids, one per sample. The rows of the result
            are named positionally, so ids are expected to be the integers
            0..3 in the order of the class names below.
        predictions: Predicted class ids, same encoding and length.

    Returns:
        DataFrame with one row per class and the columns ``precision``,
        ``recall`` and ``f1_score`` (columns axis named ``WorldSeries``).
    """
    class_names = ['No World Series Team','G1 World Series Team','G2 World Series Team','G3 World Series Team']
    # Pin the label set: without it scikit-learn only scores the classes that
    # occur in the inputs, and a class missing from both arrays would make the
    # result shorter than the four-row index (length-mismatch error).
    labels = list(range(len(class_names)))

    scores = pd.DataFrame(
        {
            'precision': precision_score(actual_values, predictions, labels=labels, average=None),
            'recall': recall_score(actual_values, predictions, labels=labels, average=None),
            'f1_score': f1_score(actual_values, predictions, labels=labels, average=None),
        },
        index=class_names,
        columns=['precision', 'recall', 'f1_score'],
    )
    scores.columns.name = 'WorldSeries'
    return scores

In [None]:
%%time
metrics_df(act_val_rf,pred_rf)

In [None]:
#ROC curve function
def roc_curve_multiclass(n_classes,X_train,X_test,y_train,y_test,y_score):
    """Plot per-class, micro-average and macro-average ROC curves.

    Args:
        n_classes: Number of classes, i.e. columns read from ``y_test`` and
            ``y_score``.
        X_train, X_test, y_train: Unused; kept so existing calls keep working.
        y_test: 2-D array of true binary indicators, indexed as
            ``y_test[:, i]`` per class.
        y_score: 2-D array of scores with the same layout as ``y_test``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fpr, tpr, roc_auc = {}, {}, {}

    # One ROC curve and its area per class.
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) pair into one binary problem.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of all false
    # positive rates, then average the true positive rates.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        # np.interp replaces scipy.interp, which was only a deprecated alias of it.
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # A single figure: the former extra plt.figure() call only produced an
    # empty figure before plt.subplots created the one actually drawn on.
    fig, ax = plt.subplots(figsize=(15,15))
    ax.plot(fpr["micro"], tpr["micro"],
            label='micro-average ROC curve (area = {0:0.2f})'
                  ''.format(roc_auc["micro"]),
            color='deeppink', linestyle=':', linewidth=4)

    ax.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
                  ''.format(roc_auc["macro"]),
            color='navy', linestyle=':', linewidth=4)

    # The palette has three entries; cycle() reuses them for further classes.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        ax.plot(fpr[i], tpr[i], color=color, lw=2,
                label='ROC curve of class {0} (area = {1:0.2f})'
                ''.format(i, roc_auc[i]))

    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Some extension of Receiver operating characteristic to multi-class')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# ROC curve inputs
y_binarize = label_binarize(y, classes=['No World Series Team','G1 World Series Team',
                                  'G2 World Series Team','G3 World Series Team'])
n_classes = y.shape[1]

# Separate 80/20 split (fixed seed) used only for the ROC analysis.
X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y_binarize, test_size=0.20,random_state=13)

cls_ = OneVsRestClassifier(RandomForestClassifier(n_estimators=20))
cls_.fit(X_train_,y_train_)
# ROC curves need continuous scores: hard 0/1 predictions collapse each curve
# to a single operating point, so use the per-class probabilities instead.
y_score = cls_.predict_proba(X_test_)

In [None]:
# Plot the ROC curves for the ROC-specific split.
roc_curve_multiclass(
    n_classes=n_classes,
    X_train=X_train_,
    X_test=X_test_,
    y_train=y_train_,
    y_test=y_test_,
    y_score=y_score,
)

In [None]:
#Función para generar matriz de confusión
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print a status line and plot the confusion matrix.

    Args:
        y_true: True class ids, one per sample.
        y_pred: Predicted class ids, one per sample.
        classes: Class names, indexed by class id (list, array or pandas
            Index). Ids are therefore expected to be integer positions.
        normalize: When True, each row is divided by its total so cells show
            the fraction of that true class. Rows without any true sample are
            shown as 0.
        title: Plot title; a default depending on ``normalize`` is used when
            empty.
        cmap: Matplotlib colormap for the cells.

    Returns:
        The matplotlib Axes holding the plot.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data. np.asarray allows a plain
    # list of names, which cannot be indexed with an array of ids directly.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class that occurs only in y_pred has an all-zero row; leave it at 0
        # instead of dividing by zero and filling the row with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize=(7,7))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show every tick and label it with the matching class name.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate each cell; text turns white on cells above half the maximum
    # value so it stays readable on the darker colours.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Draw the confusion matrix for the random-forest predictions, using the
# column names of y as class names.
plot_confusion_matrix(y_true=act_val_rf, y_pred=pred_rf, classes=y.columns, cmap='Blues')