## Modelo de aprendizaje automático utilizando Fusión a nivel de Clasificador (Late Fusion)

Teniendo en cuenta la heterogeneidad de las características (demográficas y académicas), se entrenaron cuatro clasificadores, cada uno experto en un tipo de característica (dos de ellos expertos en métricas derivadas), y al final se entrenó un clasificador teniendo en cuenta las probabilidades obtenidas en los clasificadores iniciales.

- Algoritmo de Random Forest para características demográficas y métricas demográficas
- Algoritmo de Gradient Boosting para características académicas y métricas académicas
- Algoritmo de red neuronal con configuración Multilayer Perceptron para el clasificador final de fusión

In [1]:
%matplotlib notebook
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sb
import math
import sys
import glob
import pickle
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from sklearn.model_selection import cross_validate
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report,precision_recall_curve,precision_recall_curve,precision_score,recall_score,accuracy_score,f1_score,confusion_matrix, roc_curve, auc
import itertools
from scipy import interp
from sklearn.preprocessing import label_binarize

In [2]:
# Load the student dataset; every later cell derives its features and the
# `Class` target from this frame.
data_ini = pd.read_csv('/Users/Downloads/Dataset/Final_Dataset/Datasets/Data_1449_416_edad.csv')
# Alternative dataset, currently disabled:
#data_comunas = pd.read_csv('/Users/Downloads/Dataset/Final_Dataset/Datasets/Data_2254_416_Comunas_edad.csv')

# Work on a copy so the derived column is not written into the raw frame.
data_ini_ = data_ini.copy()

# Encode the course name as a number (SEXTO -> 6 ... ONCE -> 11).
course_numbers = {
    'SEXTO': 6,
    'SEPTIMO': 7,
    'OCTAVO': 8,
    'NOVENO': 9,
    'DECIMO': 10,
    'ONCE': 11,
}
data_ini_['course_n'] = data_ini['course'].replace(course_numbers)

# Demographic view: taken from the copy because it needs `course_n`.
demog_columns = ['age_frac', 'brother_school', 'gender_n', 'score_sisben', 'distance',
                 'course_n', 'year', 'grade_n', 'Class']
data_demog = data_ini_[demog_columns]

# Academic view: one column per subject grade plus the target.
acad_columns = ['math', 'natural_sciences', 'english', 'spanish', 'peace_chair',
                'social_sciences', 'religion', 'arts', 'sports', 'technology',
                'entrepreneurship', 'ethic', 'Class']
data_acad = data_ini[acad_columns]


In [11]:
# Derived demographic metrics, computed on a private copy of the demographic view.
data_metric_d = data_demog.copy()

# Earlier variants that are currently disabled (summary only):
#   * m1_d as a 6-level binning of `score_sisben` with cut points 11, 22, 43, 65, 79;
#   * m1_d as pd.cut(brother_school, bins=[0, 0.5, 5], labels=[0, 1], include_lowest=True);
#   * m2_d forced to 0 when course_n == grade_n or when the value computed below is negative.

# Box-Cox transform of `brother_school`. The 1e-10 shift is presumably there so
# that zero counts become strictly positive, as scipy's boxcox requires.
df = np.array(data_metric_d['brother_school'] + 1e-10)
fitted_data, fitted_lambda = stats.boxcox(df)

# Attach the transformed values as `cox_hec`; concat aligns the two frames on
# their row index.
pd_hc = pd.DataFrame({'cox_hec': fitted_data})
data_metric_d = pd.concat([data_metric_d, pd_hc], axis=1)

# m1_d: age minus (course number + 5) -- presumably the gap to the nominal age
# for the course; TODO confirm.
data_metric_d['m1_d'] = data_metric_d['age_frac'] - (data_metric_d['course_n'] + 5)

# m2_d: (2019 - year + course_n) - grade_n.
data_metric_d['m2_d'] = (
    2019 - data_metric_d['year'] + data_metric_d['course_n'] - data_metric_d['grade_n']
)

# Keep only the derived metrics and the target for the metric-based expert.
metric_d_columns = ['cox_hec', 'm1_d', 'm2_d', 'Class']
data_metric_d_ = data_metric_d[metric_d_columns]

data_metric_d_



Unnamed: 0,cox_hec,m1_d,m2_d,Class
0,1.000000e-10,0.70278,0,0
1,-5.503162e+01,0.85833,0,0
2,6.772553e-01,1.06944,0,0
3,-5.503162e+01,0.34444,0,0
4,-5.503162e+01,0.88333,2,0
...,...,...,...,...
1860,6.772553e-01,0.86667,0,1
1861,-5.503162e+01,0.83611,0,1
1862,1.000000e-10,4.22222,0,1
1863,-5.503162e+01,1.88333,0,1


In [None]:
# Derived academic metrics, computed on a private copy of the academic view.
data_metric_a = data_acad.copy()

# Pairwise subject means: new column -> the two grades being averaged.
pair_means = {
    'mCM': ('natural_sciences', 'math'),
    'mSC': ('social_sciences', 'peace_chair'),
    'mCh': ('spanish', 'english'),
    'mEr': ('ethic', 'religion'),
    'mAt': ('arts', 'technology'),
}
for metric_name, (first_subject, second_subject) in pair_means.items():
    data_metric_a[metric_name] = (
        data_metric_a[first_subject] + data_metric_a[second_subject]
    ) / 2

# mah: mean of five subjects; mbh: mean of the remaining seven.
data_metric_a['mah'] = (
    data_metric_a['natural_sciences']
    + data_metric_a['math']
    + data_metric_a['social_sciences']
    + data_metric_a['spanish']
    + data_metric_a['english']
) / 5
data_metric_a['mbh'] = (
    data_metric_a['technology']
    + data_metric_a['religion']
    + data_metric_a['peace_chair']
    + data_metric_a['arts']
    + data_metric_a['ethic']
    + data_metric_a['entrepreneurship']
    + data_metric_a['sports']
) / 7

# m1_a..m4_a: one subject grade divided by 5, multiplied by the mean of `mah`
# and `mbh`.
overall_mean = (data_metric_a['mah'] + data_metric_a['mbh']) / 2
weighted_subjects = (
    ('m1_a', 'natural_sciences'),
    ('m2_a', 'entrepreneurship'),
    ('m3_a', 'math'),
    ('m4_a', 'sports'),
)
for metric_name, subject in weighted_subjects:
    data_metric_a[metric_name] = (data_metric_a[subject] / 5) * overall_mean

# Keep only the derived metrics and the target for the metric-based expert.
metric_a_columns = ['mCM', 'mSC', 'mCh', 'mEr', 'mAt', 'mah', 'mbh',
                    'm1_a', 'm2_a', 'm3_a', 'm4_a', 'Class']
data_metric_a_ = data_metric_a[metric_a_columns]

data_metric_a_

In [None]:
# Demographic expert: separate the features from the target and hold out 30%
# of the rows for testing.
y_d = data_demog[['Class']]
data_demog_f = data_demog.drop(columns='Class')

dtrain, dtest, dytrain, dytest = train_test_split(
    data_demog_f, y_d, test_size=0.3, random_state=0
)

# Class balance of the training labels.
dytrain['Class'].value_counts()


In [None]:
# Demographic-metric expert: separate the derived metrics from the target and
# hold out 30% of the rows for testing.
y_md = data_metric_d_[['Class']]
data_metric_df = data_metric_d_.drop(columns='Class')

mdtrain, mdtest, mdytrain, mdytest = train_test_split(
    data_metric_df, y_md, test_size=0.3, random_state=0
)

# Class balance of the training labels.
mdytrain['Class'].value_counts()

In [None]:
# Academic expert: separate the subject grades from the target and hold out
# 30% of the rows for testing.
data_acad_f = pd.DataFrame(data_acad).drop('Class', axis=1)

y_a = data_acad[['Class']]

atrain, atest, aytrain, aytest = train_test_split(data_acad_f, y_a, random_state=0, test_size = 0.3)

# Class balance of this split's training labels. The previous code read the
# undefined name `ydtrain`, which raises NameError on a fresh kernel.
aytrain['Class'].value_counts()

In [None]:
# Academic-metric expert: separate the derived metrics from the target and
# hold out 30% of the rows for testing.
data_metric_af = pd.DataFrame(data_metric_a_).drop('Class', axis=1)

y_ma = data_metric_a_[['Class']]

matrain, matest, maytrain, maytest = train_test_split(data_metric_af, y_ma, random_state=0, test_size = 0.3)

# Class balance of this split's training labels. The previous code read the
# undefined name `ydtrain`, which raises NameError on a fresh kernel.
maytrain['Class'].value_counts()

In [None]:
# Demographic expert: Random Forest on standardised demographic features.
# The scaler is fitted on the training rows only and then applied to both parts.
# NOTE(review): the name `sc` is reused by the other expert cells and the fitted
# scaler is not persisted next to the model; whatever reloads the .pkl needs the
# same scaling -- confirm it is saved elsewhere.
sc = StandardScaler().fit(dtrain)
X_train_d = sc.transform(dtrain)
X_test_d = sc.transform(dtest)

filename_model_d = 'model_api_demog.pkl'

# Single-point grid: GridSearchCV contributes the 5-fold ROC-AUC score and the
# final refit on the whole training set.
tuned_parameters = [{'max_features': ['sqrt'], 'min_samples_leaf': [0.001],
                     'n_estimators': [700], 'max_depth':[50],
                     'min_samples_split':[2]}]
# random_state pins the forest so the persisted model is reproducible, in line
# with the fixed seed already used for the train/test split.
clf_demog = GridSearchCV(RandomForestClassifier(bootstrap = True,class_weight='balanced',
                                                random_state=0),
                                 tuned_parameters, cv=5, scoring='roc_auc')

# Pass the target as a 1-D Series: a single-column DataFrame makes scikit-learn
# emit DataConversionWarning on every fit.
clf_demog.fit(X_train_d, dytrain['Class'])

joblib.dump(clf_demog, filename_model_d)
print(clf_demog.best_params_)

# Hold-out evaluation.
y_true1, y_pred1 = dytest, clf_demog.predict(X_test_d)
print(classification_report(y_true1, y_pred1))

print(precision_score(y_true1, y_pred1))
print(recall_score(y_true1, y_pred1))
print(f1_score(y_true1, y_pred1))
print(accuracy_score(y_true1, y_pred1))
confusion_matrix(y_true1, y_pred1)

In [None]:
# Demographic-metric expert: Random Forest on the standardised derived
# demographic metrics. The scaler is fitted on the training rows only.
# NOTE(review): the name `sc` is reused by the other expert cells and the fitted
# scaler is not persisted next to the model; whatever reloads the .pkl needs the
# same scaling -- confirm it is saved elsewhere.
sc = StandardScaler().fit(mdtrain)
X_train_md = sc.transform(mdtrain)
X_test_md = sc.transform(mdtest)

filename_model_md = 'model_api_md.pkl'

# Single-point grid: GridSearchCV contributes the 5-fold ROC-AUC score and the
# final refit on the whole training set.
tuned_parameters = [{'max_features': [3], 'min_samples_leaf': [0.1],
                     'n_estimators': [50], 'max_depth':[10],
                     'min_samples_split':[10]}]
# random_state pins the forest so the persisted model is reproducible, in line
# with the fixed seed already used for the train/test split.
clf_md = GridSearchCV(RandomForestClassifier(bootstrap = True,class_weight='balanced',
                                             random_state=0),
                                 tuned_parameters, cv=5, scoring='roc_auc')

# Pass the target as a 1-D Series: a single-column DataFrame makes scikit-learn
# emit DataConversionWarning on every fit.
clf_md.fit(X_train_md, mdytrain['Class'])

joblib.dump(clf_md, filename_model_md)
print(clf_md.best_params_)

# Hold-out evaluation.
y_true1, y_pred1 = mdytest, clf_md.predict(X_test_md)
print(classification_report(y_true1, y_pred1))

print(precision_score(y_true1, y_pred1))
print(recall_score(y_true1, y_pred1))
print(f1_score(y_true1, y_pred1))
print(accuracy_score(y_true1, y_pred1))
confusion_matrix(y_true1, y_pred1)

In [None]:
# Academic expert: Gradient Boosting on the standardised subject grades.
# The scaler is fitted on the training rows only and then applied to both parts.
# NOTE(review): the name `sc` is reused by the other expert cells and the fitted
# scaler is not persisted next to the model; whatever reloads the .pkl needs the
# same scaling -- confirm it is saved elsewhere.
sc = StandardScaler().fit(atrain)
X_train_a = sc.transform(atrain)
X_test_a = sc.transform(atest)

filename_model_a = 'model_api_a.pkl'
# Single-point grid: GridSearchCV contributes the 5-fold ROC-AUC score and the
# final refit on the whole training set.
tuned_parameters = [{'max_features': ['sqrt'], 'min_samples_leaf': [0.01],
                     'n_estimators': [100], 'max_depth':[4],'learning_rate':[0.01],
                     'min_samples_split':[2]}]

# max_features='sqrt' makes each split draw a random feature subset, so the
# estimator is seeded to keep the persisted model reproducible.
clf_acad = GridSearchCV(GradientBoostingClassifier(random_state=0),tuned_parameters, cv=5, scoring='roc_auc')

# Pass the target as a 1-D Series: a single-column DataFrame makes scikit-learn
# emit DataConversionWarning on every fit.
clf_acad.fit(X_train_a, aytrain['Class'])

joblib.dump(clf_acad, filename_model_a)
print(clf_acad.best_params_)

# Hold-out evaluation.
y_true1, y_pred1 = aytest, clf_acad.predict(X_test_a)
print(classification_report(y_true1, y_pred1))

print(precision_score(y_true1, y_pred1))
print(recall_score(y_true1, y_pred1))
print(f1_score(y_true1, y_pred1))
print(accuracy_score(y_true1, y_pred1))
confusion_matrix(y_true1, y_pred1)


In [None]:
# Academic-metric expert: Gradient Boosting on the standardised derived
# academic metrics. The scaler is fitted on the training rows only.
# NOTE(review): the name `sc` is reused by the other expert cells and the fitted
# scaler is not persisted next to the model; whatever reloads the .pkl needs the
# same scaling -- confirm it is saved elsewhere.
sc = StandardScaler().fit(matrain)
X_train_ma = sc.transform(matrain)
X_test_ma = sc.transform(matest)

filename_model_ma = 'model_api_ma.pkl'
# Single-point grid: GridSearchCV contributes the 5-fold ROC-AUC score and the
# final refit on the whole training set.
tuned_parameters = [{'max_features': ['sqrt'], 'min_samples_leaf': [0.003],
                     'n_estimators': [200], 'max_depth':[4],'learning_rate':[0.008],
                     'min_samples_split':[2]}]

# max_features='sqrt' makes each split draw a random feature subset, so the
# estimator is seeded to keep the persisted model reproducible.
clf_ma = GridSearchCV(GradientBoostingClassifier(random_state=0),tuned_parameters, cv=5, scoring='roc_auc')

# Pass the target as a 1-D Series: a single-column DataFrame makes scikit-learn
# emit DataConversionWarning on every fit.
clf_ma.fit(X_train_ma, maytrain['Class'])

joblib.dump(clf_ma, filename_model_ma)
print(clf_ma.best_params_)

# Hold-out evaluation.
y_true1, y_pred1 = maytest, clf_ma.predict(X_test_ma)
print(classification_report(y_true1, y_pred1))

print(precision_score(y_true1, y_pred1))
print(recall_score(y_true1, y_pred1))
print(f1_score(y_true1, y_pred1))
print(accuracy_score(y_true1, y_pred1))
confusion_matrix(y_true1, y_pred1)



In [None]:
from sklearn.model_selection import cross_val_predict

# Late fusion: the class-probability vectors of the four experts (demographic,
# academic, demographic metrics, academic metrics) are concatenated and used as
# the input of a final classifier.

# The four train/test splits were made independently; they line up row for row
# only if every split used the same seed and size on frames sharing one index.
# Check that before reusing `aytrain` / `aytest` as the labels of the fused rows.
for _name, _frame in (('atrain', atrain), ('mdtrain', mdtrain), ('matrain', matrain)):
    if not dtrain.index.equals(_frame.index):
        raise ValueError('late fusion: training rows of %s do not match dtrain' % _name)
for _name, _frame in (('atest', atest), ('mdtest', mdtest), ('matest', matest)):
    if not dtest.index.equals(_frame.index):
        raise ValueError('late fusion: test rows of %s do not match dtest' % _name)

# 1-D target, so scikit-learn does not emit DataConversionWarning.
_y_train = aytrain['Class']

# (fitted expert, scaled training features, scaled test features), in the column
# order of the fused representation.
_experts = (
    (clf_demog, X_train_d, X_test_d),
    (clf_acad, X_train_a, X_test_a),
    (clf_md, X_train_md, X_test_md),
    (clf_ma, X_train_ma, X_test_ma),
)

# Training representation: out-of-fold probabilities. Each row is scored by a
# clone of the expert that did not see that row during fitting. Scoring the
# training rows with the already-fitted experts would feed the final classifier
# over-confident probabilities that it never meets on unseen data.
late_train_dam = np.hstack([
    cross_val_predict(_clf.best_estimator_, _X_tr, _y_train, cv=5, method='predict_proba')
    for _clf, _X_tr, _ in _experts
])
# Test representation: probabilities from the experts fitted on the full
# training set (`predict_proba` returns one probability per class).
late_test_dam = np.hstack([_clf.predict_proba(_X_te) for _clf, _, _X_te in _experts])


# Train and evaluate the final classifier on the late-fusion representation.
stdSlr3 = StandardScaler().fit(late_train_dam)
late_train_sc = stdSlr3.transform(late_train_dam)
late_test_sc =  stdSlr3.transform(late_test_dam)

filename_latefusion = 'model_latefusion.pkl'

# Disabled alternatives for the final classifier (summary only):
#   * RBF-kernel SVC(probability=True), grids over gamma and C, to be saved as
#     'clf_latefusion_metrics_svm.pkl';
#   * GradientBoostingClassifier(random_state=0) with n_estimators in {100, 500}.

tuned_parameters = [{'activation': ['relu'],
                     'solver': ['adam'],
                     'hidden_layer_sizes': [[11,9,4]],
                     'alpha': [0.1], 'batch_size': [64]}]
model_latefusion = GridSearchCV(MLPClassifier(random_state=0), tuned_parameters, cv=5, scoring='roc_auc')

model_latefusion.fit(late_train_sc, _y_train)

joblib.dump(model_latefusion, filename_latefusion)

print(model_latefusion.best_params_)

# Hold-out evaluation of the fused model.
y_true3_m, y_pred3_m = aytest, model_latefusion.predict(late_test_sc)
print(classification_report(y_true3_m, y_pred3_m))
print(precision_score(y_true3_m, y_pred3_m))
print(recall_score(y_true3_m, y_pred3_m))
print(f1_score(y_true3_m, y_pred3_m))
print(accuracy_score(y_true3_m, y_pred3_m))
confusion_matrix(y_true3_m, y_pred3_m)