In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as kr
import keras
import matplotlib.pyplot as plt
import joblib
import warnings

# Modelos
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from keras.models import Sequential
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RepeatedKFold,
    GridSearchCV,
    RandomizedSearchCV
)

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
    precision_score,
    recall_score,
    f1_score
)
from keras.layers import Dropout
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from keras.models import load_model
from keras.utils import np_utils
from tqdm import tqdm
warnings.filterwarnings('ignore')

ImportError: cannot import name 'np_utils' from 'keras.utils' (/usr/local/lib/python3.10/dist-packages/keras/utils/__init__.py)

In [None]:
def get_X_y(df):
    FEATURES = ['A_t','B_t','C_t','D_t','E_t','Valor_1','Valor_2','Valor_3',
               'Valor_4','Valor_5','Valor_6','Valor_7','Valor_8','Valor_9','Valor_10']
    TARGET = 'label'
    #TARGET = 'INTENSIDAD'
    X = df[FEATURES]
    y = df[TARGET]

    return X, y

In [None]:
df_prueba = pd.read_csv("Ruta_Dataset/tu_archivo.csv", delimiter=";")

In [None]:
df_prueba.columns

In [None]:
df_prueba['label'].value_counts()

In [None]:
# Fusionar intensidades si es necesario
if len(df_prueba['label'].value_counts()) != 2:
    df_prueba['label'] = df_prueba['label'].apply(lambda x: 1 if x==2 else (x))
    df_prueba['label'] = df_prueba['label'].apply(lambda x: 1 if x==3 else (x))
    df_prueba['label'] = df_prueba['label'].apply(lambda x: 1 if x==4 else (x))
    df_prueba['label'] = df_prueba['label'].apply(lambda x: 1 if x==5 else (x))
    df_prueba['label'] = df_prueba['label'].apply(lambda x: 1 if x==6 else (x))

In [None]:
df_prueba['label'].value_counts()

# BUCLE MODELO BINARIO

In [None]:
# Separar las intensidades 1 y 0 (odio / no odio)

intensidad_0_df_bucle = df_prueba[df_prueba['label'] == 0.0]
intensidad_1_df_bucle = df_prueba[df_prueba['label'] == 1.0]

rows_count_i0 = len(intensidad_0_df_bucle.index)
rows_count_i1 = len(intensidad_1_df_bucle.index)

list_row_count = [
    rows_count_i0,
    rows_count_i1,
]
print(f'Total: {len(df_prueba.index)}')
print(f'Intensidad 0: {rows_count_i0}')
print(f'Intensidad 1: {rows_count_i1}')

In [None]:
# Bucle para obtener el mejor 'trozo' del dataset de no odio, aqui se hacen entrenamientos
# independientes, como se ha explicado en el informe 1
inicio = 0
minimo = min(list_row_count)
accuracy_list_gb = []
metrics_list = []
for i in tqdm(range(inicio, rows_count_i0, minimo)):
    if i == 0:
        i = minimo
    if inicio != i:
        intensidad_0_df_total = df_prueba[df_prueba['label'] == 0.0]
        intensidad_0_df = intensidad_0_df_total.iloc[inicio:i]
        balanced_df = pd.concat([intensidad_0_df, intensidad_1_df_bucle])

        X, y = get_X_y(balanced_df)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

        gradient = GradientBoostingClassifier(
            n_estimators = 100,
            max_depth = 3,
            learning_rate = 0.1,
            max_features = 'sqrt'
         )

        gradient.fit(X_train, y_train)
        pred = gradient.predict(X_test)
        acc_score = accuracy_score(pred, y_test)

        precision_micro = precision_score(y_test, pred, average='binary')
        #print(f'Precision micro: ' "{0:.4f}".format(precision_micro))

        recall_micro = recall_score(y_test, pred, average='binary')
        #print(f'Recall micro: ' "{0:.4f}".format(recall_micro))

        f1_micro = f1_score(y_test, pred, average='binary')
        #print(f'F1 micro: ' "{0:.4f}".format(f1_micro))

        metrics_list.append({
            'Inicio':inicio,
            'Fin':i,
            'Precision':precision_micro,
            'Recall':recall_micro,
            'F1':f1_micro,
            'Accuracy':acc_score
        })

        if acc_score >= 0.85:
            accuracy_list_gb.append({
                'Inicio':inicio,
                'Fin':i,
                'Accuracy':acc_score
            })

    inicio = i

In [None]:
accuracy_list_gb

In [None]:
# Recorrer la lista de diccionarios y guardar los valores de Accuracy en una lista
# (transformar lista de diccionarios en list solo de accuracy)
lista_valores_acc = [item['Accuracy'] for item in accuracy_list_gb]
lista_valores_acc

In [None]:
# Obtener valores Inicio y Fin en función de valor máximo de Accuracy
maximo = max(lista_valores_acc)
inicio_entrenar = 0
fin_entrenar = 0
for diccionario in accuracy_list_gb:
    if diccionario['Accuracy'] == maximo:
        inicio_entrenar = diccionario['Inicio']
        fin_entrenar = diccionario['Fin']
print(f"Para el valor de Accuracy {maximo}, Inicio es {inicio_entrenar} y Fin es {fin_entrenar}.")

In [None]:
# Obtener el trozo del dataset de no odio donde mayor accuracy se haya obtenido, unirlo con el de odio
# y separar en X e Y para entrenar los modelos solo con esa parte.
intensidad_0_df_total = df_prueba[df_prueba['label'] == 0.0]
intensidad_0_df = intensidad_0_df_total.iloc[inicio_entrenar:fin_entrenar]
balanced_df = pd.concat([intensidad_0_df, intensidad_1_df_bucle])

X, y = get_X_y(balanced_df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
len(intensidad_0_df)

In [None]:
len(intensidad_1_df_bucle)

In [None]:
len(balanced_df)

In [None]:
y_train.value_counts()

In [None]:
# Entrenar modelo con valores Inicio y Fin según el mayor Accuracy

gradient = GradientBoostingClassifier(
    n_estimators = 100,
    max_depth = 3,
    learning_rate = 0.1,
    max_features = 'sqrt'
)

gradient.fit(X_train, y_train)
pred = gradient.predict(X_test)
acc_score = accuracy_score(pred, y_test)
precision = classification_report(y_test, pred)

print('Accuracy:',acc_score)

recall_micro = recall_score(y_test, pred, average='micro')
print(f'Recall micro: ' "{0:.4f}".format(recall_micro))

precision_micro = precision_score(y_test, pred, average='micro')
print(f'Precision micro: ' "{0:.4f}".format(precision_micro))

f1_micro = f1_score(y_test, pred, average='micro')
print(f'F1 micro: ' "{0:.4f}".format(f1_micro))

print(precision)

In [None]:
labels = [0,1]
GradientBoosting = confusion_matrix(y_true = y_test, y_pred = pred, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = GradientBoosting
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

RANDOM FOREST

In [None]:
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,4,5,6],
    'criterion' :['gini', 'entropy']
}

rf =RandomForestClassifier()

forest_grid = GridSearchCV(estimator = rf, param_grid=param_grid_rf, cv= 5)
forest_grid.fit(X_train, y_train)
best_params_RF = forest_grid.best_params_
best_model_RF = forest_grid.best_estimator_
print(f'Mejores parámetros: {best_params_RF}')
predictions = best_model_RF.predict(X_test)


rf_acc = accuracy_score(predictions, y_test)
print('Accuracy',rf_acc)

report = classification_report(y_test, predictions)
print('Reporte de clasificación:\n', report)

In [None]:
labels = [0,1]
RandomForest = confusion_matrix(y_true = y_test, y_pred = predictions, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = RandomForest
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

NAIVE BAYES

In [None]:
gnb = BernoulliNB(
            alpha=1.0,
            binarize=0.0,
            fit_prior=True,
            class_prior=None,
        )

gnb.fit(X_train, y_train)
pred_nb = gnb.predict(X_test)
gnb_acc = accuracy_score(pred, y_test)
print('Accuracy',gnb_acc)

In [None]:
report_nb = classification_report(y_test, pred_nb)
print('Reporte de clasificación:\n', report_nb)

In [None]:
labels = [0,1]
NaiveBayes = confusion_matrix(y_true = y_test, y_pred = pred_nb, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = NaiveBayes
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

MLP

In [None]:
opt_sgd = SGD(
    learning_rate=0.01,
    momentum=0.9,
    nesterov=True
)

opt_adam = Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
)

# Crear modelo
model_mlp = Sequential()
model_mlp.add(Dense(64, input_dim=15, activation='relu')) # Capa de entrada
model_mlp.add(Dense(32, activation='relu'))
model_mlp.add(Dense(1, activation='sigmoid')) # Capa de salida

# Compilar modelo
model_mlp.compile(
    loss='binary_crossentropy',
    #optimizer='rmsprop',
    optimizer=opt_adam,
    metrics=['accuracy', AUC()])


# Callback para parada temprana
early_stop = EarlyStopping(monitor='loss', patience=100, restore_best_weights=True)

# Entrenar modelo
history = model_mlp.fit(
    X_train, y_train, epochs=5000,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[early_stop])

predicted_mlp_balanced = model_mlp.predict(X_test)

# Evaluar modelo
print('===== Train =====')
result_train = model_mlp.evaluate(X_train, y_train)
print('===== Test =====')
result_test = model_mlp.evaluate(X_test, y_test)

In [None]:
# Convertir valores continuos del resultado en valores binarios
y_pred_binary = (predicted_mlp_balanced > 0.5)

In [None]:
report_mlp = classification_report(y_test, y_pred_binary)
print('Reporte de clasificación:\n', report_mlp)

In [None]:
labels = [0,1]
MLP_matrix = confusion_matrix(y_true = y_test, y_pred = y_pred_binary, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = MLP_matrix
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

CART

In [None]:
cart = DecisionTreeClassifier()

cart.fit(X_train, y_train)

cart_pred = cart.predict(X_test)

accuracy = accuracy_score(y_test, cart_pred)
print("Accuracy:", accuracy)

In [None]:
report_cart = classification_report(y_test, cart_pred)
print('Reporte de clasificación:\n', report_cart)

In [None]:
labels = [0,1]
cart_matrix = confusion_matrix(y_true = y_test, y_pred = cart_pred, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = cart_matrix
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

SVM

In [None]:
param_grid_svm = {
        "C": [1e3, 1e5],
        "gamma":[1e-4, 1e-1],
    }

svm_grid = RandomizedSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid_svm, n_iter=4)
svm_grid.fit(X_train, y_train)
predict_svm_grid = svm_grid.predict(X_test)
acc_svm = accuracy_score(y_test, predict_svm_grid)
print(acc_svm)

In [None]:
labels = [0,1]
SVM_matrix = confusion_matrix(y_true = y_test, y_pred = predict_svm_grid, normalize='true', labels = labels)

confusionMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix = SVM_matrix
)

confusionMatrixDisplay.plot(cmap='Blues')
plt.show()

In [None]:
report_svm = classification_report(y_test, predict_svm_grid)
print('Reporte de clasificación:\n', report_svm)

In [None]:
# Guardar modelo si se necesita
#joblib.dump(gradient, 'modelos_guardados/GradientBoosting_binario.pkl')