# Trabajo de Fin de Grado: Aplicación de Inteligencia artificial a la predicción de los efectos de la radiación en sistemas digitales complejos

### Pablo Darós Pallarés

In [None]:
import ast

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier


Se cargan los dataframes desde los ficheros CSV

In [None]:
# Never truncate long cell contents when a dataframe is displayed.
pd.set_option('display.max_colwidth', None)

# Dataset variant read from 'radecs_one_hot_label.csv'.
df_one_hot = pd.read_csv(
    'radecs_one_hot_label.csv',
    delimiter=",",
)

# Dataset variant read from 'radecs_unique_label.csv'; its 'SEE' column is
# the one expanded into per-label targets further down.
df_unique_label = pd.read_csv(
    'radecs_unique_label.csv',
    delimiter=",",
)

Se generan X e Y para entrenamiento y test

In [None]:
# Normalise a raw SEE cell (list or str) into a list of labels
def parse_see_value(value):
    """Return the SEE labels contained in *value* as a list.

    Args:
        value: A raw cell of the ``SEE`` column. It may already be a list,
            a string holding the textual form of a list/tuple/set
            (e.g. ``"['SEU', 'SEL']"``), a plain label string, or any
            other scalar.

    Returns:
        A list of labels. Lists are returned unchanged; strings that parse
        as a Python list/tuple/set literal are returned as a list of their
        elements; a quoted string literal yields a one-element list with
        the unquoted text; every other input is wrapped as ``[value]``.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return [value]

    try:
        # literal_eval only accepts Python literals. Unlike eval() it never
        # executes code coming from the CSV and never resolves a label that
        # happens to match a builtin/global name (e.g. "len") to an object.
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal: treat the whole string as one label.
        return [value]

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    # Numbers, dicts, None...: keep the original text as a single label so
    # the result is always an iterable of labels.
    return [value]

# Normalise every SEE cell into a list of labels
df_unique_label['SEE'] = df_unique_label['SEE'].apply(parse_see_value)

# Multi-label encoding of SEE: one 0/1 indicator column per distinct label
mlb = MultiLabelBinarizer()
see_encoded = mlb.fit_transform(df_unique_label['SEE'])

# Wrap the indicator matrix in a dataframe aligned on the original index
see_encoded_df = pd.DataFrame(
    see_encoded,
    columns=mlb.classes_,
    index=df_unique_label.index,
)

# Append the indicator columns, then discard the original SEE column
df_unique_label = pd.concat([df_unique_label, see_encoded_df], axis=1)
df_unique_label = df_unique_label.drop(columns='SEE')

# Turn the remaining categorical (object-dtype) columns into dummy variables
categorical_cols = df_unique_label.select_dtypes(include=['object']).columns
df_unique_label = pd.get_dummies(
    df_unique_label,
    columns=categorical_cols,
    drop_first=True,
)

# Strip every character other than letters, digits and '_' from column names
df_unique_label.columns = df_unique_label.columns.str.replace(
    '[^A-Za-z0-9_]+', '', regex=True
)

# Ensure every column name is unique
def make_unique(column_names):
    """Rename duplicated entries of *column_names* so that all are unique.

    The first occurrence of a name is kept; each later occurrence becomes
    ``"<name>_<k>"`` with the smallest counter ``k >= 2`` that does not
    clash with a name already emitted.

    Args:
        column_names: Mutable list of names. It is modified in place.

    Returns:
        The same list object, with duplicates renamed.
    """
    # Maps every name emitted so far (original or generated) to the last
    # suffix counter used for it.
    seen = {}
    for i, name in enumerate(column_names):
        if name not in seen:
            seen[name] = 1
            continue

        seen[name] += 1
        new_name = f"{name}_{seen[name]}"
        while new_name in seen:
            seen[name] += 1
            new_name = f"{name}_{seen[name]}"
        column_names[i] = new_name
        # Record the generated name too; otherwise a later column that is
        # literally called e.g. "a_2" would collide with it.
        seen[new_name] = 1
    return column_names

df_unique_label.columns = make_unique(list(df_unique_label.columns))

# Sanitise the label names with the same rule that was applied to the
# dataframe columns, so they keep matching whatever special characters a
# label contains (not only ':' and ',').
label_columns = list(
    pd.Index(mlb.classes_).str.replace('[^A-Za-z0-9_]+', '', regex=True)
)
class_names = list(label_columns)

# Comment out these lines to reproduce the original results ----------
# These 7 labels are dropped as targets because they are too infrequent
removed_labels = ['SEUAlpha', 'SEUPAlpha', 'SEUPNAlpha', 'SEUe', 'SETLP', 'SEFIHPL', 'SEGRN']
class_names = [name for name in class_names if name not in removed_labels]
# ----------

# Build X and y. Every label column is excluded from X, including the ones
# dropped as targets above: leaving them in would feed label information to
# the models as input features.
X = df_unique_label.drop(columns=label_columns)
y = df_unique_label[class_names].values

# Train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validation splitter shared by every grid search below
kf = KFold(n_splits=5, shuffle=True, random_state=42)


##### Random Forest

In [None]:
# Hyper-parameter optimisation.
# The estimator is seeded like the train/test split and the CV splitter so
# that repeated runs give the same forests and the same scores.
rf_model = RandomForestClassifier(random_state=42)
# The 'estimator__' prefix routes each parameter to the model wrapped by
# OneVsRestClassifier.
rf_params = {
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [None, 10, 20]
}

# Configure GridSearchCV: one binary classifier per label, scored with
# micro-averaged F1 over the `kf` folds
rf_clf = OneVsRestClassifier(rf_model)
rf_grid_search = GridSearchCV(
    rf_clf,
    rf_params,
    scoring=make_scorer(f1_score, average='micro', zero_division=0),
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

rf_grid_search.fit(X_train, y_train)

# Evaluate the best configuration on the held-out test set
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)
rf_report = classification_report(y_test, rf_y_pred, target_names=class_names, zero_division=0)

print("Random Forest")
print(f"Mejores hiperparámetros: {rf_grid_search.best_params_}")
print(rf_report)


##### Gradient Boosting

In [None]:
# Hyper-parameter optimisation.
# The estimator is seeded like the train/test split and the CV splitter so
# that repeated runs are reproducible.
gb_model = GradientBoostingClassifier(random_state=42)
# The 'estimator__' prefix routes each parameter to the model wrapped by
# OneVsRestClassifier.
gb_params = {
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.1, 0.01],
    'estimator__max_depth': [3, 5]
}

# Configure GridSearchCV: one binary classifier per label, scored with
# micro-averaged F1 over the `kf` folds
gb_clf = OneVsRestClassifier(gb_model)
gb_grid_search = GridSearchCV(
    gb_clf,
    gb_params,
    scoring=make_scorer(f1_score, average='micro', zero_division=0),
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

gb_grid_search.fit(X_train, y_train)

# Evaluate the best configuration on the held-out test set
gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)
gb_report = classification_report(y_test, gb_y_pred, target_names=class_names, zero_division=0)

print("Gradient Boosting")
print(f"Mejores hiperparámetros: {gb_grid_search.best_params_}")
print(gb_report)


##### SVM

In [None]:
# Search space: regularisation strength C, linear kernel only.
# The 'estimator__' prefix addresses the SVC wrapped by OneVsRestClassifier.
svm_params = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear']
}
svm_model = SVC(probability=True)

# One-vs-rest wrapper (one binary SVC per label) tuned by grid search,
# scored with micro-averaged F1 over the `kf` folds
svm_clf = OneVsRestClassifier(svm_model)
svm_scorer = make_scorer(f1_score, average='micro', zero_division=0)
svm_grid_search = GridSearchCV(
    svm_clf,
    svm_params,
    scoring=svm_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
svm_grid_search.fit(X_train, y_train)

# Score the winning configuration on the held-out test set
svm_best_model = svm_grid_search.best_estimator_
svm_y_pred = svm_best_model.predict(X_test)
svm_report = classification_report(
    y_test,
    svm_y_pred,
    target_names=class_names,
    zero_division=0,
)

print("SVM (Linear)")
print(f"Mejores hiperparámetros: {svm_grid_search.best_params_}")
print(svm_report)


##### MLP

In [None]:
# Hyper-parameter optimisation.
# The network is seeded like the train/test split and the CV splitter;
# without a seed, weight initialisation differs between runs and the
# reported scores cannot be reproduced.
mlp_model = MLPClassifier(max_iter=1000, random_state=42)
# The 'estimator__' prefix routes each parameter to the model wrapped by
# OneVsRestClassifier.
mlp_params = {
    'estimator__hidden_layer_sizes': [(50,), (100,)],
    'estimator__alpha': [0.0001, 0.001]
}

# Configure GridSearchCV: one binary classifier per label, scored with
# micro-averaged F1 over the `kf` folds
mlp_clf = OneVsRestClassifier(mlp_model)
mlp_grid_search = GridSearchCV(
    mlp_clf,
    mlp_params,
    scoring=make_scorer(f1_score, average='micro', zero_division=0),
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

mlp_grid_search.fit(X_train, y_train)

# Evaluate the best configuration on the held-out test set
mlp_best_model = mlp_grid_search.best_estimator_
mlp_y_pred = mlp_best_model.predict(X_test)
mlp_report = classification_report(y_test, mlp_y_pred, target_names=class_names, zero_division=0)

print("MLP Classifier")
print(f"Mejores hiperparámetros: {mlp_grid_search.best_params_}")
print(mlp_report)

##### K-Nearest Neighbors

In [None]:
# Search space: number of neighbours.
# The 'estimator__' prefix addresses the model wrapped by OneVsRestClassifier.
knn_params = {
    'estimator__n_neighbors': [3, 5, 7]
}
knn_model = KNeighborsClassifier()

# One-vs-rest wrapper (one binary classifier per label) tuned by grid
# search, scored with micro-averaged F1 over the `kf` folds
knn_clf = OneVsRestClassifier(knn_model)
knn_scorer = make_scorer(f1_score, average='micro', zero_division=0)
knn_grid_search = GridSearchCV(
    knn_clf,
    knn_params,
    scoring=knn_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
knn_grid_search.fit(X_train, y_train)

# Score the winning configuration on the held-out test set
knn_best_model = knn_grid_search.best_estimator_
knn_y_pred = knn_best_model.predict(X_test)
knn_report = classification_report(
    y_test,
    knn_y_pred,
    target_names=class_names,
    zero_division=0,
)

print("K-Nearest Neighbors")
print(f"Mejores hiperparámetros: {knn_grid_search.best_params_}")
print(knn_report)


##### XGBoost

In [None]:
# Search space: number of trees, learning rate and tree depth.
# The 'estimator__' prefix addresses the model wrapped by OneVsRestClassifier.
xgb_params = {
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.1, 0.01],
    'estimator__max_depth': [3, 5]
}
xgb_model = xgb.XGBClassifier()

# One-vs-rest wrapper (one binary classifier per label) tuned by grid
# search, scored with micro-averaged F1 over the `kf` folds
xgb_clf = OneVsRestClassifier(xgb_model)
xgb_scorer = make_scorer(f1_score, average='micro', zero_division=0)
xgb_grid_search = GridSearchCV(
    xgb_clf,
    xgb_params,
    scoring=xgb_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)

# Score the winning configuration on the held-out test set
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)
xgb_report = classification_report(
    y_test,
    xgb_y_pred,
    target_names=class_names,
    zero_division=0,
)

print("XGBoost")
print(f"Mejores hiperparámetros: {xgb_grid_search.best_params_}")
print(xgb_report)


##### LightGBM

In [None]:
# Search space: number of trees, learning rate and tree depth.
# The 'estimator__' prefix addresses the model wrapped by OneVsRestClassifier.
lgb_params = {
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.1, 0.01],
    'estimator__max_depth': [3, 5]
}
lgb_model = lgb.LGBMClassifier()

# One-vs-rest wrapper (one binary classifier per label) tuned by grid
# search, scored with micro-averaged F1 over the `kf` folds
lgb_clf = OneVsRestClassifier(lgb_model)
lgb_scorer = make_scorer(f1_score, average='micro', zero_division=0)
lgb_grid_search = GridSearchCV(
    lgb_clf,
    lgb_params,
    scoring=lgb_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
lgb_grid_search.fit(X_train, y_train)

# Score the winning configuration on the held-out test set
lgb_best_model = lgb_grid_search.best_estimator_
lgb_y_pred = lgb_best_model.predict(X_test)
lgb_report = classification_report(
    y_test,
    lgb_y_pred,
    target_names=class_names,
    zero_division=0,
)

print("LightGBM")
print(f"Mejores hiperparámetros: {lgb_grid_search.best_params_}")
print(lgb_report)


##### CatBoost

In [None]:
# Search space: number of boosting iterations and learning rate.
# The 'estimator__' prefix addresses the model wrapped by OneVsRestClassifier.
cat_params = {
    'estimator__iterations': [100, 200],
    'estimator__learning_rate': [0.1, 0.01]
}
# verbose=0 silences CatBoost's own training log
cat_model = CatBoostClassifier(verbose=0)

# One-vs-rest wrapper (one binary classifier per label) tuned by grid
# search, scored with micro-averaged F1 over the `kf` folds
cat_clf = OneVsRestClassifier(cat_model)
cat_scorer = make_scorer(f1_score, average='micro', zero_division=0)
cat_grid_search = GridSearchCV(
    cat_clf,
    cat_params,
    scoring=cat_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
cat_grid_search.fit(X_train, y_train)

# Score the winning configuration on the held-out test set
cat_best_model = cat_grid_search.best_estimator_
cat_y_pred = cat_best_model.predict(X_test)
cat_report = classification_report(
    y_test,
    cat_y_pred,
    target_names=class_names,
    zero_division=0,
)

print("CatBoost")
print(f"Mejores hiperparámetros: {cat_grid_search.best_params_}")
print(cat_report)
