In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

In [2]:
X_train_english = pd.read_csv("./processed_embeddings/train_en.csv", sep=';')
y_val_english = pd.read_csv("./processed_embeddings/val_en.csv", sep=';')


X_train_spanish = pd.read_csv("./processed_embeddings/train_es.csv", sep=';')
y_val_spanish = pd.read_csv("./processed_embeddings/val_es.csv", sep=';')

In [3]:
def convert_string_to_list(string):
    try:
        # Primero eliminamos los corchetes y dividimos la cadena por espacios
        clean_str = string.strip('[]')
        # Luego dividimos por espacios y convertimos cada elemento en un float
        num_list = [float(item) for item in clean_str.split() if item]
        return num_list
    except:
        # En caso de un error, retornamos NaN para identificar fácilmente el problema
        return np.nan

In [4]:
# Aplicar esta función a cada elemento en la columna de embedding
X_train_english['lsa_embedding'] = X_train_english['lsa_embedding'].apply(convert_string_to_list)
y_val_english['lsa_embedding'] = y_val_english['lsa_embedding'].apply(convert_string_to_list)

# Aplicar esta función a cada elemento en la columna de embedding
X_train_english['fasttext_embeeding'] = X_train_english['fasttext_embeeding'].apply(convert_string_to_list)
y_val_english['fasttext_embeeding'] = y_val_english['fasttext_embeeding'].apply(convert_string_to_list)

# Aplicar esta función a cada elemento en la columna de embedding
X_train_english['roberta_embedding'] = X_train_english['roberta_embedding'].apply(convert_string_to_list)
y_val_english['roberta_embedding'] = y_val_english['roberta_embedding'].apply(convert_string_to_list)

In [5]:
# Aplicar esta función a cada elemento en la columna de embedding
X_train_spanish['lsa_embedding'] = X_train_spanish['lsa_embedding'].apply(convert_string_to_list)
y_val_spanish['lsa_embedding'] = y_val_spanish['lsa_embedding'].apply(convert_string_to_list)

# Aplicar esta función a cada elemento en la columna de embedding
X_train_spanish['fasttext_embeeding'] = X_train_spanish['fasttext_embeeding'].apply(convert_string_to_list)
y_val_spanish['fasttext_embeeding'] = y_val_spanish['fasttext_embeeding'].apply(convert_string_to_list)

# Aplicar esta función a cada elemento en la columna de embedding
X_train_spanish['roberta_embedding'] = X_train_spanish['roberta_embedding'].apply(convert_string_to_list)
y_val_spanish['roberta_embedding'] = y_val_spanish['roberta_embedding'].apply(convert_string_to_list)

In [6]:
X_train_en_lsa_embeddings = pd.DataFrame(X_train_english['lsa_embedding'].tolist())
X_train_en_fasttext_embeeding = pd.DataFrame(X_train_english['fasttext_embeeding'].tolist()) 
X_train_en_roberta_embedding = pd.DataFrame(X_train_english['roberta_embedding'].tolist()) 
y_train_en = X_train_english['label']

X_val_en_lsa_embeddings = pd.DataFrame(y_val_english['lsa_embedding'].tolist())
X_val_en_fasttext_embeeding = pd.DataFrame(y_val_english['fasttext_embeeding'].tolist()) 
X_val_en_roberta_embedding = pd.DataFrame(y_val_english['roberta_embedding'].tolist()) 
y_val_en = y_val_english['label']

In [7]:
X_train_es_lsa_embeddings = pd.DataFrame(X_train_spanish['lsa_embedding'].tolist())
X_train_es_fasttext_embeeding = pd.DataFrame(X_train_spanish['fasttext_embeeding'].tolist()) 
X_train_es_roberta_embedding = pd.DataFrame(X_train_spanish['roberta_embedding'].tolist()) 
y_train_es = X_train_spanish['label']

X_val_es_lsa_embeddings = pd.DataFrame(y_val_spanish['lsa_embedding'].tolist())
X_val_es_fasttext_embeeding = pd.DataFrame(y_val_spanish['fasttext_embeeding'].tolist()) 
X_val_es_roberta_embedding = pd.DataFrame(y_val_spanish['roberta_embedding'].tolist()) 
y_val_es = y_val_spanish['label']

In [8]:
def run_grid_search_and_print_mcc(model, X_train, y_train, X_test, y_test, param_grid, scoring, name, lang, cv=5):
    """
    Realiza una búsqueda de hiperparámetros usando GridSearchCV para cualquier modelo y 
    imprime el coeficiente de correlación de Matthews para el conjunto de prueba.

    Args:
    model (BaseEstimator): Modelo de scikit-learn para entrenar.
    X_train (array-like): Datos de entrenamiento características.
    y_train (array-like): Datos de entrenamiento etiquetas.
    X_test (array-like): Datos de prueba características.
    y_test (array-like): Datos de prueba etiquetas.
    param_grid (dict): Diccionario con parámetros para probar en el modelo.
    scoring (callable): Función de puntuación para evaluar los modelos.
    cv (int): Número de divisiones de validación cruzada.
    """
    # Crea un objeto GridSearchCV con el modelo y los parámetros dados
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    
    # Entrena GridSearchCV con los datos de entrenamiento
    grid_search.fit(X_train, y_train)
    
    # Utiliza el mejor modelo para hacer predicciones en el conjunto de prueba
    best_clf = grid_search.best_estimator_
    predicted = best_clf.predict(X_test)
    
    # Calcula el MCC usando las etiquetas verdaderas y las predicciones
    mcc_score = matthews_corrcoef(y_test, predicted)
    
    # Imprime el MCC
    print(f"Matthews Correlation Coefficient (MCC) for the embedding {name} in {lang}:", mcc_score)
    
    return grid_search.best_params_

### LOGISTIC REGRESSION

In [9]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignorar FutureWarning y ConvergenceWarning específicos de scikit-learn
warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')
warnings.filterwarnings('ignore', category=ConvergenceWarning, module='sklearn')

In [10]:
lang_es = "spanish"
lang_en = "english"

lsa = "LSA"
fastext = "Fasttext"
roberta = "Roberta"

In [11]:
# Define los grupos de parámetros válidos
param_grid = [
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'solver': ['saga'], 'penalty': ['none'], 'C': [1]}  # `C` es irrelevante para 'none' pero debe incluirse
]
scorer = make_scorer(matthews_corrcoef)

In [12]:
hiper_log_lsa = run_grid_search_and_print_mcc(LogisticRegression(), X_train_en_lsa_embeddings, 
                              y_train_en, X_val_en_lsa_embeddings, y_val_en, param_grid, scorer, lsa, lang_en)
hiper_log_fast = run_grid_search_and_print_mcc(LogisticRegression(), X_train_en_fasttext_embeeding, 
                              y_train_en, X_val_en_fasttext_embeeding, y_val_en, param_grid, scorer, fastext, lang_en)
hiper_log_robert = run_grid_search_and_print_mcc(LogisticRegression(), X_train_en_roberta_embedding, 
                              y_train_en, X_val_en_roberta_embedding, y_val_en, param_grid, scorer, roberta, lang_en)

Matthews Correlation Coefficient (MCC) for the embedding LSA in english: 0.6582930253968028
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in english: 0.669558107154059
Matthews Correlation Coefficient (MCC) for the embedding Roberta in english: 0.7175062291658645


In [13]:
hiper_log_lsa

{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

In [14]:
hiper_log_fast

{'C': 100, 'penalty': 'l2', 'solver': 'saga'}

In [15]:
hiper_log_robert

{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}

In [16]:
hiper_log_lsa_es = run_grid_search_and_print_mcc(LogisticRegression(), X_train_es_lsa_embeddings, 
                              y_train_es, X_val_es_lsa_embeddings, y_val_es, param_grid, scorer, lsa, lang_es)
hiper_log_fast_es = run_grid_search_and_print_mcc(LogisticRegression(), X_train_es_fasttext_embeeding, 
                              y_train_es, X_val_es_fasttext_embeeding, y_val_es, param_grid, scorer, fastext, lang_es)
hiper_log_robert_es = run_grid_search_and_print_mcc(LogisticRegression(), X_train_es_roberta_embedding, 
                              y_train_es, X_val_es_roberta_embedding, y_val_es, param_grid, scorer, roberta, lang_es)

Matthews Correlation Coefficient (MCC) for the embedding LSA in spanish: 0.5556823194048562
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in spanish: 0.4399942969754765
Matthews Correlation Coefficient (MCC) for the embedding Roberta in spanish: 0.49269777147660165


In [17]:
hiper_log_lsa_es

{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

In [18]:
hiper_log_fast_es

{'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

In [19]:
hiper_log_robert_es

{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}

### SVC

In [11]:
# Define los hiperparámetros que quieres probar
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Tipo de kernel
    'C': [0.1, 1, 10, 100],  # Parámetro de regularización
}

scorer = make_scorer(matthews_corrcoef,sample_weight=None)

In [12]:
hiper_svc_lsa = run_grid_search_and_print_mcc(svm.SVC(), X_train_en_lsa_embeddings, 
                              y_train_en, X_val_en_lsa_embeddings, y_val_en, param_grid, scorer, lsa, lang_en)
hiper_svc_fast = run_grid_search_and_print_mcc(svm.SVC(), X_train_en_fasttext_embeeding, 
                              y_train_en, X_val_en_fasttext_embeeding, y_val_en, param_grid, scorer, fastext, lang_en)
hiper_svc_robert = run_grid_search_and_print_mcc(svm.SVC(), X_train_en_roberta_embedding, 
                              y_train_en, X_val_en_roberta_embedding, y_val_en, param_grid, scorer, roberta, lang_en)

Matthews Correlation Coefficient (MCC) for the embedding LSA in english: 0.6525402876390591
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in english: 0.6667374826170468
Matthews Correlation Coefficient (MCC) for the embedding Roberta in english: 0.7146855973480196


In [13]:
hiper_svc_lsa

{'C': 10, 'kernel': 'linear'}

In [14]:
hiper_svc_fast

{'C': 10, 'kernel': 'rbf'}

In [15]:
hiper_svc_robert

{'C': 10, 'kernel': 'linear'}

In [16]:
hiper_svc_lsa_es = run_grid_search_and_print_mcc(svm.SVC(), X_train_es_lsa_embeddings, 
                              y_train_es, X_val_es_lsa_embeddings, y_val_es, param_grid, scorer, lsa, lang_es)
hiper_svc_fast_es = run_grid_search_and_print_mcc(svm.SVC(), X_train_es_fasttext_embeeding, 
                              y_train_es, X_val_es_fasttext_embeeding, y_val_es, param_grid, scorer, fastext, lang_es)
hiper_svc_robert_es = run_grid_search_and_print_mcc(svm.SVC(), X_train_es_roberta_embedding, 
                              y_train_es, X_val_es_roberta_embedding, y_val_es, param_grid, scorer, roberta, lang_es)

Matthews Correlation Coefficient (MCC) for the embedding LSA in spanish: 0.5528023059760292
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in spanish: 0.4520701518033282
Matthews Correlation Coefficient (MCC) for the embedding Roberta in spanish: 0.5188058664764762


In [17]:
hiper_svc_lsa_es

{'C': 1, 'kernel': 'rbf'}

In [18]:
hiper_svc_fast_es

{'C': 10, 'kernel': 'rbf'}

In [19]:
hiper_svc_robert_es

{'C': 100, 'kernel': 'poly'}

### DECISION TREES

In [20]:
# Definir el espacio de parámetros
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

scorer = make_scorer(matthews_corrcoef,sample_weight=None)

In [21]:
hiper_tree_lsa = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_en_lsa_embeddings, 
                              y_train_en, X_val_en_lsa_embeddings, y_val_en, param_grid, scorer, lsa, lang_en)
hiper_tree_fast = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_en_fasttext_embeeding, 
                              y_train_en, X_val_en_fasttext_embeeding, y_val_en, param_grid, scorer, fastext, lang_en)
hiper_tree_robert = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_en_roberta_embedding, 
                              y_train_en, X_val_en_roberta_embedding, y_val_en, param_grid, scorer, roberta, lang_en)

Matthews Correlation Coefficient (MCC) for the embedding LSA in english: 0.37208900678431023
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in english: 0.35858640814772186
Matthews Correlation Coefficient (MCC) for the embedding Roberta in english: 0.38178136281691333


In [22]:
hiper_tree_lsa

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [23]:
hiper_tree_fast

{'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 20}

In [24]:
hiper_tree_robert

{'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 20}

In [25]:
hiper_tree_lsa_es = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_es_lsa_embeddings, 
                              y_train_es, X_val_es_lsa_embeddings, y_val_es, param_grid, scorer, lsa, lang_es)
hiper_tree_fast_es = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_es_fasttext_embeeding, 
                              y_train_es, X_val_es_fasttext_embeeding, y_val_es, param_grid, scorer, fastext, lang_es)
hiper_tree_robert_es = run_grid_search_and_print_mcc(DecisionTreeClassifier(), X_train_es_roberta_embedding, 
                              y_train_es, X_val_es_roberta_embedding, y_val_es, param_grid, scorer, roberta, lang_es)

Matthews Correlation Coefficient (MCC) for the embedding LSA in spanish: 0.2430080172058709
Matthews Correlation Coefficient (MCC) for the embedding Fasttext in spanish: 0.1376497024564274
Matthews Correlation Coefficient (MCC) for the embedding Roberta in spanish: 0.31627695149055063


In [26]:
hiper_tree_lsa_es

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 10}

In [27]:
hiper_tree_fast_es

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 20}

In [28]:
hiper_tree_robert_es

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 2}

### STACKING

In [29]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_en_lsa_embeddings, y_train_en)
y_pred = clf.predict(X_val_en_lsa_embeddings)
mcc = matthews_corrcoef(y_val_en, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:", mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.6809691909977089


In [30]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp

In [31]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_en_fasttext_embeeding, y_train_en)
y_pred = clf.predict(X_val_en_fasttext_embeeding)
mcc = matthews_corrcoef(y_val_en, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:", mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.6611548400972423


In [32]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp

In [33]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_en_roberta_embedding, y_train_en)
y_pred = clf.predict(X_val_en_roberta_embedding)
mcc = matthews_corrcoef(y_val_en, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:", mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.7231480544883964


In [34]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp

In [35]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_es_lsa_embeddings, y_train_es)
y_pred = clf.predict(X_val_es_lsa_embeddings)
mcc = matthews_corrcoef(y_val_es, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:",mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.5419839227229034


In [36]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp

In [37]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_es_fasttext_embeeding, y_train_es)
y_pred = clf.predict(X_val_es_fasttext_embeeding)
mcc = matthews_corrcoef(y_val_es, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:", mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.46920012563829894


In [38]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp

In [39]:
# Definir los clasificadores base
estimators = [ ('svc', make_pipeline(StandardScaler(), SVC(probability=True))), 
              ('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()) ]

# Definir el meta-clasificador
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5 )

clf.fit(X_train_es_roberta_embedding, y_train_es)
y_pred = clf.predict(X_val_es_roberta_embedding)
mcc = matthews_corrcoef(y_val_es, y_pred)
print(f"Matthews Correlation Coefficient (MCC) of Stacking Classifier:", mcc)

Matthews Correlation Coefficient (MCC) of Stacking Classifier: 0.5158144128839315


In [40]:
# Obtener los hiperparámetros de los estimadores base
for name, estimator in clf.named_estimators_.items():
    print(f"Hiperparámetros del estimador base '{name}':")
    print(estimator.get_params())

# Obtener los hiperparámetros del estimador final
print("Hiperparámetros del estimador final:")
print(clf.final_estimator_.get_params())

Hiperparámetros del estimador base 'svc':
{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC(probability=True))], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(probability=True), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': True, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
Hiperparámetros del estimador base 'lr':
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Hiperp