In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Cargar el conjunto de datos
data = pd.read_csv("data/Lluvia_full.csv")

# Codificar la variable LluviaMan como 0 y 1
data['LluviaMan'] = data['LluviaMan'].map({'No': 0, 'Sí': 1})

# Separar los datos en conjuntos de entrenamiento y validación (70:30)
X = data.drop('LluviaMan', axis=1)
y = data['LluviaMan']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=2023)

# Seleccionar variables numéricas y categóricas
numeric_vars = ["MinTemp", "MaxTemp", "Lluvia", "Evaporacion", "Sol", "VelRafaga", "Vel9am", "Vel3pm",
                "Hum9am", "Hum3pm", "Pres9am", "Pre3pm", "Nub9am", "Nub3pm", "Temp9am", "Temp3pm"]
categorical_vars = ["Koppen", "Estacion"]

# Escalar las variables numéricas
scaler = StandardScaler()
X_train[numeric_vars] = scaler.fit_transform(X_train[numeric_vars])
X_test[numeric_vars] = scaler.transform(X_test[numeric_vars])

# Transformar variables categóricas en one-hot encoding
encoder = OneHotEncoder(sparse=False, drop='first')
X_train_encoded = encoder.fit_transform(X_train[categorical_vars])
X_test_encoded = encoder.transform(X_test[categorical_vars])

# Concatenar las variables codificadas con las numéricas
X_train_final = pd.concat([X_train.drop(categorical_vars, axis=1), pd.DataFrame(X_train_encoded)], axis=1)
X_test_final = pd.concat([X_test.drop(categorical_vars, axis=1), pd.DataFrame(X_test_encoded)], axis=1)

# Parte 2: Ajuste de modelos y optimización de hiperparámetros

# Ajustar el modelo SVM
svm_params = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear']}
svm_model = GridSearchCV(SVC(probability=True), param_grid=svm_params, scoring='roc_auc', cv=5)
svm_model.fit(X_train_final, y_train)

# Ajustar el modelo Árbol de Clasificación
tree_params = {'min_samples_split': [2, 5, 10], 'ccp_alpha': [0.01, 0.1, 0.2], 'max_depth': [5, 10, 15]}
tree_model = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_params, scoring='roc_auc', cv=5)
tree_model.fit(X_train_final, y_train)

# Ajustar el modelo Random Forest
rf_params = {'n_estimators': [100, 200, 300], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_split': [2, 5, 10]}
rf_model = GridSearchCV(RandomForestClassifier(), param_grid=rf_params, scoring='roc_auc', cv=5)
rf_model.fit(X_train_final, y_train)

# Parte 3: Comparación de modelos utilizando el conjunto de prueba

# Realizar predicciones con cada modelo
svm_pred = svm_model.predict(X_test_final)
tree_pred = tree_model.predict(X_test_final)
rf_pred = rf_model.predict(X_test_final)

# Calcular F1-score para cada modelo
svm_f1 = f1_score(y_test, svm_pred)
tree_f1 = f1_score(y_test, tree_pred)
rf_f1 = f1_score(y_test, rf_pred)

# Imprimir los F1-scores de cada modelo
print("F1-score para SVM:", svm_f1)
print("F1-score para Árbol de Clasificación:", tree_f1)
print("F1-score para Random Forest:", rf_f1)


ValueError: Input y contains NaN.