# Lluvia con classifier

In [1]:
# Descargar dataset
!wget -q https://raw.githubusercontent.com/davidlealo/tallermodelossupervisados/refs/heads/main/data/Lluvia_full.csv

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# 1. Load and prepare the data
df = pd.read_csv('Lluvia_full.csv')

# Encode the target as binary: 'No' -> 0, 'Yes' -> 1.
# Series.map turns any value missing from the dict into NaN; those rows are
# removed by the dropna below.
df['LluviaMan'] = df['LluviaMan'].map({
    'No': 0,
    'Yes': 1,
})

# Predictor columns fed to the models
features = ['Hum3pm', 'Sol', 'Nub3pm', 'Nub9am', 'Hum9am', 'Pres9am']

# Drop every row with a missing value in a predictor or in the target, then
# keep only those columns.
df_clean = df.dropna(subset=features + ['LluviaMan'])[features + ['LluviaMan']]

X = df_clean.loc[:, features]
y = df_clean.loc[:, 'LluviaMan']

# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
# 2. Build the individual base models
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=100, random_state=1)
clf3 = KNeighborsClassifier(n_neighbors=5)
# probability=True enables predict_proba (required by soft voting). Per the
# scikit-learn docs those probability estimates rely on shuffled data, and
# random_state controls that shuffling, so it is fixed here like the other
# seeded models to keep results reproducible between runs.
clf4 = SVC(kernel='poly', C=1, gamma=0.1, probability=True, random_state=1)


In [4]:
# 3. VotingClassifier - hard voting
# The ensemble combines the four base models by voting on predicted labels.
voting_hard = VotingClassifier(
    voting='hard',
    estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3), ('svc', clf4)],
)

# Fit on the scaled training split, predict the scaled test split, and report
# the test accuracy rounded to three decimals.
y_pred_hard = voting_hard.fit(X_train_scaled, y_train).predict(X_test_scaled)
acc_hard = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_hard)
print("VotingClassifier (hard) accuracy:", round(acc_hard, 3))



VotingClassifier (hard) accuracy: 0.844


In [5]:
# 4. VotingClassifier - soft voting
# Same four base models as the hard-voting ensemble, combined through their
# predicted class probabilities instead of their predicted labels.
voting_soft = VotingClassifier(
    voting='soft',
    estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3), ('svc', clf4)],
)

# Fit on the scaled training split, predict the scaled test split, and report
# the test accuracy rounded to three decimals.
y_pred_soft = voting_soft.fit(X_train_scaled, y_train).predict(X_test_scaled)
acc_soft = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_soft)
print("VotingClassifier (soft) accuracy:", round(acc_soft, 3))



VotingClassifier (soft) accuracy: 0.846


In [6]:
# 5. Cross-validation
# Every model is wrapped in a pipeline with its own StandardScaler, so the
# scaling statistics are re-estimated from the training portion of each fold.
# Feeding cross_val_score a matrix that was pre-scaled with statistics taken
# from rows that later serve as validation data leaks information into the
# folds and biases the reported scores.
print("\n--- Validación cruzada (5-fold) ---")
for name, model in [
    ('LogisticRegression', clf1),
    ('RandomForest', clf2),
    ('KNN', clf3),
    ('SVC', clf4),
    ('VotingHard', voting_hard),
    ('VotingSoft', voting_soft)
]:
    # cross_val_score clones the pipeline for each fold, so the already-fitted
    # voting ensembles above are not refitted or modified in place.
    scaled_model = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(scaled_model, X, y, cv=5)
    print(f"{name}: accuracy promedio = {np.mean(scores):.3f} (+/- {np.std(scores):.3f})")


--- Validación cruzada (5-fold) ---
LogisticRegression: accuracy promedio = 0.840 (+/- 0.005)
RandomForest: accuracy promedio = 0.839 (+/- 0.005)
KNN: accuracy promedio = 0.824 (+/- 0.005)
SVC: accuracy promedio = 0.839 (+/- 0.006)
VotingHard: accuracy promedio = 0.841 (+/- 0.006)
VotingSoft: accuracy promedio = 0.843 (+/- 0.006)


> Siguiente paso: optimizar los hiperparámetros de RandomForest y darle más peso dentro del VotingClassifier (parámetro `weights`) para intentar mejorar el accuracy.