<a href="https://colab.research.google.com/github/claudialeguiza/AA1-TUIA-Kidonakis-Leguiza/blob/navegador/generar_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import keras
from keras.models import Sequential, save_model, load_model
from keras.layers import Dense, Activation, Dropout
import joblib
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the raw weather observations into a DataFrame.
# NOTE(review): '/content/...' is an absolute, Colab-style path — adjust it when
# running the notebook in another environment.
datos = pd.read_csv('/content/weatherAUS.csv', sep=",")

In [3]:
# Keep only the weather stations used in this study.
# `.copy()` makes `df` an independent frame: the preparation functions below assign
# columns on it in place, and doing that on a boolean-mask slice of `datos` is a
# chained assignment (it can raise SettingWithCopyWarning, which this notebook silences).
df = datos[
    datos.Location.isin((
        'Sydney', 'SydneyAirport', 'Melbourne', 'MelbourneAirport',
        'Canberra', 'Adelaide', 'MountGambier', 'Cobar', 'Dartmoor',
    ))
].copy()

In [4]:
def _rellenar_con_moda(serie):
    """Fill the missing values of one group with that group's mode.

    Returns the series unchanged when it has no non-null value (empty mode),
    instead of failing on ``mode().iloc[0]``.
    """
    moda = serie.mode()
    if moda.empty:
        return serie
    return serie.fillna(moda.iloc[0])


def preprocesamiento(data):
    """Impute missing values day by day and parse ``Date`` as datetime.

    Rows are grouped by ``Date``. Categorical columns (``RainToday``,
    ``RainTomorrow`` and the three wind-direction columns) are filled with the
    mode of their day; the numeric columns listed in ``columnas_con_nulos`` are
    filled with the mean of their day. Values whose whole day is missing stay
    missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Date`` plus every column named in this function.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: columns are assigned in place, and
        other cells of this notebook rely on that.
    """
    # Prints the frame summary to stdout (kept: it produces the cell's output).
    data.info()

    # Numeric columns imputed with the per-day mean.
    columnas_con_nulos = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                          'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                          'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
                          'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainfallTomorrow']

    # Categorical columns imputed with the per-day mode. All five now share the
    # same guard, so a day where a column is entirely missing no longer raises
    # IndexError for RainToday / RainTomorrow.
    columnas_categoricas = ['RainToday', 'RainTomorrow',
                            'WindGustDir', 'WindDir9am', 'WindDir3pm']
    for columna in columnas_categoricas:
        data[columna] = data.groupby('Date')[columna].transform(_rellenar_con_moda)

    # fillna aligns on index and column labels, so only the gaps are replaced.
    media_por_dia = data.groupby('Date')[columnas_con_nulos].transform('mean')
    data[columnas_con_nulos] = data[columnas_con_nulos].fillna(media_por_dia)

    data['Date'] = pd.to_datetime(data['Date'])

    return data


In [5]:
def crear_columna_season(data):
    """Add a ``season`` column computed from each row's ``Date``.

    The column is written onto ``data`` itself and the same frame is returned.
    Each ``Date`` value is passed to ``asignar_estacion``, which reads its
    ``.month`` attribute.
    """
    estaciones = data['Date'].apply(asignar_estacion)
    data['season'] = estaciones
    return data

In [6]:
def asignar_estacion(fecha):
    """Return the season name for a date-like object.

    Only ``fecha.month`` is read. Months are mapped as December-February to
    'Summer', March-May to 'Autumn', June-August to 'Winter', and anything
    else to 'Spring'.
    """
    mes = fecha.month
    # Checked in order; the first group containing the month wins.
    estaciones = (
        ((12, 1, 2), 'Summer'),
        ((3, 4, 5), 'Autumn'),
        ((6, 7, 8), 'Winter'),
    )
    for meses, nombre in estaciones:
        if mes in meses:
            return nombre
    # September, October, November (and any value not matched above).
    return 'Spring'

In [15]:
def codificar_variables(data):
    """One-hot encode the categorical columns and return a new frame.

    ``RainToday``, ``RainTomorrow``, ``season`` and ``Location`` are encoded
    with ``pd.get_dummies(..., drop_first=True)``. The three wind-direction
    columns are expanded by hand into 0/1 indicator columns, one per entry of
    a fixed list of 15 directions, and the original wind columns are dropped.
    A row whose direction is missing or not in the list gets 0 in every
    indicator of that column.

    NOTE(review): 'NNW' is absent from the list — presumably it acts as the
    reference category, mirroring ``drop_first``; confirm.
    """
    codificado = pd.get_dummies(
        data,
        columns=['RainToday', 'RainTomorrow', 'season', 'Location'],
        drop_first=True,
    )

    columnas_viento = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
    direcciones = ["SW", "S", 'SSW', 'W', 'SSE', 'E', 'SE', 'NE', 'NNE',
                   'WSW', 'WNW', 'NW', 'N', 'ESE', 'ENE']

    # Direction is the outer loop so the new columns come out grouped by
    # direction (gust, 9am, 3pm for 'SW', then for 'S', ...).
    for direccion in direcciones:
        for columna in columnas_viento:
            codificado[f'{columna}_{direccion}'] = (data[columna] == direccion).astype(int)

    return codificado.drop(columns=columnas_viento)

In [8]:
def estandarizar_df(data, scaler=None):
    """Scale ``data`` and return the scaled values.

    Parameters
    ----------
    data : DataFrame or array-like
        Values to scale; passed straight to the scaler.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied with
        ``scaler.transform(data)`` and nothing is re-fitted, so held-out data
        can be scaled with the statistics learned on the training data.
        When ``None`` (the default, and the original behaviour) a fresh
        ``RobustScaler`` is fitted on ``data`` itself.

    Returns
    -------
    Whatever the scaler returns for ``data``.
    """
    if scaler is None:
        # Original path: fit and apply in one go; the fitted scaler is not kept.
        return RobustScaler().fit_transform(data)
    return scaler.transform(data)

In [27]:
def truncar_dividir_df(data, fecha_inicio='2009-01-01', n_train=21658):
    """Sort by date, drop rows before ``fecha_inicio`` and split by position.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date`` column comparable to ``fecha_inicio``.
    fecha_inicio : str, optional
        Rows with ``Date`` earlier than this are discarded. Defaults to the
        cutoff previously hard-coded here.
    n_train : int, optional
        Number of leading rows (after sorting and filtering) that go to the
        training set; the remainder is the test set. Defaults to the row
        count previously hard-coded here.

    Returns
    -------
    (data_train, data_test) : tuple of pandas.DataFrame
        Both share one 0-based index that was reset after filtering, so the
        test frame's index starts at ``n_train``.
    """
    ordenado = data.sort_values(["Date"])
    # reset_index returns a new frame; the previous inplace=True call mutated a
    # filtered slice, which is a chained-assignment pattern.
    filtrado = ordenado[ordenado['Date'] >= fecha_inicio].reset_index(drop=True)

    data_train = filtrado.iloc[:n_train]
    data_test = filtrado.iloc[n_train:]

    return data_train, data_test

In [24]:
def estandarizar_regresion(data):
    """Build the scaled feature matrix and scaled target for the regression task.

    Features are every column of ``data`` except ``RainfallTomorrow`` and
    ``Date``; the target is ``RainfallTomorrow`` as a single-column frame.
    Each is passed separately through ``estandarizar_df``.

    Returns
    -------
    (X_scaled, y_scaled) : the two scaled results, features first.
    """
    predictores = data.drop(columns=['RainfallTomorrow', 'Date'])
    predictores_escalados = estandarizar_df(predictores)

    # The target is reshaped to one column before being wrapped in a frame.
    objetivo = pd.DataFrame(
        data['RainfallTomorrow'].values.reshape(-1, 1),
        columns=['RainfallTomorrow'],
    )
    return predictores_escalados, estandarizar_df(objetivo)

In [25]:
def estandarizar_balancear_clasificacion(data):
    """Scale the classification features and oversample them with SMOTE.

    Features are every column of ``data`` except ``RainTomorrow_Yes``,
    ``RainfallTomorrow`` and ``Date``; the label is ``RainTomorrow_Yes``.
    The features go through ``estandarizar_df`` and the scaled features and
    label are then resampled with ``SMOTE(random_state=42)``.

    Returns
    -------
    (X, y) : the resampled scaled features and the resampled label.
    """
    predictores = data.drop(columns=['RainTomorrow_Yes', "RainfallTomorrow", "Date"])
    predictores_escalados = estandarizar_df(predictores)
    etiqueta = data['RainTomorrow_Yes']

    sobremuestreo = SMOTE(random_state=42)
    X_balanceado, y_balanceado = sobremuestreo.fit_resample(predictores_escalados, etiqueta)
    return X_balanceado, y_balanceado

In [30]:
# Data-preparation pipeline: imputation, season column, then one-hot encoding.
# Each step wraps one of the plain functions defined above.
pipeline_prepara_datos = Pipeline([
    ('preproceso', FunctionTransformer(preprocesamiento, validate=False)),
    ('season', FunctionTransformer(crear_columna_season, validate=False)),
    ('codificar', FunctionTransformer(codificar_variables, validate=False)),
])

# Run the preparation on the filtered observations.
# The last step returns a DataFrame, so the result already carries its column
# labels. The former re-wrap (pd.DataFrame(..., columns=<encoder re-run on df>))
# was redundant and only worked because the earlier steps had mutated `df` in place.
df_procesado = pipeline_prepara_datos.fit_transform(df)
assert isinstance(df_procesado, pd.DataFrame), "the preparation pipeline is expected to return a DataFrame"

# Split into training and test sets.
data_train, data_test = truncar_dividir_df(df_procesado)

# Scale features and target for the regression task (no model is fitted in this cell).
X_train_scaled, y_train_scaled = estandarizar_regresion(data_train)

# TODO: persist the regression pipeline once it is defined, e.g.
# joblib.dump(<regression pipeline>, 'models/regresion_pipeline.joblib')

<class 'pandas.core.frame.DataFrame'>
Index: 28233 entries, 6047 to 102519
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        28233 non-null  int64         
 1   Date              28233 non-null  datetime64[ns]
 2   Location          28233 non-null  object        
 3   MinTemp           28233 non-null  float64       
 4   MaxTemp           28233 non-null  float64       
 5   Rainfall          28233 non-null  float64       
 6   Evaporation       28233 non-null  float64       
 7   Sunshine          28233 non-null  float64       
 8   WindGustDir       28229 non-null  object        
 9   WindGustSpeed     28231 non-null  float64       
 10  WindDir9am        28231 non-null  object        
 11  WindDir3pm        28233 non-null  object        
 12  WindSpeed9am      28233 non-null  float64       
 13  WindSpeed3pm      28233 non-null  float64       
 14  Humidity9am       28233

In [None]:
# Classification pipeline to persist: robust scaling followed by the tuned classifier.
# Pipeline steps must be (name, estimator) pairs. The former 'imputer' entry was a
# 4-tuple holding the *results* of calling preprocesamiento / crear_columna_season /
# codificar_variables on `df`: not a valid step, and it re-ran the preparation eagerly
# while the list was being built. Data preparation is `pipeline_prepara_datos`' job,
# so only estimator steps are kept here.
# NOTE(review): `classification_model_best_params`, `X_smote_train` and `y_smote_train`
# are not defined in the visible cells — confirm they exist before running this cell.
# NOTE(review): if `X_smote_train` comes from estandarizar_balancear_clasificacion it is
# already scaled, and the 'scaler' step would scale it a second time — confirm intent.
pipe_clasificacion = Pipeline([
    ('scaler', RobustScaler()),
    ('model', classification_model_best_params),
])

pipe_clasificacion.fit(X_smote_train, y_smote_train)

# NOTE(review): assumes the 'models/' directory already exists.
joblib.dump(pipe_clasificacion, 'models/clasificacion_pipeline.joblib')