<a href="https://colab.research.google.com/github/claudialeguiza/AA1-TUIA-Kidonakis-Leguiza/blob/navegador/generar_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import os
import warnings

import joblib
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from keras.layers import Dense, Activation, Dropout
from keras.metrics import Precision
from keras.models import Sequential, save_model, load_model
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, FunctionTransformer
warnings.simplefilter('ignore')

In [52]:
# Load the raw weather observations from the CSV expected under /content.
datos = pd.read_csv('/content/weatherAUS.csv', sep=",")

In [53]:
# Keep only the stations used for modelling. The explicit .copy() makes `df`
# an independent frame, so the column assignments performed later by the
# preprocessing functions do not operate on a slice of `datos`.
localidades = ('Sydney', 'SydneyAirport', 'Melbourne', 'MelbourneAirport',
               'Canberra', 'Adelaide', 'MountGambier', 'Cobar', 'Dartmoor')
df = datos[datos['Location'].isin(localidades)].copy()

In [54]:
def _rellenar_moda_por_fecha(data, columna):
    """Fill the gaps of ``columna`` with the most frequent value seen on the same ``Date``."""
    def _rellenar(grupo):
        moda = grupo.mode()
        # A date with no observed value has no mode: leave that group as is
        # instead of indexing into an empty result.
        return grupo if moda.empty else grupo.fillna(moda.iloc[0])

    return data.groupby('Date')[columna].transform(_rellenar)


def preprocesamiento(data):
    """Impute missing values per date and convert ``Date`` to datetime.

    Categorical columns (rain flags and wind directions) are filled with the
    mode of the rows sharing the same ``Date``; the numeric columns listed in
    ``columnas_con_nulos`` are filled with the mean of the rows sharing the
    same ``Date``. Values stay missing when no row of that date has data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Date`` plus the columns named below.

    Returns
    -------
    pandas.DataFrame
        A new, imputed frame; the input frame is left untouched.
    """
    # Work on a copy so repeated calls on the same input give the same result.
    data = data.copy()

    # Diagnostic summary, printed on every call.
    data.info()

    # Numeric columns that may contain missing values.
    columnas_con_nulos = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                          'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                          'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
                          'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainfallTomorrow']

    # Categorical columns: rain flags first, then wind directions.
    columnas_categoricas = ['RainToday', 'RainTomorrow',
                            'WindGustDir', 'WindDir9am', 'WindDir3pm']
    for columna in columnas_categoricas:
        data[columna] = _rellenar_moda_por_fecha(data, columna)

    # Numeric columns: fill with the per-date mean.
    media_por_dia = data.groupby('Date')[columnas_con_nulos].transform('mean')
    data[columnas_con_nulos] = data[columnas_con_nulos].fillna(media_por_dia)

    data['Date'] = pd.to_datetime(data['Date'])

    return data


In [55]:
def crear_columna_season(data):
    """Add a ``season`` column computed from each row's ``Date``.

    The column is written onto ``data`` itself, and the same frame is returned.
    """
    estaciones = data['Date'].map(asignar_estacion)
    data['season'] = estaciones
    return data

In [56]:
def asignar_estacion(fecha):
    """Return the season name for ``fecha`` based on its ``month`` attribute.

    December-February map to 'Summer', March-May to 'Autumn', June-August to
    'Winter'; any other month value yields 'Spring'.
    """
    estacion_por_mes = {
        12: 'Summer', 1: 'Summer', 2: 'Summer',
        3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
        6: 'Winter', 7: 'Winter', 8: 'Winter',
    }
    # Months not listed above (September-November) fall through to spring.
    return estacion_por_mes.get(fecha.month, 'Spring')

In [57]:
def codificar_variables(data):
    """One-hot encode the categorical columns of ``data``.

    ``RainToday``, ``RainTomorrow``, ``season`` and ``Location`` go through
    ``pd.get_dummies`` with ``drop_first=True``. The three wind-direction
    columns are expanded by hand into 0/1 indicator columns for a fixed list
    of directions, after which the original wind columns are dropped.
    """
    categoricas = ['RainToday', 'RainTomorrow', 'season', 'Location']
    columnas_viento = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
    rumbos = ["SW", "S", 'SSW', 'W', 'SSE', 'E', 'SE', 'NE', 'NNE',
              'WSW', 'WNW', 'NW', 'N', 'ESE', 'ENE']

    codificado = pd.get_dummies(data, columns=categoricas, drop_first=True)

    # One indicator per (direction, wind column) pair, direction-major so the
    # resulting column order is gust / 9am / 3pm for each direction in turn.
    indicadores = {
        f'{columna}_{rumbo}': (data[columna] == rumbo).astype(int)
        for rumbo in rumbos
        for columna in columnas_viento
    }

    return codificado.assign(**indicadores).drop(columns=columnas_viento)

In [58]:
def estandarizar_df(data):
  """Scale ``data`` with a freshly fitted ``RobustScaler`` and return the result.

  The scaler is created locally and is not returned.

  NOTE: a later cell defines another function with this same name; once that
  cell has run, this definition is no longer reachable by name.
  """
  return RobustScaler().fit_transform(data)

In [59]:
def truncar_dividir_df(data, fecha_especifica='2009-01-01', n_train=21658):
    """Return the training portion of ``data``.

    Rows are sorted by ``Date``, rows dated before ``fecha_especifica`` are
    discarded, the index is renumbered from 0 and the first ``n_train`` rows
    are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``Date`` column.
    fecha_especifica : str or datetime-like, default '2009-01-01'
        Earliest date (inclusive) to keep.
    n_train : int or None, default 21658
        Number of leading rows to return; ``None`` keeps every remaining row.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0-based index. The input frame is not
        modified.
    """
    ordenado = data.sort_values(["Date"])
    desde_fecha = ordenado[ordenado['Date'] >= fecha_especifica]

    # reset_index returns a new frame, so nothing is mutated in place.
    return desde_fecha.reset_index(drop=True).iloc[:n_train]

In [60]:
def estandarizar_df(data):
    """Build the scaled feature matrix and scaled target for the regression task.

    The features are every column except ``RainfallTomorrow``,
    ``RainTomorrow_Yes`` and ``Date``; the target is ``RainfallTomorrow``.
    Features and target are each scaled with their own freshly fitted
    ``RobustScaler``. The scalers are local and are not returned.

    NOTE: this definition reuses the name of the generic scaling helper
    defined in an earlier cell and therefore replaces it. The scaler is used
    directly here so that the function never calls itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded frame containing the three columns named above.

    Returns
    -------
    tuple
        ``(X_scaled, y_scaled)``; ``y_scaled`` has a single column.
    """
    # Split predictors from the regression target.
    X_regresion = data.drop(columns=['RainfallTomorrow', 'RainTomorrow_Yes', 'Date'])
    y_regresion = data[['RainfallTomorrow']]

    X_scaled = RobustScaler().fit_transform(X_regresion)
    y_scaled = RobustScaler().fit_transform(y_regresion)
    return X_scaled, y_scaled

In [61]:
def estandarizar_balancear_clas(data):
    """Build scaled, class-balanced training data for the classification task.

    The label is ``RainTomorrow_Yes``. The label itself, ``RainfallTomorrow``
    (next-day rainfall) and ``Date`` are excluded from the features, the
    same columns the regression split excludes, so the target does not leak
    into the inputs. Features are scaled with a freshly fitted
    ``RobustScaler`` and the classes are then balanced with SMOTE.

    The scaler is used directly rather than through ``estandarizar_df``,
    because that name is rebound by a later definition that returns a
    ``(X, y)`` pair for regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded frame containing the three columns named above.

    Returns
    -------
    tuple
        ``(X_smote_scaled, y_smote_scaled)`` as returned by
        ``SMOTE.fit_resample``.
    """
    X_clasificacion = data.drop(columns=['RainTomorrow_Yes', 'RainfallTomorrow', 'Date'])
    # Cast to int so the label is numeric even when the dummy column is boolean.
    y_clasificacion = data['RainTomorrow_Yes'].astype(int)

    X_scaled1 = RobustScaler().fit_transform(X_clasificacion)

    # NOTE(review): SMOTE presumably rejects NaN inputs; confirm that no
    # missing values survive the preprocessing step.
    smote = SMOTE(random_state=42)
    X_smote_scaled, y_smote_scaled = smote.fit_resample(X_scaled1, y_clasificacion)

    return X_smote_scaled, y_smote_scaled

In [62]:
# Definir la arquitectura de la red neuronal para regresión
def create_regression_model(input_shape):
    """Return an uncompiled ``Sequential`` network for regression.

    Three ReLU ``Dense`` layers (126, 64 and 32 units), each followed by a
    ``Dropout(0.5)`` layer for regularisation, and a single linear output unit.
    ``input_shape`` is passed as ``input_dim`` of the first layer.
    """
    capas = [
        Dense(126, input_dim=input_shape, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1),  # linear output for regression
    ]
    return Sequential(capas)

In [63]:
def regression_model(X_train_scaled, y_train_scaled, epochs=100, batch_size=32):
  """Build, compile and train the regression network, then return it.

  Parameters
  ----------
  X_train_scaled : array-like with a ``shape`` attribute
      Training features; ``shape[1]`` sets the network's input size.
  y_train_scaled : array-like
      Training target.
  epochs : int, default 100
      Number of training epochs.
  batch_size : int, default 32
      Mini-batch size.

  Returns
  -------
  The trained model, so the caller can predict with it or persist it.
  """
  # Build the network sized to the number of input features.
  modelo = create_regression_model(X_train_scaled.shape[1])

  # Mean squared error as loss, mean absolute error reported as a metric.
  modelo.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

  modelo.fit(X_train_scaled, y_train_scaled, epochs=epochs, batch_size=batch_size)
  return modelo

In [64]:
# Definir la arquitectura de la red neuronal para clasificación
def create_classification_model(input_shape):
    """Return an uncompiled ``Sequential`` network for binary classification.

    Three ReLU ``Dense`` layers (126, 64 and 32 units), each followed by a
    ``Dropout(0.5)`` layer for regularisation, and a single sigmoid output
    unit. ``input_shape`` is passed as ``input_dim`` of the first layer.
    """
    capas = [
        Dense(126, input_dim=input_shape, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),  # sigmoid output for the binary label
    ]
    return Sequential(capas)


In [65]:
def clasification_model(X_smote_train, y_smote_train, epochs=100, batch_size=32):
  """Build, compile and train the classification network, then return it.

  Parameters
  ----------
  X_smote_train : array-like with a ``shape`` attribute
      Training features; ``shape[1]`` sets the network's input size.
  y_smote_train : array-like
      Binary training labels.
  epochs : int, default 100
      Number of training epochs.
  batch_size : int, default 32
      Mini-batch size.

  Returns
  -------
  The trained model, so the caller can predict with it or persist it.
  """
  # Build the network sized to the number of input features.
  modelo = create_classification_model(X_smote_train.shape[1])

  # Binary cross-entropy as loss, precision reported as a metric.
  modelo.compile(optimizer=Adam(learning_rate=0.001), loss= 'binary_crossentropy', metrics= ['Precision'])

  modelo.fit(X_smote_train, y_smote_train, epochs=epochs, batch_size=batch_size)

  return modelo

In [None]:
# Data preparation: impute, add the season column, one-hot encode.
pipeline_prepara_datos = Pipeline([
    ('preproceso', FunctionTransformer(preprocesamiento, validate=False)),
    ('season', FunctionTransformer(crear_columna_season, validate=False)),
    ('codificar', FunctionTransformer(codificar_variables, validate=False))
])

df_procesado = pipeline_prepara_datos.fit_transform(df)

# Training split followed by scaling of features and target.
pipeline_train_split = Pipeline([
    ('split', FunctionTransformer(truncar_dividir_df, validate=False)),
    ('estandarizar', FunctionTransformer(estandarizar_df, validate=False)),
    ])

X_train_scaled, y_train_scaled = pipeline_train_split.fit_transform(df_procesado)

# Train the network directly instead of through Pipeline/FunctionTransformer:
# Pipeline.fit only accepts fit parameters in `step__param` form (so
# `epochs=...` is rejected), and FunctionTransformer.fit does not call the
# wrapped function, so no model would be trained or stored that way.
modelo_regresion = create_regression_model(X_train_scaled.shape[1])
modelo_regresion.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
modelo_regresion.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=32)

# Persist the trained model; create the target directory if it is missing.
# NOTE(review): this pickles the Keras model through joblib; confirm the
# installed Keras version supports pickling, otherwise use save_model with a
# `.keras` path. The scalers fitted during the split are not persisted.
os.makedirs('models', exist_ok=True)
joblib.dump(modelo_regresion, 'models/regresion_pipeline.joblib')

In [None]:
# Training split followed by scaling and SMOTE balancing.
pipeline_train_split_clas = Pipeline([
    ('split', FunctionTransformer(truncar_dividir_df, validate=False)),
    ('estandarizar_clas', FunctionTransformer(estandarizar_balancear_clas, validate=False)),
    ])

df_procesado1 = pipeline_prepara_datos.fit_transform(df)

X_smote_train, y_smote_train = pipeline_train_split_clas.fit_transform(df_procesado1)

# Train the network directly instead of through Pipeline/FunctionTransformer:
# Pipeline.fit only accepts fit parameters in `step__param` form (so
# `epochs=...` is rejected), and FunctionTransformer.fit does not call the
# wrapped function, so no model would be trained or stored that way.
modelo_clasificacion = create_classification_model(X_smote_train.shape[1])
modelo_clasificacion.compile(optimizer=Adam(learning_rate=0.001), loss= 'binary_crossentropy', metrics= ['Precision'])
modelo_clasificacion.fit(X_smote_train, y_smote_train, epochs=100, batch_size=32)

# Persist the trained model; create the target directory if it is missing.
# NOTE(review): this pickles the Keras model through joblib; confirm the
# installed Keras version supports pickling, otherwise use save_model with a
# `.keras` path. The scaler fitted during the split is not persisted.
os.makedirs('models', exist_ok=True)
joblib.dump(modelo_clasificacion, 'models/clasificacion_pipeline.joblib')