<a href="https://colab.research.google.com/github/claudialeguiza/AA1-TUIA-Kidonakis-Leguiza/blob/navegador/generar_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import keras
from keras.models import Sequential, save_model, load_model
from keras.layers import Dense, Activation, Dropout
import joblib
import warnings
warnings.simplefilter('ignore')

In [3]:
datos = pd.read_csv('/content/weatherAUS.csv', delimiter = ",")

In [4]:
df = datos[datos.Location\
                      .isin(( 'Sydney','SydneyAirport','Melbourne', 'MelbourneAirport',\
                             'Canberra','Adelaide', 'MountGambier','Cobar', 'Dartmoor' ))]

In [5]:
def preprocesamiento(data):
    data.info()
    data.isna().sum()

    # Definir columnas con valores nulos
    columnas_con_nulos = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                          'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm','Humidity9am',
                          'Humidity3pm', 'Pressure9am','Pressure3pm', 'Cloud9am',
                          'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainfallTomorrow']

    # Rellenar valores faltantes en 'RainToday' y 'RainTomorrow'
    data['RainToday'] = data.groupby('Date')['RainToday'].transform(lambda x: x.fillna(x.mode().iloc[0]))
    data['RainTomorrow'] = data.groupby('Date')['RainTomorrow'].transform(lambda x: x.fillna(x.mode().iloc[0]))

    # Rellenar valores faltantes en direcciones del viento
    data['WindGustDir'] = data.groupby('Date')['WindGustDir'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.isna().all() else x)
    data['WindDir9am'] = data.groupby('Date')['WindDir9am'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.isna().all() else x)
    data['WindDir3pm'] = data.groupby('Date')['WindDir3pm'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.isna().all() else x)

    # Rellenar valores faltantes con la media por día para las columnas especificadas
    media_por_dia = data.groupby('Date')[columnas_con_nulos].transform('mean')
    data[columnas_con_nulos] = data[columnas_con_nulos].fillna(media_por_dia)

    data['Date'] = pd.to_datetime(data['Date'])

    return data


In [6]:
def crear_columna_season(data):
   data['season'] = data['Date'].apply(asignar_estacion)
   return data

In [7]:
def asignar_estacion(fecha):
    mes = fecha.month
    if mes in [12, 1, 2]:  # Verano: Diciembre, Enero, Febrero
        return 'Summer'
    elif mes in [3, 4, 5]:  # Otoño: Marzo, Abril, Mayo
        return 'Autumn'
    elif mes in [6, 7, 8]:  # Invierno: Junio, Julio, Agosto
        return 'Winter'
    else:  # Primavera: Septiembre, Octubre, Noviembre
        return 'Spring'

In [8]:
def codificar_variables(data):

  data1 = data.copy()

  for var in ["SW", "S", 'SSW', 'W', 'SSE', 'E', 'SE', 'NE', 'NNE', 'WSW', 'WNW',
            'NW', 'N', 'ESE', 'ENE']:
    # Crear columnas para WindGustDir
    data1[f'WindGustDir_{var}'] = 0
    data1.loc[data['WindGustDir'] == var, f'WindGustDir_{var}'] = 1

    # Crear columnas para WindGust9am
    data1[f'WindDir9am_{var}'] = 0
    data1.loc[data['WindDir9am'] == var, f'WindDir9am_{var}'] = 1

    # Crear columnas para WindGust3pm
    data1[f'WindDir3pm_{var}'] = 0
    data1.loc[data['WindDir3pm'] == var, f'WindDir3pm_{var}'] = 1

    # Generamos dummys
    # Codificar la variable 'RainToday'
  data1 = pd.get_dummies(data1, columns=['RainToday'],drop_first=True)

    # Codificar la variable 'RainTomorrow'
  data1 = pd.get_dummies(data1, columns=['RainTomorrow'],drop_first=True)

    # Codificar la variable 'season'
  data1 = pd.get_dummies(data1, columns=['season'],drop_first=True)

    # Codificar la variable 'Location'
  data1 = pd.get_dummies(data1, columns=['Location'], drop_first=True)
  data1 = data1.drop(columns= ['WindGustDir', 'WindDir9am', 'WindDir3pm'])
  data1 = data1.drop(columns = ['Unnamed: 0'])

  return data1

In [9]:
def estandarizar_df(data):
  scaler = RobustScaler()
  data_scaled = scaler.fit_transform(data)
  return data_scaled

In [15]:
def truncar_dividir_df(data):
  data = data.sort_values(["Date"])
  fecha_especifica = '2009-01-01'
  data_filtrada = data[data['Date'] >= fecha_especifica]

  data_filtrada.reset_index(drop = True, inplace = True)#Resetea el indice y no  crea uno  nuevo
  data_train  = data_filtrada.iloc[:21658]
  data_test = data_filtrada.iloc[21658:]

  return data_train

In [11]:
def estandarizar_regresion(data):

  # Separar variables inependientes y dependintes
   X_regresion = data.drop(columns=['RainfallTomorrow', 'Date'])
   X_scaled = estandarizar_df(X_regresion)
   y_regresion = data['RainfallTomorrow']
   y_scaled = estandarizar_df(y_regresion)
   return X_scaled, y_scaled

In [16]:
def estandarizar_balancear_clasificacion(data):

   X_clasificacion = data.drop(columns=['RainTomorrow_Yes', "RainfallTomorrow","Date"])
   X_scaled1 = estandarizar_df(X_clasificacion)
   y_clasificacion = data['RainTomorrow_Yes']
   y_scaled1 = estandarizar_df(y_clasificacion)

   smote  = SMOTE(random_state =42)
   X_smote_scaled, y_smote_scaled = smote.fit_resample(X_scaled1,y_scaled1)

   return X_smote_scaled, y_smote_scaled


In [21]:
pipeline_prepara_datos = Pipeline([
    ('imputar',preprocesamiento(df), crear_columna_season(df)),
    ('truncar', truncar_dividir_df(df)),
    ('codificar', codificar_variables(df)),
    ('estandarizar',estandarizar_regresion(df))
     ])

x_train_scaled, y_train_scaled = pipeline_prepara_datos(df)



pipeline_regresion.fit(X_train_scaled, y_train_scaled)

joblib.dump(pipe_regresionregresion, 'models/regresion_pipeline.joblib')

<class 'pandas.core.frame.DataFrame'>
Index: 28233 entries, 6047 to 102519
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        28233 non-null  int64         
 1   Date              28233 non-null  datetime64[ns]
 2   Location          28233 non-null  object        
 3   MinTemp           28233 non-null  float64       
 4   MaxTemp           28233 non-null  float64       
 5   Rainfall          28233 non-null  float64       
 6   Evaporation       28233 non-null  float64       
 7   Sunshine          28233 non-null  float64       
 8   WindGustDir       28229 non-null  object        
 9   WindGustSpeed     28231 non-null  float64       
 10  WindDir9am        28231 non-null  object        
 11  WindDir3pm        28233 non-null  object        
 12  WindSpeed9am      28233 non-null  float64       
 13  WindSpeed3pm      28233 non-null  float64       
 14  Humidity9am       28233

ValueError: could not convert string to float: 'Cobar'

In [None]:
pipe_clasificacion = Pipeline([
    ('imputer', preprocesamiento(df), crear_columna_season(df), codificar_variables(df)),
    ('scaler', RobustScaler()),
    ('model', classification_model_best_params)
])

pipe_clasificacion.fit(X_smote_train, y_smote_train)

joblib.dump(pipe_clasificacion, 'models/clasificacion_pipeline.joblib')