In [726]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.linear_model import LinearRegression, LogisticRegression
import pandas as pd
import numpy as np
import tensorflow as tf

In [727]:
file_path = "weatherAUS.csv"
df0 = pd.read_csv(file_path, sep=",", engine="python")
df = df0.copy()
ciudades = [
    " Adelaide",
    "Canberra",
    "Cobar",
    "Dartmoor",
    "Melbourne",
    "MelbourneAirport",
    "MountGambier",
    "Sydney",
    "SydneyAirport",
]  
# Filtrar por ciudades
df = df[df["Location"].isin(ciudades)]
df = df.drop("Location", axis=1)
# X
X = df.drop(['RainTomorrow', 'RainfallTomorrow', 'Unnamed: 0'], axis=1)
# y
# y = df[["RainfallTomorrow"]]
y = df[["RainTomorrow"]]

In [728]:
# Rellenar valores faltantes de RainFallTomorrow
# mediana = y['RainfallTomorrow'].median()
# # Rellenar NaN con la mediana
# y = y['RainfallTomorrow'].fillna(mediana)
# y.isna().sum()

mode = y['RainTomorrow'].mode()[0]
y = y['RainTomorrow'].fillna(mode)
# y.replace({'Yes': 1, 'No': 0}, inplace=True)
# y['RainTomorrow'] = y['RainTomorrow'].astype(int)

In [729]:
X = X.drop(['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], axis=1)

In [730]:
class NeuralNetworkTensorFlowOpt(BaseEstimator, RegressorMixin):
    def __init__(self, num_layers=3, n_units_layer_0=54, n_units_layer_1=95, lr=0.001, epochs=50):
        self.num_layers = num_layers
        self.n_units_layer_0 = n_units_layer_0
        self.n_units_layer_1 = n_units_layer_1
        self.lr = lr
        self.epochs = epochs
        self.model = self.build_model()

    def build_model(self):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(self.n_units_layer_0, activation='sigmoid', input_shape=(17,)))

        for _ in range(1, self.num_layers):
            model.add(tf.keras.layers.Dense(self.n_units_layer_1, activation='sigmoid'))

        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), 
                      loss='mean_squared_error')
        return model
    
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.history = self.model.fit(X, y, epochs=self.epochs, verbose=0)
        return self

    def predict(self, X):
        X = np.array(X)
        predictions = self.model.predict(X)
        return predictions
    
    def score(self, X, y):
        X = np.array(X)
        y = np.array(y)
        return self.model.evaluate(X, y, verbose=0)


In [731]:
class TransformData20(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y):
        return self  # No se necesita hacer ningún ajuste en el fit para este caso
    
    def transform(self, X, y=None):        
        # Eliminar columna "Unnamed: 0"
        if 'Unnamed: 0' in X.columns:
            X = X.drop("Unnamed: 0", axis=1)
        
        # Convertir la fecha a datetime
        X['Date'] = pd.to_datetime(X['Date'])
        
        # Determinar bimestre
        X['Bimestre'] = X['Date'].apply(self.determinar_bimestre)
        
        # Rellenar valores faltantes de Rainfall
        mediana_por_dia = X.groupby(X["Date"].dt.date)["Rainfall"].median()
        X["Rainfall"] = X.apply(
            lambda row: mediana_por_dia[row["Date"].date()] if pd.isnull(row["Rainfall"]) else row["Rainfall"],
            axis=1,
        )
        
        # Rellenar valores faltantes de Evaporation por bimestre
        medianas_evaporation = X.groupby("Bimestre")["Evaporation"].median()
        for bimestre, median in medianas_evaporation.items():
            X.loc[(X["Bimestre"] == bimestre) & (X["Evaporation"].isnull()), "Evaporation"] = median
        
        # Rellenar valores faltantes de Sunshine por día
        X['Sunshine'] = X.groupby(X['Date'].dt.day)["Sunshine"].transform(lambda x: x.fillna(x.mean()))

        # Rellenar valores faltantes de WindDir por día
        # X["WindGustDir"] = X.groupby(X["Date"].dt.day)[
        #     "WindGustDir"
        # ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        # X["WindDir9am"] = X.groupby(X["Date"].dt.day)[
        #     "WindDir9am"
        # ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        # X["WindDir3pm"] = X.groupby(X["Date"].dt.day)[
        #     "WindDir3pm"
        # ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        
        # Rellenar valores faltantes de WindSpeed, Humidity, Cloud, Pressure, Temp por día
        columns_to_fillna = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Cloud9am',
                             'Cloud3pm', 'Pressure9am', 'Pressure3pm', 'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm']
        
        for column in columns_to_fillna:
            if column in X.columns:
                X[column] = X.groupby(X['Date'].dt.day)[column].transform(lambda x: x.fillna(x.median()))
        
        # Rellenar valores faltantes de RainToday con la moda y pasarlo a 1 y 0
        # moda_RainToday = X.groupby("Date")["RainToday"].transform(
        #     lambda x: x.mode().iloc[0] if not x.mode().empty else None
        # )
        # X["RainToday"] = X["RainToday"].fillna(moda_RainToday)
        # X["RainToday"] = X["RainToday"].map({"Yes": 1, "No": 0})

        # Agrupar direcciones de viento
        # X['WindGustDir'] = X['WindGustDir'].apply(self.agrupar_direcciones)
        # X['WindDir9am'] = X['WindDir9am'].apply(self.agrupar_direcciones)
        # X['WindDir3pm'] = X['WindDir3pm'].apply(self.agrupar_direcciones)

        # Crear variables dummies para direcciones agrupadas
        # X = pd.get_dummies(X, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'],
        #                     drop_first=True)
        # X.replace({True: 1, False: 0}, inplace=True)
        
        # Crear variables dummies para direcciones agrupadas
        # dummies = pd.get_dummies(X, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'], drop_first=True)
        # dummies = dummies[['WindGustDir_N', 'WindGustDir_S', 'WindGustDir_W', 'WindDir9am_N', 'WindDir9am_S', 'WindDir9am_W', 'WindDir3pm_N', 'WindDir3pm_S', 'WindDir3pm_W']]
        # X = pd.concat([X, dummies], axis=1)
        
        # Calcular diferencia de temperatura máxima y mínima
        # X['Dif_Temp_Max_Min'] = X['MaxTemp'] - X['MinTemp']
        # X = X.drop(['MaxTemp', 'MinTemp'], axis=1)
        
        # Calcular diferencia de temperaturas 9am y 3pm
        # X['Temp_Difference'] = X['Temp3pm'] - X['Temp9am']
        # X = X.drop(['Temp3pm', 'Temp9am'], axis=1)
        
        # Eliminar columnas innecesarias
        X = X.drop(['Date', 'Bimestre'], axis=1)
        # X = X.drop(['Bimestre'], axis=1)
        
        return X
    
    def determinar_bimestre(self, fecha):
        mes = fecha.month
        if 1 <= mes <= 2:
            return "Bimestre 1"
        elif 3 <= mes <= 4:
            return "Bimestre 2"
        elif 5 <= mes <= 6:
            return "Bimestre 3"
        elif 7 <= mes <= 8:
            return "Bimestre 4"
        elif 9 <= mes <= 10:
            return "Bimestre 5"
        else:
            return "Bimestre 6"
    
    def agrupar_direcciones(self, direccion):
        grupos_principales = {
            "N": ["N", "NNW", "NNE"],
            "S": ["S", "SSW", "SSE"],
            "E": ["E", "ENE", "ESE", "SE", "NE"],
            "W": ["W", "WNW", "WSW", "SW", "NW"],
        }
        for grupo, direcciones in grupos_principales.items():
            if direccion in direcciones:
                return grupo
        return "Otro"


In [732]:
# Crear el pipeline
pipeline20 = Pipeline([
    ('transform_data', TransformData20()),
    # ('regression', LogisticRegression(random_state=42, class_weight='balanced'))
    ('regression', NeuralNetworkTensorFlowOpt())
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [733]:
# Entrenar el pipeline
pipeline20.fit(X, y)

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_12" is incompatible with the layer: expected axis -1 of input shape to have value 17, but received input with shape (None, 16)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 16), dtype=float32)
  • training=True
  • mask=None

In [670]:
# Data
data = {
    'Date': ['2009-01-19'],    
    'MinTemp': [21.],
    'MaxTemp': [37.5], 
    'Rainfall': [0.0],
    'Evaporation': [14.8],
    'Sunshine': [6.9],
    # 'WindGustDir': ['N'],  
    'WindGustSpeed': [80.0],
    # 'WindDir9am': ['N'], 
    # 'WindDir3pm': ['W'],  
    'WindSpeed9am': [28.0],
    'WindSpeed3pm': [19.0],
    'Humidity9am': [50.0],
    'Humidity3pm': [80.0],
    'Pressure9am': [1013.1],
    'Pressure3pm': [1009.6],
    'Cloud9am': [7.0],
    'Cloud3pm': [6.0],
    'Temp9am': [26.2],
    'Temp3pm': [34.1],
    # 'RainToday': [1],
    'Bimestre': ['Bimestre 1'],
}

fila = pd.DataFrame(data)
fila['Date'] = pd.to_datetime(fila['Date'])
# fila.replace({True: int(1), False: int(0)}, inplace=True)

# # Crear variables dummies para direcciones agrupadas
# # fila = pd.get_dummies(fila, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'],
# #                     drop_first=True)
# # fila.replace({True: 1, False: 0}, inplace=True)
# # Crear variables dummies para direcciones agrupadas
# dummies = pd.get_dummies(fila, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'],
#                     drop_first=True)
# # dummies = dummies[['WindGustDir_N', 'WindGustDir_S', 'WindGustDir_W', 'WindDir9am_N', 'WindDir9am_S', 'WindDir9am_W', 'WindDir3pm_N', 'WindDir3pm_S', 'WindDir3pm_W']]
# # fila = pd.concat([fila, dummies], axis=1)

In [724]:
print("SHAPE X:", X.shape)
print("SHAPE DF_FILA:", fila.shape)

SHAPE X: (25041, 17)
SHAPE DF_FILA: (1, 18)


In [672]:
# Predecir con el pipeline
predictions = pipeline20.predict(fila)

In [673]:
print(predictions)

['Yes']
