In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

In [20]:
file_path = "weatherAUS.csv"
df0 = pd.read_csv(file_path, sep=",", engine="python")
df = df0.copy()
ciudades = [
    " Adelaide",
    "Canberra",
    "Cobar",
    "Dartmoor",
    "Melbourne",
    "MelbourneAirport",
    "MountGambier",
    "Sydney",
    "SydneyAirport",
]  
# Filtrar por ciudades
df = df[df["Location"].isin(ciudades)]
df = df.drop("Location", axis=1)
# X
X = df.drop(['RainTomorrow', 'RainfallTomorrow', 'Unnamed: 0'], axis=1)
# y
y = df[["RainfallTomorrow"]]

In [21]:
# Rellenar valores faltantes de RainFallTomorrow
mediana = y['RainfallTomorrow'].median()
# Rellenar NaN con la mediana
y = y['RainfallTomorrow'].fillna(mediana)
y.isna().sum()

0

In [22]:
X.isna().sum()

Date                0
MinTemp           568
MaxTemp           556
Rainfall          884
Evaporation      2865
Sunshine         5290
WindGustDir      1620
WindGustSpeed    1617
WindDir9am       1374
WindDir3pm        421
WindSpeed9am      353
WindSpeed3pm      335
Humidity9am       935
Humidity3pm       885
Pressure9am       831
Pressure3pm       822
Cloud9am         6175
Cloud3pm         6358
Temp9am           580
Temp3pm           572
RainToday         884
dtype: int64

In [23]:
class TransformData20(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()
        
    def fit(self, X, y):
        return self  # No se necesita hacer ningún ajuste en el fit para este caso
    
    def transform(self, X, y=None):        
        # Eliminar columna "Unnamed: 0"
        if 'Unnamed: 0' in X.columns:
            X = X.drop("Unnamed: 0", axis=1)
        
        # Filtrar por ciudades
        # X = X[X["Location"].isin(self.ciudades)]
        # X = X.drop("Location", axis=1)
        
        # Convertir la fecha a datetime
        X['Date'] = pd.to_datetime(X['Date'])
        
        # Determinar bimestre
        X['Bimestre'] = X['Date'].apply(self.determinar_bimestre)
        
        # Rellenar valores faltantes de Rainfall
        mediana_por_dia = X.groupby(X["Date"].dt.date)["Rainfall"].median()
        X["Rainfall"] = X.apply(
            lambda row: mediana_por_dia[row["Date"].date()] if pd.isnull(row["Rainfall"]) else row["Rainfall"],
            axis=1,
        )
        
        # Rellenar valores faltantes de Evaporation por bimestre
        medianas_evaporation = X.groupby("Bimestre")["Evaporation"].median()
        for bimestre, median in medianas_evaporation.items():
            X.loc[(X["Bimestre"] == bimestre) & (X["Evaporation"].isnull()), "Evaporation"] = median
        
        # Rellenar valores faltantes de Sunshine por día
        X['Sunshine'] = X.groupby(X['Date'].dt.day)["Sunshine"].transform(lambda x: x.fillna(x.mean()))


        # Rellenar valores faltantes de WindDir por día
        X["WindGustDir"] = X.groupby(X["Date"].dt.day)[
            "WindGustDir"
        ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        X["WindDir9am"] = X.groupby(X["Date"].dt.day)[
            "WindDir9am"
        ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        X["WindDir3pm"] = X.groupby(X["Date"].dt.day)[
            "WindDir3pm"
        ].transform(lambda x: x.fillna(x.mode().iloc[0]))
        
        # Rellenar valores faltantes de WindSpeed, Humidity, Cloud, Pressure, Temp por día
        columns_to_fillna = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Cloud9am',
                             'Cloud3pm', 'Pressure9am', 'Pressure3pm', 'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm']
        
        for column in columns_to_fillna:
            if column in X.columns:
                X[column] = X.groupby(X['Date'].dt.day)[column].transform(lambda x: x.fillna(x.median()))
        
        # Rellenar valores faltantes de RainToday con la moda y pasarlo a 1 y 0
        moda_RainToday = X.groupby("Date")["RainToday"].transform(
            lambda x: x.mode().iloc[0] if not x.mode().empty else None
        )
        X["RainToday"] = X["RainToday"].fillna(moda_RainToday)
        X["RainToday"] = X["RainToday"].map({"Yes": 1, "No": 0})

        # Agrupar direcciones de viento
        X['WindGustDir_Agrupado'] = X['WindGustDir'].apply(self.agrupar_direcciones)
        X['WindDir9am_Agrupado'] = X['WindDir9am'].apply(self.agrupar_direcciones)
        X['WindDir3pm_Agrupado'] = X['WindDir3pm'].apply(self.agrupar_direcciones)
        X = X.drop(['WindGustDir', 'WindDir9am', 'WindDir3pm'], axis=1)
        
        # Crear variables dummies para direcciones agrupadas
        X = pd.get_dummies(X, columns=['WindGustDir_Agrupado', 'WindDir9am_Agrupado', 'WindDir3pm_Agrupado'],
                            drop_first=True)
        
        # Calcular diferencia de temperatura máxima y mínima
        X['Dif_Temp_Max_Min'] = X['MaxTemp'] - X['MinTemp']
        X = X.drop(['MaxTemp', 'MinTemp'], axis=1)
        
        # Calcular diferencia de temperaturas 9am y 3pm
        X['Temp_Difference'] = X['Temp3pm'] - X['Temp9am']
        X = X.drop(['Temp3pm', 'Temp9am'], axis=1)
        
        # Eliminar columnas innecesarias
        X = X.drop(['Date', 'Bimestre'], axis=1)

        # df_train
        scaler = StandardScaler()
        X_Scale = scaler.fit_transform(X)
        X_Scale = pd.DataFrame(X_Scale, columns=X.columns)
        
        return X_Scale
    
    def determinar_bimestre(self, fecha):
        mes = fecha.month
        if 1 <= mes <= 2:
            return "Bimestre 1"
        elif 3 <= mes <= 4:
            return "Bimestre 2"
        elif 5 <= mes <= 6:
            return "Bimestre 3"
        elif 7 <= mes <= 8:
            return "Bimestre 4"
        elif 9 <= mes <= 10:
            return "Bimestre 5"
        else:
            return "Bimestre 6"
    
    def agrupar_direcciones(self, direccion):
        grupos_principales = {
            "N": ["N", "NNW", "NNE"],
            "S": ["S", "SSW", "SSE"],
            "E": ["E", "ENE", "ESE", "SE", "NE"],
            "W": ["W", "WNW", "WSW", "SW", "NW"],
        }

        for grupo, direcciones in grupos_principales.items():
            if direccion in direcciones:
                return grupo

        return "Otro"


In [24]:
# Crear el pipeline
pipeline20 = Pipeline([
    # ('scaler', Scaler()),
    ('transform_data', TransformData20()),
    ('regression', LinearRegression())
])

In [25]:
# Entrenar el pipeline
pipeline20.fit(X, y)

In [26]:
pd.set_option('future.no_silent_downcasting', True)
#### Fila para PREDICT
# Filtrar el DataFrame
fila = df.iloc[10]
# Convertir la fila en un nuevo DataFrame
df_fila = pd.DataFrame(fila).transpose()
df_fila = df_fila.drop(['RainTomorrow', 'RainfallTomorrow', 'Unnamed: 0'], axis=1)
transformer = TransformData20()
transformer.transform(df_fila)
# Mostrar la fila seleccionada
df_fila.columns

Index(['Date', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'Bimestre', 'WindGustDir_Agrupado',
       'WindDir9am_Agrupado', 'WindDir3pm_Agrupado'],
      dtype='object')

In [27]:
# Lista de columnas que no quieres convertir a float
excluded_columns = ['Date', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Bimestre',
       'WindGustDir_Agrupado', 'WindDir9am_Agrupado', 'WindDir3pm_Agrupado']

# Convertir todas las columnas a float, excepto las excluidas
for col in df_fila.columns:
    if col not in excluded_columns:
        df_fila[col] = df_fila[col].astype(float)

In [28]:
df_fila.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 6057 to 6057
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  1 non-null      datetime64[ns]
 1   MinTemp               1 non-null      float64       
 2   MaxTemp               1 non-null      float64       
 3   Rainfall              1 non-null      float64       
 4   Evaporation           1 non-null      float64       
 5   Sunshine              1 non-null      float64       
 6   WindGustDir           1 non-null      object        
 7   WindGustSpeed         1 non-null      float64       
 8   WindDir9am            1 non-null      object        
 9   WindDir3pm            1 non-null      object        
 10  WindSpeed9am          1 non-null      float64       
 11  WindSpeed3pm          1 non-null      float64       
 12  Humidity9am           1 non-null      float64       
 13  Humidity3pm           1

In [29]:
df_fila

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Bimestre,WindGustDir_Agrupado,WindDir9am_Agrupado,WindDir3pm_Agrupado
6057,2009-01-11,19.7,35.5,0.0,11.0,12.7,NE,41.0,NNE,WSW,...,1005.8,1.0,5.0,24.0,33.6,0.0,Bimestre 1,E,N,W


In [30]:
X.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Bimestre,WindGustDir_Agrupado,WindDir9am_Agrupado,WindDir3pm_Agrupado
6047,2009-01-01,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,...,1004.4,2.0,5.0,26.6,33.4,0,Bimestre 1,S,E,W
6048,2009-01-02,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,...,1012.1,1.0,1.0,20.3,27.0,0,Bimestre 1,S,S,S
6049,2009-01-03,15.5,34.1,0.0,12.6,13.3,SE,30.0,N,N,...,1011.6,6.0,1.0,14.5,32.7,0,Bimestre 1,E,N,N
6050,2009-01-04,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,...,1009.2,1.0,6.0,28.7,34.9,0,Bimestre 1,N,N,N
6051,2009-01-05,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,...,1009.1,1.0,5.0,29.1,35.6,0,Bimestre 1,W,W,W


In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25041 entries, 6047 to 102519
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  25041 non-null  datetime64[ns]
 1   MinTemp               25041 non-null  float64       
 2   MaxTemp               25041 non-null  float64       
 3   Rainfall              25041 non-null  float64       
 4   Evaporation           25041 non-null  float64       
 5   Sunshine              25041 non-null  float64       
 6   WindGustDir           25041 non-null  object        
 7   WindGustSpeed         25041 non-null  float64       
 8   WindDir9am            25041 non-null  object        
 9   WindDir3pm            25041 non-null  object        
 10  WindSpeed9am          25041 non-null  float64       
 11  WindSpeed3pm          25041 non-null  float64       
 12  Humidity9am           25041 non-null  float64       
 13  Humidity3pm      

In [32]:
print("SHAPE X:", X.shape)
print("SHAPE DF_FILA:", df_fila.shape)

SHAPE X: (25041, 25)
SHAPE DF_FILA: (1, 25)


In [33]:
# Predecir con el pipeline
predictions = pipeline20.predict(df_fila)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- WindDir3pm_Agrupado_N
- WindDir3pm_Agrupado_S
- WindDir3pm_Agrupado_W
- WindDir9am_Agrupado_N
- WindDir9am_Agrupado_S
- ...
