In [18]:
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Load the four dataset splits, parsing the 'fecha' column as datetimes.
# The unpacking order matches the order of the paths below.
full_df, train_df, validation_df, test_df = (
    pd.read_csv(csv_path, parse_dates=['fecha'])
    for csv_path in (
        '../../data/ml/full.csv',
        '../../data/ml/train.csv',
        '../../data/ml/validation.csv',
        '../../data/ml/test.csv',
    )
)

In [47]:
# Peek at the first rows of the unscaled 'tmed' column of the validation split.
validation_df.loc[:, 'tmed'].head()

0     9.0
1    10.2
2     4.6
3     7.6
4     5.1
Name: tmed, dtype: float64

In [20]:
# Four independent scalers mapping to [0, 1]: features (X) and target (y),
# each for the full dataset and for the training split.
scaler_X_full, scaler_X_train, scaler_y_full, scaler_y_train = (
    MinMaxScaler(feature_range=(0, 1)) for _unused in range(4)
)

In [36]:
def scale_df(df: pd.DataFrame, scaler_X, scaler_y, fit: bool = False) -> pd.DataFrame:
    """Return a scaled copy of ``df``; the input frame is never modified.

    The target column ``tmed`` is passed through ``scaler_y``. Every column
    other than ``tmed``, ``fecha``, ``idema``, ``fecha_sin`` and ``fecha_cos``
    is treated as a feature and passed through ``scaler_X``. The four
    excluded columns are carried over unscaled.

    Features are handed to the scalers as a plain array in the frame's own
    column order, so with ``fit=False`` the frame must list its feature
    columns in the same order as the frame the scalers were fitted on.

    Args:
        df: Frame holding the target, the four pass-through columns and the
            feature columns.
        scaler_X: Object exposing ``fit_transform`` and ``transform`` for the
            2-D feature array.
        scaler_y: Object exposing ``fit_transform`` and ``transform`` for the
            single-column target array.
        fit: If True, fit both scalers on ``df`` while transforming it
            (this mutates the scalers). If False, apply the scalers as-is.

    Returns:
        A new frame with the same columns, column order and index as ``df``,
        with numeric values rounded to 4 decimals.

    Raises:
        KeyError: If the target or any pass-through column is missing.
    """
    target = 'tmed'
    ignore_cols = ['fecha', 'idema', 'fecha_sin', 'fecha_cos']

    # Fail early with an explicit message instead of a generic drop() error.
    missing = [col for col in ignore_cols + [target] if col not in df.columns]
    if missing:
        raise KeyError(f"scale_df: missing required columns {missing}")

    X_scale_cols = list(df.drop(columns=ignore_cols + [target]).columns)
    print(X_scale_cols)
    y_scale_cols = [target]

    X_values = df[X_scale_cols].to_numpy()
    y_values = df[y_scale_cols].to_numpy()

    if fit:
        X_scaled = scaler_X.fit_transform(X_values)
        y_scaled = scaler_y.fit_transform(y_values)
    else:
        X_scaled = scaler_X.transform(X_values)
        y_scaled = scaler_y.transform(y_values)

    # Start from a copy and overwrite only the scaled columns. This keeps the
    # caller's index, the column order and the dtypes of the pass-through
    # columns, none of which survive a rebuild from raw arrays.
    df_scaled = df.copy()
    df_scaled[X_scale_cols] = X_scaled
    df_scaled[y_scale_cols] = y_scaled

    # Kept for inputs where these columns arrive as object dtype; it is a
    # no-op for columns that are already numeric. Unparseable values become NaN.
    numeric_cols = ['fecha_sin', 'fecha_cos']
    df_scaled[numeric_cols] = df_scaled[numeric_cols].apply(pd.to_numeric, errors='coerce')

    return df_scaled.round(4)

In [37]:
# Fit each scaler pair on its own dataset while scaling it:
# the *_full scalers on the full data, the *_train scalers on the training split.
full_df_scaled = scale_df(
    df=full_df,
    scaler_X=scaler_X_full,
    scaler_y=scaler_y_full,
    fit=True,
)
train_df_scaled = scale_df(
    df=train_df,
    scaler_X=scaler_X_train,
    scaler_y=scaler_y_train,
    fit=True,
)

['prec', 'tmin', 'tmax', 'hr_max', 'hr_media', 'latitud', 'altitud']
['prec', 'tmin', 'tmax', 'hr_max', 'hr_media', 'latitud', 'altitud']


In [38]:
# Persist the fitted scalers to disk as joblib files.
joblib.dump(value=scaler_X_full, filename='../../ml/scalers/scaler_X_full.joblib')
joblib.dump(value=scaler_y_full, filename='../../ml/scalers/scaler_y_full.joblib')
joblib.dump(value=scaler_X_train, filename='../../ml/scalers/scaler_X_train.joblib')
joblib.dump(value=scaler_y_train, filename='../../ml/scalers/scaler_y_train.joblib')

['../../ml/scalers/scaler_y_train.joblib']

In [39]:
# Scale validation and test with the scalers fitted on the training split
# (transform only, no refitting).
validation_df_scaled = scale_df(
    df=validation_df,
    scaler_X=scaler_X_train,
    scaler_y=scaler_y_train,
    fit=False,
)
test_df_scaled = scale_df(
    df=test_df,
    scaler_X=scaler_X_train,
    scaler_y=scaler_y_train,
    fit=False,
)

['prec', 'tmin', 'tmax', 'hr_max', 'hr_media', 'latitud', 'altitud']
['prec', 'tmin', 'tmax', 'hr_max', 'hr_media', 'latitud', 'altitud']


In [40]:
# First three rows of the scaled full dataset.
full_df_scaled.head(n=3)

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_media,latitud,altitud,fecha_sin,fecha_cos
0,2010-01-01,2331,0.3339,0.0049,0.3475,0.5594,0.8727,0.9394,0.9112,0.2879,0.5086,0.9999
1,2010-01-01,0320I,0.3463,0.0,0.3435,0.5779,0.8545,0.5657,0.9091,0.372,0.5086,0.9999
2,2010-01-01,C428T,0.5724,0.0,0.5186,0.7049,0.6909,0.6869,0.032,0.1349,0.5086,0.9999


In [41]:
# First three rows of the scaled training split.
train_df_scaled.head(n=3)

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_media,latitud,altitud,fecha_sin,fecha_cos
0,2010-01-01,2331,0.3285,0.0104,0.3673,0.2989,0.8972,0.9394,0.9112,0.3117,0.5086,0.9999
1,2010-01-01,0320I,0.3411,0.0,0.362,0.3284,0.8785,0.5657,0.9091,0.4028,0.5086,0.9999
2,2010-01-01,C428T,0.5709,0.0,0.594,0.532,0.7103,0.6869,0.032,0.1461,0.5086,0.9999


In [42]:
# First three rows of the scaled validation split.
validation_df_scaled.head(n=3)

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_media,latitud,altitud,fecha_sin,fecha_cos
0,2021-01-01,B346X,0.4417,0.0581,0.4534,0.4253,0.9065,0.899,0.7355,0.0417,0.5086,0.9999
1,2021-01-01,6293X,0.4632,0.0194,0.4394,0.4778,0.6729,0.5051,0.5599,0.0007,0.5086,0.9999
2,2021-01-01,C453I,0.3627,0.0,0.3849,0.3465,0.9252,0.6599,0.0455,0.6105,0.5086,0.9999


In [43]:
# First three rows of the scaled test split.
test_df_scaled.head(n=3)

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_media,latitud,altitud,fecha_sin,fecha_cos
0,2023-01-01,8293X,0.4865,0.0,0.4306,0.5287,0.7383,0.6465,0.7035,0.0305,0.5086,0.9999
1,2023-01-01,2918Y,0.4506,0.0852,0.4745,0.422,0.8785,0.6263,0.8006,0.3594,0.5086,0.9999
2,2023-01-01,4520X,0.5278,0.0328,0.5185,0.5205,0.8972,0.4949,0.6519,0.2049,0.5086,0.9999


In [44]:
# Spread (max - min) of every numeric column in the scaled full dataset.
numeric_cols = full_df_scaled.select_dtypes(include='number')
numeric_cols.max().sub(numeric_cols.min())

tmed         1.0
prec         1.0
tmin         1.0
tmax         1.0
hr_max       1.0
hr_media     1.0
latitud      1.0
altitud      1.0
fecha_sin    1.0
fecha_cos    1.0
dtype: float64

In [45]:
# Write each scaled frame next to its unscaled source, without the index column.
for scaled_frame, csv_path in (
    (full_df_scaled, '../../data/ml/full_scaled.csv'),
    (train_df_scaled, '../../data/ml/train_scaled.csv'),
    (validation_df_scaled, '../../data/ml/validation_scaled.csv'),
    (test_df_scaled, '../../data/ml/test_scaled.csv'),
):
    scaled_frame.to_csv(csv_path, index=False)