In [32]:
import glob
import sys

import joblib
import numpy as np
import pandas as pd

# Add the directory two levels up to the import path so `helpers` can be found.
sys.path.append('../..')
from helpers import lookups

In [2]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Read every historical CSV whose path does not contain '2025'
# (presumably 2025 is held out from training -- confirm).
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the load order (and therefore the row order of the
# concatenated frame) reproducible across machines.
train_dfs = [
    pd.read_csv(file, parse_dates=['fecha'], low_memory=False)
    for file in sorted(glob.glob('../../data/historical/historico/*.csv'))
    if '2025' not in file
]

In [4]:
# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
train_df = pd.concat(objs=train_dfs, ignore_index=True)

In [5]:
# Order rows by date and renumber the index from zero.
train_df = train_df.sort_values('fecha', ignore_index=True)

In [6]:
# Preview the first five rows of the sorted raw data.
train_df.head(n=5)

Unnamed: 0,fecha,idema,tmed,prec,tmin,hora_tmin,tmax,hora_tmax,hr_max,hora_hr_max,hr_min,hora_hr_min,hr_media,dir,velmedia,racha,hora_racha,pres_max,pres_min,sol
0,1997-01-01,3110C,0.6,0.0,-0.5,09:40:00,1.7,,99.0,,84.0,14:15:00,93.0,13.0,0.3,3.9,16:00:00,891.3,885.6,
1,1997-01-01,1495,1.8,4.1,0.4,07:00:00,3.2,12:00:00,,,,,100.0,18.0,1.9,6.1,06:05:00,977.4,975.2,0.5
2,1997-01-01,B691,10.0,6.0,4.0,06:00:00,16.0,12:00:00,,,,,84.0,4.0,0.8,6.1,13:40:00,,,2.3
3,1997-01-01,0076,7.8,6.2,4.0,02:50:00,11.6,14:20:00,,,,,81.0,8.0,4.4,11.1,20:30:00,1012.0,1005.8,6.7
4,1997-01-01,1024E,2.7,0.0,0.4,,5.0,15:30:00,,,,,85.0,6.0,1.9,5.8,13:57:00,981.6,975.0,6.5


In [7]:
# List the distinct calendar years present in the loaded data.
pd.unique(train_df['fecha'].dt.year)

array([1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
       2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2022, 2023, 2024], dtype=int32)

In [8]:
def clean_df(df, time_cols=None, dir_period=99):
    """Turn raw daily observations into a numeric, cyclically-encoded feature frame.

    The input frame is not modified. The returned frame:

    * has the ``time_cols`` columns removed;
    * has every column except ``fecha`` and ``idema`` coerced with
      ``pd.to_numeric(errors='coerce')`` (unparseable values become NaN);
    * gains ``year`` = (calendar year - 1950) / 100;
    * gains ``fecha_sin`` / ``fecha_cos``, the sine/cosine of the day of year
      mapped onto a full circle of 365 days (366 in leap years);
    * gains ``dir_sin`` / ``dir_cos``, the sine/cosine of ``dir`` mapped onto a
      full circle of ``dir_period`` units, and loses the raw ``dir`` column;
    * has all numeric columns rounded to 4 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``fecha`` column, an ``idema`` column, a ``dir``
        column and every column listed in ``time_cols``.
    time_cols : list-like of str, optional
        Columns to drop before numeric coercion. Defaults to
        ``lookups.time_cols``.
    dir_period : float, default 99
        Value of ``dir`` that corresponds to one full turn.
        NOTE(review): 99 is the historical divisor and is kept as the default so
        existing outputs do not change. If ``dir`` is expressed in tens of
        degrees (0-36), 36 would be the natural period -- confirm against the
        data source before changing it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.

    Raises
    ------
    KeyError
        If ``fecha``, ``idema``, ``dir`` or any of ``time_cols`` is missing.
    """
    if time_cols is None:
        time_cols = lookups.time_cols

    # DataFrame.drop returns a new frame, so the caller's data is left untouched.
    df_clean = df.drop(columns=time_cols)

    # Everything except the date and the station id column is treated as numeric.
    numeric_cols = df_clean.columns.drop(['fecha', 'idema'])
    df_clean[numeric_cols] = df_clean[numeric_cols].apply(pd.to_numeric, errors='coerce')

    fecha = df_clean['fecha'].dt

    # Use the real Gregorian rule: `year % 4 == 0` is wrong for century years
    # such as 1900 and 2100, which are not leap years.
    days_in_year = np.where(fecha.is_leap_year, 366, 365)
    day_angle = 2 * np.pi * fecha.dayofyear / days_in_year

    # Columns are added in this order to keep the established output layout:
    # year, fecha_sin, fecha_cos, dir_sin, dir_cos.
    df_clean['year'] = (fecha.year - 1950) / 100
    df_clean['fecha_sin'] = np.sin(day_angle)
    df_clean['fecha_cos'] = np.cos(day_angle)

    # Vectorised over the whole column; NaN directions propagate as NaN.
    dir_angle = 2 * np.pi * df_clean['dir'] / dir_period
    df_clean['dir_sin'] = np.sin(dir_angle)
    df_clean['dir_cos'] = np.cos(dir_angle)

    df_clean = df_clean.drop(columns=['dir'])
    df_clean = df_clean.round(4)

    return df_clean

In [9]:
# Apply the cleaning / feature-encoding step to the full training frame.
train_df_clean = clean_df(df=train_df)

In [28]:
# Attach latitude, longitude and altitude to each observation by joining on
# `idema` (presumably the station identifier -- confirm).
# validate='many_to_one' makes pandas raise MergeError if `idema` is duplicated
# in the lookup table, instead of silently multiplying the training rows.
station_location_cols = ['idema', 'latitud', 'longitud', 'altitud']
train_df_combined = train_df_clean.merge(
    lookups.locations_df[station_location_cols],
    on='idema',
    how='left',
    validate='many_to_one',
)

In [29]:
# Preview the first five rows of the merged feature table.
train_df_combined.head(n=5)

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_min,hr_media,velmedia,racha,pres_max,pres_min,sol,year,fecha_sin,fecha_cos,dir_sin,dir_cos,latitud,longitud,altitud
0,1997-01-01,3110C,0.6,0.0,-0.5,1.7,99.0,84.0,93.0,0.3,3.9,891.3,885.6,,0.47,0.0172,0.9999,0.7346,0.6785,41.0,-3.6,1030
1,1997-01-01,1495,1.8,4.1,0.4,3.2,,,100.0,1.9,6.1,977.4,975.2,0.5,0.47,0.0172,0.9999,0.9096,0.4154,42.2333,-8.6167,255
2,1997-01-01,B691,10.0,6.0,4.0,16.0,,,84.0,0.8,6.1,,,2.3,0.47,0.0172,0.9999,0.2511,0.9679,39.7333,3.0,40
3,1997-01-01,0076,7.8,6.2,4.0,11.6,,,81.0,4.4,11.1,1012.0,1005.8,6.7,0.47,0.0172,0.9999,0.4862,0.8738,41.2833,2.0667,4
4,1997-01-01,1024E,2.7,0.0,0.4,5.0,,,85.0,1.9,5.8,981.6,975.0,6.5,0.47,0.0172,0.9999,0.3717,0.9284,43.3,-2.0333,250


In [31]:
# Persist the merged feature table as CSV, without writing the row index.
train_df_combined.to_csv(
    path_or_buf='../../data/ml/historical_clean_nan.csv',
    index=False,
)