In [1]:
import pandas as pd
import numpy as np
import glob
import sys
sys.path.append('../..')
from helpers import lookups

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
dfs = [pd.read_csv(file, parse_dates=['fecha']) for file in glob.glob('../../data/historical/historico/*.csv')]

In [5]:
df = pd.concat(dfs, ignore_index=True)
df = df.sort_values(by='fecha').reset_index(drop=True)

In [6]:
nan_percentage = df.isna().mean() * 100
nan_percentage = nan_percentage[nan_percentage > 0].sort_values(ascending=False)
nan_percentage.apply(lambda x: f"{x:.2f}%")

sol            82.91%
pres_max       75.85%
pres_min       75.61%
hora_hr_max    39.38%
hora_racha     25.58%
dir            20.79%
racha          20.79%
velmedia       20.14%
hora_hr_min    16.05%
hora_tmin      10.38%
hr_max          5.90%
hr_min          5.90%
hora_tmax       5.82%
hr_media        5.60%
prec            3.64%
tmed            2.27%
tmin            2.24%
tmax            2.23%
dtype: object

In [None]:
import sys
sys.path.append('..')
from helpers import lookups


def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df_clean = df.copy()
    
    df_clean = df_clean.drop(columns=lookups.time_cols)
    
    df_clean = df_clean.merge(lookups.locations_df[['idema', 'latitud', 'longitud', 'altitud']], on='idema', how='left')
    df_clean['fecha_day'] = df_clean['fecha'].dt.dayofyear
    
    df_clean['fecha_sin'] = df_clean.apply(
        lambda row: (np.sin(2 * np.pi * row['fecha_day'] / 366) + 1) / 2 if row['year'] % 4 == 0 
        else (np.sin(2 * np.pi * row['fecha_day'] / 365) + 1) / 2, axis=1)
    
    df_clean['fecha_cos'] = df_clean.apply(
        lambda row: (np.cos(2 * np.pi * row['fecha_day'] / 366) + 1) / 2 if row['year'] % 4 == 0 
        else (np.cos(2 * np.pi * row['fecha_day'] / 365) + 1) / 2, axis=1)

    numeric_cols = df_clean.drop(columns=['fecha', 'idema']).columns
    df_clean[numeric_cols] = df_clean[numeric_cols].apply(pd.to_numeric, errors='coerce')

    

In [7]:
df_clean = df.copy()

In [8]:
df_clean = df_clean.drop(columns=lookups.time_cols)

In [9]:
df_clean = df_clean.merge(lookups.locations_df[['idema', 'latitud', 'longitud', 'altitud']], on='idema', how='left')

In [10]:
df_clean['year'] = df_clean['fecha'].dt.year
df_clean['year'] = (df_clean['year'] - 1950) / 100

In [11]:
df_clean['fecha_day'] = df_clean['fecha'].dt.dayofyear

In [12]:
df_clean['fecha_sin'] = df_clean.apply(
    lambda row: (np.sin(2 * np.pi * row['fecha_day'] / 366) + 1) / 2 if row['year'] % 4 == 0 
    else (np.sin(2 * np.pi * row['fecha_day'] / 365) + 1) / 2, axis=1)

In [13]:
df_clean['fecha_cos'] = df_clean.apply(
    lambda row: (np.cos(2 * np.pi * row['fecha_day'] / 366) + 1) / 2 if row['year'] % 4 == 0 
    else (np.cos(2 * np.pi * row['fecha_day'] / 365) + 1) / 2, axis=1)

In [14]:
numeric_cols = df_clean.drop(columns=['fecha', 'idema']).columns
df_clean[numeric_cols] = df_clean[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [15]:
df_clean['dir_sin'] = df_clean.apply(lambda row: np.sin(2 * np.pi * row['dir'] / 99), axis=1)
df_clean['dir_cos'] = df_clean.apply(lambda row: np.cos(2 * np.pi * row['dir'] / 99), axis=1)

In [16]:
df_clean = df_clean.drop(columns=['dir', 'fecha_day'])

In [17]:
df_clean = df_clean.round(4)

In [18]:
df_clean.head()

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_min,hr_media,velmedia,racha,pres_max,pres_min,sol,latitud,longitud,altitud,year,fecha_sin,fecha_cos,dir_sin,dir_cos
0,2010-01-01,2331,2.7,3.5,0.8,4.6,97.0,80.0,94.0,3.1,15.0,911.6,899.4,0.3,42.35,-3.6167,891,0.6,0.5086,0.9999,0.9938,0.1108
1,2010-01-01,0320I,3.4,0.0,0.5,6.4,95.0,41.0,57.0,6.1,18.1,,,,42.3167,2.1,1151,0.6,0.5086,0.9999,0.9938,0.1108
2,2010-01-01,C428T,16.2,0.0,13.7,18.8,77.0,58.0,69.0,2.5,6.7,,,,28.1667,-16.4667,418,0.6,0.5086,0.9999,0.4298,0.9029
3,2010-01-01,C239N,17.1,0.0,12.0,22.2,86.0,58.0,69.0,4.4,10.0,,,,28.2,-14.0167,1,0.6,0.5086,0.9999,0.6428,0.766
4,2010-01-01,B158X,13.0,0.0,11.2,14.8,70.0,53.0,59.0,7.2,23.9,,,,39.55,2.45,50,0.6,0.5086,0.9999,0.9999,-0.0159


In [19]:
df_clean.to_csv('../../data/ml/clean.csv', index=False)