In [28]:
import pandas as pd
import numpy as np
import time

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('../../data/ml/historical_full_nan.csv')
df = df.round(4)
df.head()

Unnamed: 0,fecha,idema,tmed,prec,tmin,tmax,hr_max,hr_min,hr_media,velmedia,racha,pres_max,pres_min,sol,year,fecha_sin,fecha_cos,dir_sin,dir_cos,latitud,longitud,altitud
0,1997-01-01,3110C,0.6,0.0,-0.5,1.7,99.0,84.0,93.0,0.3,3.9,891.3,885.6,,0.47,0.0172,0.9999,0.7346,0.6785,41.0,-3.6,1030
1,1997-01-01,1495,1.8,4.1,0.4,3.2,,,100.0,1.9,6.1,977.4,975.2,0.5,0.47,0.0172,0.9999,0.9096,0.4154,42.2333,-8.6167,255
2,1997-01-01,B691,10.0,6.0,4.0,16.0,,,84.0,0.8,6.1,,,2.3,0.47,0.0172,0.9999,0.2511,0.9679,39.7333,3.0,40
3,1997-01-01,0076,7.8,6.2,4.0,11.6,,,81.0,4.4,11.1,1012.0,1005.8,6.7,0.47,0.0172,0.9999,0.4862,0.8738,41.2833,2.0667,4
4,1997-01-01,1024E,2.7,0.0,0.4,5.0,,,85.0,1.9,5.8,981.6,975.0,6.5,0.47,0.0172,0.9999,0.3717,0.9284,43.3,-2.0333,250


In [4]:
nan_percentage = df.isna().mean() * 100
nan_percentage = nan_percentage[nan_percentage > 0].sort_values(ascending=False)
nan_percentage.apply(lambda x: f"{x:.2f}%")

sol         79.80%
pres_max    71.02%
pres_min    70.93%
hr_min      29.46%
racha       23.64%
hr_max      23.13%
dir_sin     23.01%
dir_cos     23.01%
velmedia    22.11%
hr_media    20.08%
tmin         5.80%
tmax         5.78%
prec         4.36%
tmed         3.83%
dtype: object

In [5]:
df = df.drop(columns=['sol', 'pres_max', 'pres_min', 'hr_max', 'hr_min'])

In [6]:
li_cols = ['tmed', 'tmin', 'tmax', 'prec', 'hr_media']
knn_cols = ['velmedia', 'racha', 'dir_sin', 'dir_cos']

In [7]:
df[li_cols] = df[li_cols].interpolate(method='linear', axis=0)

In [8]:
lookup = pd.read_csv('../../data/locations/estacion_distance_lookup.csv')

In [43]:
def impute_knn(df, k=4):
    imputed_df = df.copy()

    for fecha in imputed_df['fecha'].unique():
        fecha_data = imputed_df[imputed_df['fecha'] == fecha]

        for col in knn_cols:
            nan_mask = fecha_data[col].isna()

            for index, row in fecha_data[nan_mask].iterrows():
                nearest_idemas = lookup[row['idema']]
                nearest_values = fecha_data[fecha_data['idema'].isin(nearest_idemas)][col][:k]

                if not nearest_values.empty:
                    imputed_value = nearest_values.mean()
                    imputed_df.loc[index, col] = imputed_value

    return imputed_df

In [44]:
start = time.time()
imputed_df = impute_knn(df)
time.time() - start

4237.813260793686

In [46]:
imputed_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6026325 entries, 0 to 6026324
Data columns (total 17 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   fecha      6026325 non-null  object 
 1   idema      6026325 non-null  object 
 2   tmed       6026325 non-null  float64
 3   prec       6026325 non-null  float64
 4   tmin       6026325 non-null  float64
 5   tmax       6026325 non-null  float64
 6   hr_media   6026325 non-null  float64
 7   velmedia   6024154 non-null  float64
 8   racha      5982651 non-null  float64
 9   year       6026325 non-null  float64
 10  fecha_sin  6026325 non-null  float64
 11  fecha_cos  6026325 non-null  float64
 12  dir_sin    6023178 non-null  float64
 13  dir_cos    6023178 non-null  float64
 14  latitud    6026325 non-null  float64
 15  longitud   6026325 non-null  float64
 16  altitud    6026325 non-null  int64  
dtypes: float64(14), int64(1), object(2)
memory usage: 781.6+ MB


In [47]:
nan_percentage = imputed_df.isna().mean() * 100
nan_percentage = nan_percentage[nan_percentage > 0].sort_values(ascending=False)
nan_percentage.apply(lambda x: f"{x:.2f}%")

racha       0.72%
dir_sin     0.05%
dir_cos     0.05%
velmedia    0.04%
dtype: object

apply a final impute

In [50]:
reimputed_df = imputed_df.interpolate(method='linear', axis=0)

  reimputed_df = imputed_df.interpolate(method='linear', axis=0)


In [51]:
nan_percentage = reimputed_df.isna().mean() * 100
nan_percentage = nan_percentage[nan_percentage > 0].sort_values(ascending=False)
nan_percentage.apply(lambda x: f"{x:.2f}%")

Series([], dtype: float64)

In [53]:
reimputed_df.to_csv('../../data/ml/historical_full.csv', index=False)