In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the validation and test time-series splits from CSV files under the
# relative 'data/' directory. Both reads use pandas' defaults: no dtype
# overrides and no date parsing are requested here.
df_val = pd.read_csv('data/validation_timeseries.csv')
df_test = pd.read_csv('data/test_timeseries.csv')


In [4]:
def process_dataframe(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
    """Fill gaps in ``score`` per ``fips`` series, bucket the result, and save it.

    Rows are sorted by ``fips`` and ``date``. Within each ``fips`` group the
    missing ``score`` values are linearly interpolated, and anything still
    missing is forward- and then backward-filled. The filled values are stored
    in ``score_interpolated`` and bucketed into the classes 0.0-5.0 in
    ``score_final_interpolated``. A 10-row preview and the class counts are
    printed, and the frame is written (without its index) to
    ``data/{df_name}_timeseries_interpolated.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fips``, ``date`` and ``score``. The caller's
        object is not modified: sorting returns a new frame and the new
        columns are added to that one.
    df_name : str
        Label inserted into the output file name.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with ``score_interpolated`` and
        ``score_final_interpolated`` added. Rows of a ``fips`` group that has
        no valid ``score`` at all stay NaN in both columns.

    Raises
    ------
    KeyError
        If ``fips``, ``date`` or ``score`` is missing from ``df``.
    OSError
        If the output file cannot be written (e.g. ``data/`` does not exist).
    """
    # Sort by 'fips' and 'date' so every group's rows are in time order.
    # NOTE(review): 'date' is sorted with whatever dtype it already has; this
    # assumes its values order chronologically (e.g. ISO-8601 strings) — confirm.
    df = df.sort_values(by=['fips', 'date'])

    # Linearly interpolate the gaps inside each fips series.
    # transform() returns a result carrying df's own index, so the assignment
    # is aligned row-for-row. The previous apply(...).reset_index(level=0,
    # drop=True) was only aligned when apply() prepended the group key to the
    # index; otherwise reset_index dropped the real index and the values were
    # matched to the wrong rows of the sorted frame.
    df['score_interpolated'] = df.groupby('fips')['score'].transform(
        lambda series: series.interpolate(method='linear')
    )

    # Forward fill inside each group: covers NaNs that follow a valid value
    # (e.g. at the end of a series). It cannot fill NaNs at the start.
    df['score_interpolated'] = df.groupby('fips')['score_interpolated'].ffill()

    # Backward fill inside each group: covers the remaining NaNs at the start
    # of a series, before its first valid value.
    df['score_interpolated'] = df.groupby('fips')['score_interpolated'].bfill()

    # Bucket edges sit halfway between the integer classes. pd.cut intervals
    # are closed on the right by default, so a value exactly on an edge
    # (e.g. 2.5) goes to the lower class (2.0).
    bins = [-np.inf, 0.5, 1.5, 2.5, 3.5, 4.5, np.inf]
    labels = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]

    # Bucket the filled scores, then convert the categorical result to plain floats.
    df['score_final_interpolated'] = pd.cut(
        df['score_interpolated'], bins=bins, labels=labels
    ).astype(float)

    # Show the first rows to sanity-check the result.
    print(df[['score_interpolated', 'score_final_interpolated']].head(10))

    # Show how many rows fall into each class.
    print(df['score_final_interpolated'].value_counts())

    # Save the processed frame as CSV without the index column.
    output_filename = f'data/{df_name}_timeseries_interpolated.csv'
    df.to_csv(output_filename, index=False)
    print(f'DataFrame guardado como {output_filename}')

    return df

In [5]:
# Interpolate and bucket the validation scores. df_val is rebound to the
# processed frame; process_dataframe also prints a preview and writes
# data/val_timeseries_interpolated.csv as a side effect.
df_val = process_dataframe(df_val, 'val')


   score_interpolated  score_final_interpolated
0                 2.0                       2.0
1                 2.0                       2.0
2                 2.0                       2.0
3                 2.0                       2.0
4                 2.0                       2.0
5                 2.0                       2.0
6                 2.0                       2.0
7                 2.0                       2.0
8                 2.0                       2.0
9                 2.0                       2.0
score_final_interpolated
0.0    1550895
1.0     395271
2.0     198997
3.0      85757
4.0      31618
5.0       6302
Name: count, dtype: int64
DataFrame guardado como data/val_timeseries_interpolated.csv


In [6]:
# Interpolate and bucket the test scores. df_test is rebound to the
# processed frame; process_dataframe also prints a preview and writes
# data/test_timeseries_interpolated.csv as a side effect.
df_test = process_dataframe(df_test, 'test')

   score_interpolated  score_final_interpolated
0                 0.0                       0.0
1                 0.0                       0.0
2                 0.0                       0.0
3                 0.0                       0.0
4                 0.0                       0.0
5                 0.0                       0.0
6                 0.0                       0.0
7                 0.0                       0.0
8                 0.0                       0.0
9                 0.0                       0.0
score_final_interpolated
0.0    1725225
1.0     297046
2.0     153411
3.0      66338
4.0      24450
5.0       5478
Name: count, dtype: int64
DataFrame guardado como data/test_timeseries_interpolated.csv
