In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global RNG so the random sampling below is reproducible.
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)

In [3]:
# Read the train, validation and test time-series files.
split_csv_paths = (
    'data/train_timeseries_interpolated.csv',
    'data/val_timeseries_interpolated.csv',
    'data/test_timeseries_interpolated.csv',
)
df_train, df_val, df_test = map(pd.read_csv, split_csv_paths)

In [4]:
# Load the soil data table.
soil_csv_path = 'data/soil_data.csv'
df_soil = pd.read_csv(soil_csv_path)

In [5]:
# Print the soil table's column names, dtypes and non-null counts.
df_soil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3109 entries, 0 to 3108
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fips           3109 non-null   int64  
 1   lat            3109 non-null   float64
 2   lon            3109 non-null   float64
 3   elevation      3109 non-null   int64  
 4   slope1         3109 non-null   float64
 5   slope2         3109 non-null   float64
 6   slope3         3109 non-null   float64
 7   slope4         3109 non-null   float64
 8   slope5         3109 non-null   float64
 9   slope6         3109 non-null   float64
 10  slope7         3109 non-null   float64
 11  slope8         3109 non-null   float64
 12  aspectN        3109 non-null   float64
 13  aspectE        3109 non-null   float64
 14  aspectS        3109 non-null   float64
 15  aspectW        3109 non-null   float64
 16  aspectUnknown  3109 non-null   float64
 17  WAT_LAND       3109 non-null   float64
 18  NVG_LAND

In [6]:
# Remove pandas' row-display limit so printed objects are shown in full.
pd.options.display.max_rows = None

In [7]:
# Remove the 'score' and 'score_interpolated' columns from every split.
columns_to_drop = ['score', 'score_interpolated']
df_train, df_val, df_test = (
    split_frame.drop(columns=columns_to_drop)
    for split_frame in (df_train, df_val, df_test)
)

In [8]:
# Convert the 'date' column of each split to pandas datetimes.
for split_frame in (df_train, df_val, df_test):
    split_frame['date'] = pd.to_datetime(split_frame['date'])

In [9]:
# Stack the three splits into one frame (optional step, intended for use with PyCaret).
split_frames = [df_train, df_val, df_test]
df_combined = pd.concat(split_frames)

In [10]:
# Replace the index with a fresh RangeIndex: concatenating the splits keeps each
# split's original labels, so the combined index contains duplicates.
df_combined = df_combined.reset_index(drop=True)

In [11]:
# Randomly pick N_FIPS_SAMPLE distinct fips codes (sampling without replacement).
# The draw uses NumPy's global RNG, so it is reproducible only because the seed
# is fixed earlier in the notebook.
N_FIPS_SAMPLE = 500
unique_fips = df_combined['fips'].unique()
fips_to_keep = np.random.choice(unique_fips, size=N_FIPS_SAMPLE, replace=False)

# Keep only the rows whose fips is among the sampled codes.
df_combined = df_combined[df_combined['fips'].isin(fips_to_keep)]

In [12]:
# (rows, columns) of the combined frame after restricting it to the sampled fips codes.
df_combined.shape

(1644000, 21)

In [13]:
# Number of distinct fips codes remaining after the filter.
df_combined['fips'].unique().size

500

In [14]:
# Left-join the soil attributes onto the combined time series, keyed by fips.
# validate='many_to_one' makes pandas raise MergeError if df_soil contains a
# duplicated fips, which would otherwise silently multiply time-series rows.
df_combined = df_combined.merge(
    df_soil,
    on='fips',
    how='left',
    validate='many_to_one',
)

In [15]:
# (rows, columns) after the soil join; a left join on a unique key keeps the row count unchanged.
df_combined.shape

(1644000, 52)

In [16]:
# Print column names, dtypes and non-null counts of the merged frame.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1644000 entries, 0 to 1643999
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   fips                      1644000 non-null  int64         
 1   date                      1644000 non-null  datetime64[ns]
 2   PRECTOT                   1644000 non-null  float64       
 3   PS                        1644000 non-null  float64       
 4   QV2M                      1644000 non-null  float64       
 5   T2M                       1644000 non-null  float64       
 6   T2MDEW                    1644000 non-null  float64       
 7   T2MWET                    1644000 non-null  float64       
 8   T2M_MAX                   1644000 non-null  float64       
 9   T2M_MIN                   1644000 non-null  float64       
 10  T2M_RANGE                 1644000 non-null  float64       
 11  TS                        1644000 non-null  float6

In [18]:
# Persist the merged frame in the data folder so later work can start from it.
combined_parquet_path = 'data/combined_timeseries_interpolated.parquet'
df_combined.to_parquet(combined_parquet_path, index=False)