In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Importar PyCaret
from pycaret.classification import *

In [2]:
# Fijar la semilla para reproducibilidad
np.random.seed(666)

In [3]:
# leemos los arhivos de train, val y test
df_train = pd.read_csv('data/train_timeseries_interpolated.csv')
df_val = pd.read_csv('data/val_timeseries_interpolated.csv')
df_test = pd.read_csv('data/test_timeseries_interpolated.csv')

In [4]:
# opcion para que pandas muestre todo en un print
pd.set_option('display.max_rows', None)

In [5]:
df_train.shape

(5678316, 23)

In [6]:
df_val.shape

(2268840, 23)

In [7]:
df_test.shape

(2271948, 23)

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5678316 entries, 0 to 5678315
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   fips                      int64  
 1   date                      object 
 2   PRECTOT                   float64
 3   PS                        float64
 4   QV2M                      float64
 5   T2M                       float64
 6   T2MDEW                    float64
 7   T2MWET                    float64
 8   T2M_MAX                   float64
 9   T2M_MIN                   float64
 10  T2M_RANGE                 float64
 11  TS                        float64
 12  WS10M                     float64
 13  WS10M_MAX                 float64
 14  WS10M_MIN                 float64
 15  WS10M_RANGE               float64
 16  WS50M                     float64
 17  WS50M_MAX                 float64
 18  WS50M_MIN                 float64
 19  WS50M_RANGE               float64
 20  score                   

In [9]:
# Eliminar las columnas 'score' y 'score_interpolated'
columns_to_drop = ['score', 'score_interpolated']
df_train = df_train.drop(columns=columns_to_drop)
df_val = df_val.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [10]:
df_train['date'] = pd.to_datetime(df_train['date'])
df_val['date'] = pd.to_datetime(df_val['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [11]:
# Concatenar los datasets para utilizar PyCaret (opcional)
df_combined = pd.concat([df_train, df_val, df_test])


In [12]:
# Restablecer el índice a un RangeIndex para evitar duplicados
df_combined = df_combined.reset_index(drop=True)

In [13]:
df_combined.shape

(10219104, 21)

In [14]:
len(df_combined.fips.unique())

3108

In [15]:
# Seleccionar aleatoriamente 1000 fips únicos
fips_to_keep = np.random.choice(df_combined['fips'].unique(), size=1000, replace=False)

# Filtrar el DataFrame para que solo contenga los fips seleccionados
df_combined = df_combined[df_combined['fips'].isin(fips_to_keep)]

In [16]:
df_combined.shape

(3288000, 21)

In [17]:
len(df_combined.fips.unique())

1000

In [18]:
# Configurar PyCaret para modelar 'score_final_interpolated' con series temporales
clf = setup(data=df_combined, 
            target='score_final_interpolated', 
            fold_strategy='timeseries',  # Mantener la secuencia temporal
            fold=5,  # Número de divisiones (folds)
            data_split_shuffle=False,  # Evitar mezclar datos entre splits
            fold_shuffle=False,  # Evitar mezclar datos en folds
            data_split_stratify=False,  # No estratificar la división
            session_id=123)  # Fijar la semilla para reproducibilidad

KeyboardInterrupt: 

In [None]:
# Comparar modelos y seleccionar el mejor
best_model = compare_models()

In [None]:
# Entrenar el mejor modelo en el dataset completo
final_model = finalize_model(best_model)

In [None]:
# Predecir en el conjunto de test
predictions = predict_model(final_model, data=df_test)

In [None]:
# Mostrar las primeras predicciones
print(predictions.head())