# Exploración de performance modelos y feature importances con Pycaret

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from pycaret.classification import *

In [2]:
np.random.seed(666)

In [3]:
df_train = pd.read_csv('data/train_timeseries_interpolated.csv')
df_val = pd.read_csv('data/val_timeseries_interpolated.csv')
df_test = pd.read_csv('data/test_timeseries_interpolated.csv')

In [4]:
pd.set_option('display.max_rows', None)

In [5]:
df_train.shape

(5678316, 23)

In [6]:
df_val.shape

(2268840, 23)

In [7]:
df_test.shape

(2271948, 23)

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5678316 entries, 0 to 5678315
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   fips                      int64  
 1   date                      object 
 2   PRECTOT                   float64
 3   PS                        float64
 4   QV2M                      float64
 5   T2M                       float64
 6   T2MDEW                    float64
 7   T2MWET                    float64
 8   T2M_MAX                   float64
 9   T2M_MIN                   float64
 10  T2M_RANGE                 float64
 11  TS                        float64
 12  WS10M                     float64
 13  WS10M_MAX                 float64
 14  WS10M_MIN                 float64
 15  WS10M_RANGE               float64
 16  WS50M                     float64
 17  WS50M_MAX                 float64
 18  WS50M_MIN                 float64
 19  WS50M_RANGE               float64
 20  score                   

In [9]:
# Eliminar las columnas 'score' y 'score_interpolated'
columns_to_drop = ['score', 'score_interpolated']
df_train = df_train.drop(columns=columns_to_drop)
df_val = df_val.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [10]:
df_train['date'] = pd.to_datetime(df_train['date'])
df_val['date'] = pd.to_datetime(df_val['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [11]:
df_combined = pd.concat([df_train, df_val, df_test])


In [12]:
df_combined = df_combined.reset_index(drop=True)

In [13]:
df_combined.shape

(10219104, 21)

In [14]:
len(df_combined.fips.unique())

3108

In [15]:
fips_to_keep = np.random.choice(df_combined['fips'].unique(), size=500, replace=False)

df_combined = df_combined[df_combined['fips'].isin(fips_to_keep)]

In [16]:
df_combined.shape

(1644000, 21)

In [17]:
len(df_combined.fips.unique())

500

In [21]:
# Configurar PyCaret para modelar 'score_final_interpolated' con series temporales
clf = setup(data=df_combined, 
            target='score_final_interpolated', 
            fold_strategy='timeseries',  
            fold=5,  
            data_split_shuffle=False,
            fold_shuffle=False,
            data_split_stratify=False,  
            session_id=464831859) 

Unnamed: 0,Description,Value
0,Session id,464831859
1,Target,score_final_interpolated
2,Target type,Multiclass
3,Original data shape,"(1644000, 21)"
4,Transformed data shape,"(1644000, 23)"
5,Transformed train set shape,"(1150800, 23)"
6,Transformed test set shape,"(493200, 23)"
7,Numeric features,19
8,Date features,1
9,Preprocess,True


In [22]:
# Comparar modelos y seleccionar el mejor
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6341,0.7282,0.6341,0.5476,0.5623,0.1795,0.2091,17.254
lda,Linear Discriminant Analysis,0.6315,0.0,0.6315,0.4973,0.5121,0.0748,0.1261,1.386
ridge,Ridge Classifier,0.629,0.0,0.629,0.4615,0.4998,0.0425,0.0895,0.896
dummy,Dummy Classifier,0.6269,0.5,0.6269,0.3955,0.4843,0.0,0.0,0.808
lr,Logistic Regression,0.6224,0.0,0.6224,0.4552,0.5049,0.0572,0.093,40.428
ada,Ada Boost Classifier,0.6215,0.0,0.6215,0.4752,0.5058,0.0571,0.0964,18.83
rf,Random Forest Classifier,0.6177,0.705,0.6177,0.5362,0.5558,0.1685,0.1882,34.04
svm,SVM - Linear Kernel,0.5953,0.0,0.5953,0.4335,0.4895,0.0506,0.0582,89.432
qda,Quadratic Discriminant Analysis,0.5814,0.0,0.5814,0.5087,0.53,0.1357,0.1467,1.208
gbc,Gradient Boosting Classifier,0.579,0.0,0.579,0.5142,0.5138,0.1029,0.1237,458.336


In [29]:
print(best_model)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=464831859, verbose=0,
                     warm_start=False)


In [31]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…