# Automatic Feature Engineering: Generación de nuevas variables y Feature Selection

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import featuretools as ft

import autofeat
from autofeat import FeatureSelector

%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_parquet('data/combined_timeseries_interpolated.parquet')

In [4]:
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

df = df.drop(columns=['date'])

In [5]:
df.shape

(1644000, 54)

## Vamos a agregar nuevas columnas, generadas a partir de operaciones locas entre ellas mismas.

In [6]:
# Limitamos la cantidad de primitivas para optimizar el rendimiento
transform_primitives = ['add_numeric', 'multiply_numeric']  # Primitivas limitadas
agg_primitives = ['mean', 'sum']  # Primitivas limitadas de agregación

In [7]:
# Se definen las columnas meteorológicas de interés + elevation
meteorological_columns = ['QV2M', 'WS10M', 'T2M', 'WS50M', 'TS', 'PS', 'T2MDEW', 'T2MWET', 'PRECTOT', 'elevation']


In [8]:
# Guardar una copia de las columnas no meteorológicas para conservarlas, excluyendo la columna target
non_meteorological_columns = df.columns.difference(meteorological_columns)

In [9]:
target_column = 'score_final_interpolated'

In [10]:
target_values = df[target_column].values

In [11]:
df_without_target = df.drop(columns=[target_column])

In [12]:
df_meteorological = df_without_target[meteorological_columns]

In [13]:
es = ft.EntitySet(id="meteorological_data")

In [14]:
es = es.add_dataframe(dataframe_name="df_meteorological", dataframe=df_meteorological, index="index", make_index=True)


In [15]:
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_dataframe_name="df_meteorological",
                                      trans_primitives=transform_primitives,
                                      agg_primitives=agg_primitives,
                                      max_depth=1,  # Reducción de la profundidad
                                      features_only=False)


  agg_primitives: ['mean', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


In [16]:
df_extended = feature_matrix.reset_index(drop=True)

In [17]:
df = pd.concat([df_without_target.drop(columns=meteorological_columns).reset_index(drop=True), df_extended], axis=1)

In [19]:
df[target_column] = target_values

In [20]:
df.shape

(1644000, 144)

## Ahora autoseleccionamos features

In [21]:
X = df.drop(columns=['score_final_interpolated'])
y = df['score_final_interpolated']

In [22]:
# dropeamos columnas de tipo datetime en X
X = X.select_dtypes(exclude=['datetime64'])

In [23]:
X.shape

(1644000, 143)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.3,random_state =0)

In [25]:
fsel = FeatureSelector(verbose=1)

In [26]:
new_X = fsel.fit_transform(pd.DataFrame(X), pd.DataFrame(y))

  y = column_or_1d(y, warn=True)


[featsel] Scaling data...

2024-09-24 22:08:49,435 INFO: [featsel] Feature selection run 1/5


done.


2024-09-24 22:15:58,588 INFO: [featsel] Feature selection run 2/5
2024-09-24 22:23:53,142 INFO: [featsel] Feature selection run 3/5
2024-09-24 22:32:47,665 INFO: [featsel] Feature selection run 4/5
2024-09-24 22:41:37,506 INFO: [featsel] Feature selection run 5/5
2024-09-24 22:49:45,282 INFO: [featsel] 95 features after 5 feature selection runs
2024-09-24 22:52:02,310 INFO: [featsel] 42 features after correlation filtering
2024-09-24 22:52:30,853 INFO: [featsel] 35 features after noise filtering


In [27]:
df.shape

(1644000, 144)

In [28]:
new_X.shape

(1644000, 35)

In [29]:
common_columns = set(df.columns).intersection(set(new_X.columns))
common_columns


{'CULT_LAND',
 'FOR_LAND',
 'GRS_LAND',
 'NVG_LAND',
 'PRECTOT * TS',
 'PRECTOT + T2M',
 'PS * TS',
 'PS + WS50M',
 'QV2M * WS10M',
 'SQ1',
 'SQ4',
 'SQ5',
 'SQ7',
 'T2M * TS',
 'T2M_RANGE',
 'TS * WS10M',
 'URB_LAND',
 'WS10M',
 'WS10M_RANGE',
 'WS50M_RANGE',
 'aspectE',
 'aspectN',
 'aspectS',
 'day',
 'elevation * PRECTOT',
 'elevation * T2MDEW',
 'elevation * TS',
 'elevation * WS50M',
 'fips',
 'lat',
 'lon',
 'month',
 'slope2',
 'slope7',
 'year'}

In [30]:
columns_to_keep = list(common_columns) + ['score_final_interpolated']

df_filtered = df[columns_to_keep]

In [31]:
df_filtered.shape

(1644000, 36)

In [32]:
df_filtered.to_parquet('data/df_filtered.parquet', index=False)