In [1]:
import lightgbm as lgb
from catboost import CatBoostRegressor
import os, glob
import json
import folium
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error
from math import sqrt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler



import warnings

# Silence only statsmodels' ValueWarning category, which is what this cell was
# meant to do. A blanket warnings.filterwarnings("ignore") would also hide
# warnings raised by pandas, scikit-learn and the other libraries used in this
# notebook, which can mask real data problems during preprocessing.
from statsmodels.tools.sm_exceptions import ValueWarning

warnings.filterwarnings("ignore", category=ValueWarning)

In [5]:
# Shell out to list the entries under /kaggle/input/ so the available input
# directories are visible before any path is used.
!ls /kaggle/input/

train-lag1


In [15]:
# Interpolated into the name of the CSV written at the end of the notebook
# (train_lag{ndays}_pre_processing.csv).
# NOTE(review): presumably the lag horizon in days; the input path read by the
# loading cell hard-codes "lag1" -- confirm the two are meant to stay in sync.
ndays=1

In [8]:
# Location of the lag-feature training table inside the Kaggle input mount.
train_features_path = '/kaggle/input/train-lag1/train_lag1.csv'

# Read the table with pandas' default parsing and show a preview of it.
df_train_features = pd.read_csv(train_features_path)
df_train_features

In [9]:
# Display the loaded frame again; the rendered output is pandas' truncated
# preview (first and last rows, middle columns elided).
df_train_features

Unnamed: 0,row_id,county,is_business,product_type,is_consumption,prediction_unit_id,date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,...,day,weekday,month,year,segment,sin(dayofyear),cos(dayofyear),sin(hour),cos(hour),target
0,366048,0,0,1,0,0,2022-01-01,80.0,94.0,2021-12-30,...,1,6,1,2022,0_0_1_0,0.017166,0.999853,0.000000,1.000000,0.000
1,366049,0,0,1,1,0,2022-01-01,80.0,94.0,2021-12-30,...,1,6,1,2022,0_0_1_1,0.017166,0.999853,0.000000,1.000000,442.226
2,366050,0,0,2,0,1,2022-01-01,80.0,94.0,2021-12-30,...,1,6,1,2022,0_0_2_0,0.017166,0.999853,0.000000,1.000000,0.000
3,366051,0,0,2,1,1,2022-01-01,80.0,94.0,2021-12-30,...,1,6,1,2022,0_0_2_1,0.017166,0.999853,0.000000,1.000000,44.899
4,366052,0,0,3,0,2,2022-01-01,80.0,94.0,2021-12-30,...,1,6,1,2022,0_0_3_0,0.017166,0.999853,0.000000,1.000000,0.015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651897,2018347,15,1,0,1,64,2023-05-31,29.0,34.0,2023-05-29,...,31,3,5,2023,15_1_0_1,0.522133,-0.852864,-0.258819,0.965926,197.233
1651898,2018348,15,1,1,0,59,2023-05-31,29.0,34.0,2023-05-29,...,31,3,5,2023,15_1_1_0,0.522133,-0.852864,-0.258819,0.965926,0.000
1651899,2018349,15,1,1,1,59,2023-05-31,29.0,34.0,2023-05-29,...,31,3,5,2023,15_1_1_1,0.522133,-0.852864,-0.258819,0.965926,28.404
1651900,2018350,15,1,3,0,60,2023-05-31,29.0,34.0,2023-05-29,...,31,3,5,2023,15_1_3_0,0.522133,-0.852864,-0.258819,0.965926,0.000


In [10]:
# Names of every numeric-dtype column, in frame order. Computed once and
# reused for both the slice and its bound.
all_numeric = list(df_train_features.select_dtypes('number'))

# Keep everything except the trailing 10 numeric columns, then add the target
# back so it is part of the list.
# NOTE(review): the trailing columns look like the calendar / cyclical features
# plus `target` seen at the right edge of the frame preview -- confirm, since
# this is a purely positional cut.
numeric_columns = all_numeric[0:len(all_numeric) - 10]
numeric_columns.append('target')

# These three are handled as categorical (one-hot encoded) by the following
# cells, so they must not stay in the numeric list. list.remove raises
# ValueError if a name is missing.
for categorical_name in ('data_block_id', 'hours_ahead', 'hours_ahead_fl_7d'):
    numeric_columns.remove(categorical_name)

numeric_columns

['row_id',
 'county',
 'is_business',
 'product_type',
 'is_consumption',
 'prediction_unit_id',
 'lowest_price_per_mwh',
 'highest_price_per_mwh',
 'eic_count',
 'installed_capacity',
 'euros_per_mwh',
 'data_block_id_right',
 'temperature',
 'dewpoint',
 'cloudcover_high',
 'cloudcover_low',
 'cloudcover_mid',
 'cloudcover_total',
 '10_metre_u_wind_component',
 '10_metre_v_wind_component',
 'direct_solar_radiation',
 'surface_solar_radiation_downwards',
 'snowfall',
 'total_precipitation',
 'hours_ahead_fl',
 'temperature_fl',
 'dewpoint_fl',
 'cloudcover_high_fl',
 'cloudcover_low_fl',
 'cloudcover_mid_fl',
 'cloudcover_total_fl',
 '10_metre_u_wind_component_fl',
 '10_metre_v_wind_component_fl',
 'direct_solar_radiation_fl',
 'surface_solar_radiation_downwards_fl',
 'snowfall_fl',
 'total_precipitation_fl',
 'hours_ahead_fd_7d',
 'temperature_fd_7d',
 'dewpoint_fd_7d',
 'cloudcover_high_fd_7d',
 'cloudcover_low_fd_7d',
 'cloudcover_mid_fd_7d',
 'cloudcover_total_fd_7d',
 '10_metre_u

In [11]:
# Columns to one-hot encode: any column whose dtype is 'category', followed by
# three numeric columns that are treated as categorical.
# NOTE(review): the frame comes straight from pd.read_csv with no dtype
# overrides, so the 'category' selection is presumably empty and only the three
# explicit names end up in the list -- confirm that is intended.
category_columns = [
    *df_train_features.select_dtypes('category'),
    'data_block_id',
    'hours_ahead',
    'hours_ahead_fl_7d',
]

In [12]:
# Not referenced by any cell visible here; kept so that code relying on the
# name still finds it.
label_encoders = {}

# Dense (non-sparse) one-hot encoding of the categorical columns.
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df_train_features[category_columns])

# Give the encoded block the same index as the source frame. pd.concat with
# axis=1 aligns on index labels, so a fresh RangeIndex here would produce
# NaN-padded, misaligned rows whenever df_train_features does not carry a
# default 0..n-1 index (e.g. after filtering or sorting).
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(category_columns),
    index=df_train_features.index,
)

# Append the indicator columns, then drop the original categorical columns.
# This rebinds df_train_features, so re-running the cell without reloading the
# data raises a KeyError because the source columns are gone.
df_train_features = pd.concat([df_train_features, one_hot_df], axis=1)
df_train_features = df_train_features.drop(category_columns, axis=1)





In [13]:
# Standardise the numeric columns: the scaler is fitted on the selected columns
# and the transformed values are written back into the same frame.
# numeric_columns includes 'target' (appended when the list was built), so the
# target is standardised together with the features.
# NOTE(review): `scaler` is the only record of the fitted statistics; confirm
# it is kept wherever predictions need to be mapped back to original units.
scaler = StandardScaler()
numeric_block = df_train_features[numeric_columns]
df_train_features[numeric_columns] = scaler.fit_transform(numeric_block)
del numeric_block  # drop the extra reference to the unscaled copy

df_train_features

Unnamed: 0,row_id,county,is_business,product_type,is_consumption,prediction_unit_id,date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,...,hours_ahead_fl_7d_39.0,hours_ahead_fl_7d_40.0,hours_ahead_fl_7d_41.0,hours_ahead_fl_7d_42.0,hours_ahead_fl_7d_43.0,hours_ahead_fl_7d_44.0,hours_ahead_fl_7d_45.0,hours_ahead_fl_7d_46.0,hours_ahead_fl_7d_47.0,hours_ahead_fl_7d_nan
0,-1.732023,-1.523235,-1.088528,-0.814026,-1.0,-1.690103,2022-01-01,-0.423582,-0.380074,2021-12-30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.732021,-1.523235,-1.088528,-0.814026,1.0,-1.690103,2022-01-01,-0.423582,-0.380074,2021-12-30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.732019,-1.523235,-1.088528,0.107148,-1.0,-1.639614,2022-01-01,-0.423582,-0.380074,2021-12-30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.732017,-1.523235,-1.088528,0.107148,1.0,-1.639614,2022-01-01,-0.423582,-0.380074,2021-12-30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.732015,-1.523235,-1.088528,1.028323,-1.0,-1.589125,2022-01-01,-0.423582,-0.380074,2021-12-30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651897,1.732080,1.607459,0.918672,-1.735201,1.0,1.541174,2023-05-31,-1.439088,-1.427649,2023-05-29,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1651898,1.732082,1.607459,0.918672,-0.814026,-1.0,1.288731,2023-05-31,-1.439088,-1.427649,2023-05-29,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1651899,1.732084,1.607459,0.918672,-0.814026,1.0,1.288731,2023-05-31,-1.439088,-1.427649,2023-05-29,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1651900,1.732086,1.607459,0.918672,1.028323,-1.0,1.339220,2023-05-31,-1.439088,-1.427649,2023-05-29,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Write the encoded and scaled frame to the working directory; the file name
# carries the ndays value.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# confirm downstream readers expect that.
preprocessed_csv_path = f'train_lag{ndays}_pre_processing.csv'
df_train_features.to_csv(preprocessed_csv_path)