In [39]:
import pandas as pd
import numpy as np
import holidays
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

In [2]:
# Load the raw dataset: the first CSV column becomes the index and the 'date'
# strings (day/month/year hour:minute) are parsed into datetimes.
df = pd.read_csv("../data/dataset.csv", index_col=0).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%d/%m/%Y %H:%M')
)

In [3]:
# Weekend flag: 1 when the weekday index is above 4 (Saturday/Sunday), else 0.
df['isWeekend'] = df['date'].dt.weekday.gt(4).astype('int64')

In [4]:
# Holiday flag: 1 when the timestamp falls on a day contained in the Slovak
# calendar provided by the `holidays` package, else 0.
slovakia_holidays = holidays.Slovakia()
df['isHoliday'] = df['date'].map(lambda moment: int(moment in slovakia_holidays))

In [5]:
# Row-wise totals over the individual production and consumption columns.
# NOTE(review): zeros in 'shops' and 'industrial_park' are replaced by model
# predictions further down; this sum is taken before that happens.
df['production_usage'] = df['fve'].add(df['wpg']).add(df['mve'])
df['consumption_usage'] = (
    df['shops']
    .add(df['office_building'])
    .add(df['industrial_park'])
)

In [6]:
# Weather observations; the 'Ambient Temperature (C)' and 'Wind Speed (m/s)'
# columns are the ones consumed later in this notebook.
# NOTE(review): the file name refers to Poland while the holiday calendar used
# in this notebook is Slovak - confirm this is the intended weather source.
weather_csv = "../data/poland_temperature.csv"
tmp = pd.read_csv(weather_csv)

In [17]:
def _repeat_rows(frame, column, times=4):
    """Repeat each value of ``frame[column]`` ``times`` times consecutively.

    The repeated series is wrapped in a DataFrame built with ``frame``'s column
    labels and given a fresh 0..n-1 index.
    """
    stretched = np.repeat(frame[column], times)
    return pd.DataFrame(stretched, columns=frame.columns).reset_index(drop=True)


# Every weather row is repeated four times before being attached to df.
# NOTE(review): the column assignment aligns on index labels, so this
# presumably relies on df having a 0..n-1 integer index and on each weather
# row covering four consecutive df rows - confirm.
temp = _repeat_rows(tmp, 'Ambient Temperature (C)')
wind = _repeat_rows(tmp, 'Wind Speed (m/s)')
df['temp'] = temp['Ambient Temperature (C)']
df['wind'] = wind['Wind Speed (m/s)']

In [21]:
# Calendar components, read through the vectorised .dt accessor instead of
# looping over every timestamp in Python.
moments = df['date']
df['hour'] = moments.dt.hour
df['day'] = moments.dt.day
df['month'] = moments.dt.month

# Hour of day mapped onto the unit circle (period: 24 hours). Only the hour
# component is used, so all readings within the same hour share one value.
hour_angle = df['hour'] * (2 * np.pi / 24)
df['day_cos'] = np.cos(hour_angle)
df['day_sin'] = np.sin(hour_angle)

# Seconds elapsed since 1970-01-01 00:00, computed on the naive datetimes as-is
# (the same value Timestamp.timestamp() yields for a timezone-naive Timestamp).
df['timestamp'] = (moments - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

# Position within a 365.25-day cycle mapped onto the unit circle. The columns
# keep their 'month_*' names although the period is one year.
s = 24 * 60 * 60
year = (365.25) * s
year_angle = df['timestamp'] * (2 * np.pi / year)
df['month_cos'] = np.cos(year_angle)
df['month_sin'] = np.sin(year_angle)

In [36]:
# Prices are stored as text with a decimal comma; swap it for a dot and cast
# to float. Going through str first keeps the cell safe to re-run once the
# column is already numeric and lets NaN pass through as NaN instead of
# raising AttributeError on a non-string value.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [57]:
# Impute 'shops': rows where the reading equals 0 are filled with predictions
# from an XGBoost regressor fitted on the rows with a non-zero reading.
X = ['price', 'isWeekend', 'isHoliday', 'temp', 'wind','day_cos','day_sin','month_cos', 'month_sin']
y = 'shops'

# Evaluate the mask once so training rows and imputed rows are exact complements.
shops_is_zero = df[y] == 0
shops_df = df[~shops_is_zero]
X_train, X_test, y_train, y_test = train_test_split(shops_df[X], shops_df[y], test_size=0.2,random_state=22)

# The scaler is fitted on the training split only and reused for every other input.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
xgb_model = XGBRegressor(objective="reg:squarederror", random_state=22)
xgb_model.fit(X_train, y_train)

shops_missing = df.loc[shops_is_zero, X]
# scaler.transform rejects an input with zero rows, so skip the fill when
# nothing is left to impute (e.g. when this cell is run a second time).
if not shops_missing.empty:
    X_missing = scaler.transform(shops_missing)
    df.loc[shops_is_zero, y] = xgb_model.predict(X_missing)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=22,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [61]:
# Impute 'industrial_park': rows where the reading equals 0 are filled with
# predictions from an XGBoost regressor fitted on the rows with a non-zero reading.
X = ['price', 'isWeekend', 'isHoliday', 'temp', 'wind','day_cos','day_sin','month_cos', 'month_sin']
y = 'industrial_park'

# Evaluate the mask once so training rows and imputed rows are exact complements.
industrial_is_zero = df[y] == 0
industrial_df = df[~industrial_is_zero]
X_train, X_test, y_train, y_test = train_test_split(industrial_df[X], industrial_df[y], test_size=0.2,random_state=22)

# The scaler is fitted on the training split only and reused for every other input.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
xgb_model = XGBRegressor(objective="reg:squarederror", random_state=22)
xgb_model.fit(X_train, y_train)

industrial_missing = df.loc[industrial_is_zero, X]
# scaler.transform rejects an input with zero rows, so skip the fill when
# nothing is left to impute (e.g. when this cell is run a second time).
if not industrial_missing.empty:
    X_missing = scaler.transform(industrial_missing)
    df.loc[industrial_is_zero, y] = xgb_model.predict(X_missing)

In [64]:
# 'consumption_usage' was first computed before the zero readings in 'shops'
# and 'industrial_park' were replaced by predictions; recompute it so the
# saved total matches the saved component columns.
df['consumption_usage'] = df['shops'] + df['office_building'] + df['industrial_park']

# Write the processed frame without the index column.
df.to_csv("../data/data_processed.csv",index=False)