In [2]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split



In [3]:
# Read the wind data CSV; TIMESTAMP is parsed into a datetime column on load.
df = pd.read_csv(
    '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv',
    parse_dates=['TIMESTAMP'],
)

# Print column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175440 entries, 0 to 175439
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ZONEID      175440 non-null  int64         
 1   TIMESTAMP   175440 non-null  datetime64[ns]
 2   TARGETVAR   175265 non-null  float64       
 3   U10         175440 non-null  float64       
 4   V10         175440 non-null  float64       
 5   U100        175440 non-null  float64       
 6   V100        175440 non-null  float64       
 7   HOUR        175440 non-null  int64         
 8   MONTH       175440 non-null  int64         
 9   WEEKDAY     175440 non-null  int64         
 10  IS_HOLIDAY  175440 non-null  int64         
 11  WS10        175440 non-null  float64       
 12  WS100       175440 non-null  float64       
 13  WD10        175440 non-null  float64       
 14  WD100       175440 non-null  float64       
 15  WD100CARD   175440 non-null  object        
 16  WD

In [4]:
# Replace missing TARGETVAR values with 0 so every row has a usable target.
# NOTE(review): this treats a missing reading as a true zero; confirm that is
# intended rather than dropping those rows before fitting.
df['TARGETVAR'] = df['TARGETVAR'].fillna(0)

In [5]:
# Column to predict.
target = 'TARGETVAR'

# Candidate predictors: every column of df except the target, the timestamp
# and the two WD*CARD columns.
features = [
    column
    for column in df.columns
    if column not in (target, 'TIMESTAMP', 'WD100CARD', 'WD10CARD')
]


In [6]:
# Design matrix restricted to the zone identifier and WS100; y is the target.
X = df[['ZONEID', 'WS100']]
y = df[target]

# Disabled experiment: one-hot encoding of calendar columns. As written it
# would need MONTH and WEEKDAY to be present in X, which they are not here.
# cat_var = ['MONTH','WEEKDAY']
# X = pd.get_dummies(X, columns=cat_var, drop_first=True)

# Random 75/25 split with a fixed seed for reproducibility.
# NOTE(review): rows carry a TIMESTAMP, so a random split may mix past and
# future observations; confirm this is acceptable for the use case.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Fit one univariate linear regression per column of X and record its RMSE on
# the held-out split.
features = [
    feat
    for feat in X.columns
    if feat not in (target, 'TIMESTAMP', 'WD100CARD', 'WD10CARD')
]

results = {}
for feat in features:
    # Double brackets keep a 2-D single-column frame for scikit-learn.
    X_train_feat = X_train[[feat]]
    X_test_feat = X_test[[feat]]
    model = LinearRegression()
    model.fit(X_train_feat, y_train)
    # Clamp predictions to [0, 1].
    # NOTE(review): presumably TARGETVAR is normalised to that range; confirm.
    y_pred = np.clip(model.predict(X_test_feat), 0, 1)
    # RMSE is computed explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    results[feat] = float(np.sqrt(mean_squared_error(y_test, y_pred)))

results
    

{'ZONEID': 0.3102655307998044, 'WS100': 0.19619795440927637}

In [8]:
# Train a WS100-only model on all zones, then score it on the zone-10 test
# rows only.
model = LinearRegression()
model.fit(X_train[['WS100']], y_train)

# Build the row mask once and reuse it for both features and targets so the
# two selections cannot drift apart.
zone_10_mask = X_test['ZONEID'] == 10
y_pred = np.clip(model.predict(X_test.loc[zone_10_mask, ['WS100']]), 0, 1)

# RMSE is computed explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test[zone_10_mask], y_pred)))
print(model.coef_)
print(model.intercept_)

0.25391238177041414
[0.085058]
-0.2002744156219452


In [9]:
# Fit a separate linear model per zone and collect the clipped test
# predictions together with the row labels they belong to.
zones = df.ZONEID.unique()

# Partition the existing train/test split by zone. The y masks are built from
# the matching X frame, which shares its index with y.
zone_X_train = {zone: X_train[X_train.ZONEID == zone] for zone in zones}
zone_X_test = {zone: X_test[X_test.ZONEID == zone] for zone in zones}
zone_y_train = {zone: y_train[X_train.ZONEID == zone] for zone in zones}
zone_y_test = {zone: y_test[X_test.ZONEID == zone] for zone in zones}

results = {}

# Per-zone chunks are gathered in lists and concatenated once after the loop.
pred_chunks = []
idx_chunks = []

for zone in zones:
    # ZONEID is constant inside each subset, so it carries no information for
    # the per-zone fit; it is left in to keep the feature set unchanged.
    model = LinearRegression()
    model.fit(zone_X_train[zone], zone_y_train[zone])
    y_pred = np.clip(model.predict(zone_X_test[zone]), 0, 1)
    # RMSE is computed explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    results["ZONE" + str(zone)] = float(
        np.sqrt(mean_squared_error(zone_y_test[zone], y_pred))
    )
    pred_chunks.append(y_pred)
    idx_chunks.append(zone_X_test[zone].index.to_numpy())

# The leading empty arrays keep the result dtypes (float / int) and make the
# concatenation valid even when there are no zones.
y_pred_all = np.concatenate([np.array([]), *pred_chunks], axis=0)
y_pred_idx = np.concatenate([np.array([], dtype='int'), *idx_chunks], axis=0)

y_pred_idx


array([    54,   3099,   6098, ..., 163683, 174903, 165033])

In [10]:
# Overall RMSE of the stacked per-zone predictions. y_pred_idx holds index
# labels, so the true values are selected with .loc to match the row order.
# The square root is taken explicitly because the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test.loc[y_pred_idx], y_pred_all)))

0.1828095251345052


In [11]:
# Overall MAE of the stacked per-zone predictions; the true values are
# reordered by label to line up with y_pred_all.
print(
    mean_absolute_error(
        y_test.loc[y_pred_idx],
        y_pred_all,
    )
)

0.13907925073665625


In [12]:
# Rebuild the design matrix from every candidate predictor in df: all columns
# except the target, the timestamp and the two WD*CARD columns.
target = 'TARGETVAR'
features = [
    column
    for column in df.columns
    if column not in (target, 'TIMESTAMP', 'WD100CARD', 'WD10CARD')
]

X = df[features]
y = df[target]

In [13]:
# Random 75/25 split of the current X and y with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
# Fit one univariate linear regression per column of X and record its RMSE on
# the held-out split, to rank the features individually.
features = [
    feat
    for feat in X.columns
    if feat not in (target, 'TIMESTAMP', 'WD100CARD', 'WD10CARD')
]

results = {}
for feat in features:
    # Double brackets keep a 2-D single-column frame for scikit-learn.
    X_train_feat = X_train[[feat]]
    X_test_feat = X_test[[feat]]
    model = LinearRegression()
    model.fit(X_train_feat, y_train)
    # Clamp predictions to [0, 1].
    # NOTE(review): presumably TARGETVAR is normalised to that range; confirm.
    y_pred = np.clip(model.predict(X_test_feat), 0, 1)
    # RMSE is computed explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    results[feat] = float(np.sqrt(mean_squared_error(y_test, y_pred)))

results

{'ZONEID': 0.3102655307998044,
 'U10': 0.29958429531983355,
 'V10': 0.30859623186672924,
 'U100': 0.3003220531664418,
 'V100': 0.3080778064016507,
 'HOUR': 0.3101277553237553,
 'MONTH': 0.3100492597067838,
 'WEEKDAY': 0.3104016265961158,
 'IS_HOLIDAY': 0.3103656397335789,
 'WS10': 0.2159201895420417,
 'WS100': 0.19619795440927637,
 'WD10': 0.30812739809190964,
 'WD100': 0.30806366989719,
 'U100NORM': 0.3070737945280548,
 'V100NORM': 0.30824982723319216}

In [15]:
# Fit a single linear model on every column of X across all zones and report
# its test RMSE together with the fitted coefficients and intercept.
model = LinearRegression()
model.fit(X_train, y_train)

# Clamp predictions to [0, 1].
# NOTE(review): presumably TARGETVAR is normalised to that range; confirm.
y_pred = np.clip(model.predict(X_test), 0, 1)

# RMSE is computed explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(model.coef_)
print(model.intercept_)

0.1941849751964938
[ 1.82954021e-03 -2.56563235e-02 -7.50898953e-03  9.50211906e-03
  9.43635343e-03  2.44325741e-03 -7.12067142e-04  4.02259288e-04
 -1.47794868e-03  2.88128404e-02  7.09351243e-02  3.69580024e-05
  4.71484870e-05  1.51601846e-02 -2.84648882e-02]
-0.2643533880903222


In [16]:
# Fit a separate linear model per zone on the current X, then report the RMSE
# and MAE of the stacked, clipped test predictions.
zones = df.ZONEID.unique()

# Partition the existing train/test split by zone. The y masks are built from
# the matching X frame, which shares its index with y.
zone_X_train = {zone: X_train[X_train.ZONEID == zone] for zone in zones}
zone_X_test = {zone: X_test[X_test.ZONEID == zone] for zone in zones}
zone_y_train = {zone: y_train[X_train.ZONEID == zone] for zone in zones}
zone_y_test = {zone: y_test[X_test.ZONEID == zone] for zone in zones}

results = {}

# Per-zone chunks are gathered in lists and concatenated once after the loop.
pred_chunks = []
idx_chunks = []

for zone in zones:
    # ZONEID is constant inside each subset, so it carries no information for
    # the per-zone fit; it is left in to keep the feature set unchanged.
    model = LinearRegression()
    model.fit(zone_X_train[zone], zone_y_train[zone])
    y_pred = np.clip(model.predict(zone_X_test[zone]), 0, 1)
    # RMSE is computed explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    results["ZONE" + str(zone)] = float(
        np.sqrt(mean_squared_error(zone_y_test[zone], y_pred))
    )
    pred_chunks.append(y_pred)
    idx_chunks.append(zone_X_test[zone].index.to_numpy())

# The leading empty arrays keep the result dtypes (float / int) and make the
# concatenation valid even when there are no zones.
y_pred_all = np.concatenate([np.array([]), *pred_chunks], axis=0)
y_pred_idx = np.concatenate([np.array([], dtype='int'), *idx_chunks], axis=0)

# y_pred_idx holds index labels, so .loc reorders the true values to match
# the order of y_pred_all.
y_true_all = y_test.loc[y_pred_idx]
print(np.sqrt(mean_squared_error(y_true_all, y_pred_all)))
print(mean_absolute_error(y_true_all, y_pred_all))

0.17570275736217736
0.13283009672435786


Unnamed: 0_level_0,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY,WS10,WS100,WD10,WD100,WD100CARD,WD10CARD,U100NORM,V100NORM
ZONEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
False,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440,175440
