In [5]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
from data_processing.data_processing import *

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [6]:
MAIN_PATH = Path(os.getcwd()).parent

## Load data

In [7]:
org_df = pd.read_csv(os.path.join(MAIN_PATH, "data", "data.csv"))
org_df = load(org_df)
org_df

Unnamed: 0,001fcx00211.pv,001fcx00221.pv,001fcx00231.pv,001fcx00241.pv,001fir01307.daca.pv,001fir01308.daca.pv,001fir01309.daca.pv,001fir01310.daca.pv,001fir01311.daca.pv,001fir01312.daca.pv,...,001uxm0rf02.daca.pv,001uxm0rf03.daca.pv,037tix00254.daca.pv,037tix00264.daca.pv,prazonka_fe,prazonka_s,prob_corg,prob_fe,prob_s,temp_zuz
2020-09-30 22:00:00+00:00,56.729077,54.724422,11.966905,22.493207,108.702362,112.280388,110.013796,108.761583,110.604901,104.500427,...,92.024078,92.405281,24.665309,24.526161,4.48,8.98,8.6,4.55,9.87,1297.0
2020-09-30 22:01:00+00:00,54.771942,54.734675,12.026410,22.593412,108.846612,112.334600,109.906645,108.879083,110.644046,104.371977,...,92.024206,92.405239,24.663982,24.526161,4.48,8.98,8.6,4.55,9.87,
2020-09-30 22:02:00+00:00,54.695816,54.774163,11.953671,22.428933,108.827640,112.362508,109.970932,108.844765,110.621665,104.387912,...,92.024334,92.405196,24.662656,24.526161,4.48,8.98,8.6,4.55,9.87,
2020-09-30 22:03:00+00:00,54.154394,54.985713,12.052065,22.335388,108.623473,112.386061,109.836948,108.717994,110.464245,104.418573,...,92.024462,92.405153,24.661329,24.526161,4.48,8.98,8.6,4.55,9.87,
2020-09-30 22:04:00+00:00,54.693184,54.490742,12.027310,22.412620,108.746784,112.396792,109.966910,108.978605,110.603660,104.464328,...,92.024590,92.405110,24.660003,24.525475,4.48,8.98,8.6,4.55,9.87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-31 22:55:00+00:00,57.882366,58.080133,10.056829,30.486496,101.111263,98.338021,102.332506,101.967828,100.651891,97.071359,...,98.121880,98.225985,24.460338,24.320295,6.18,8.89,7.8,6.65,12.47,
2022-01-31 22:56:00+00:00,57.953225,57.931008,9.935828,29.830075,101.034501,98.362465,102.332450,101.981035,100.575884,97.066000,...,98.121319,98.226027,24.464324,24.322539,6.18,8.89,7.8,6.65,12.47,
2022-01-31 22:57:00+00:00,58.186232,57.291012,10.014864,29.350514,101.251020,98.353221,102.312023,102.017297,100.621084,97.223686,...,98.120757,98.226070,24.468310,24.324782,6.18,8.89,7.8,6.65,12.47,
2022-01-31 22:58:00+00:00,59.016838,58.014325,10.048125,29.496899,101.073264,98.354741,102.336123,101.967655,100.596174,97.050953,...,98.120196,98.226113,24.472296,24.327025,6.18,8.89,7.8,6.65,12.47,


In [8]:
df = pd.DataFrame(index=org_df.index)

## Simple features based on correlation

In [9]:
# srednia po 'TEMP POD 2 WARSTWĄ WYMURÓWKI [°C]'
TIX1 = org_df.columns[org_df.columns.str.contains('001tix')].values
df['TIX1'] = org_df[TIX1].mean(axis='columns')

In [10]:
# srednia po 'WODY POWROTNE KOLEKTORÓW [°C]'
TIR = org_df.columns[org_df.columns.str.contains('tir')].values
df['TIR'] = org_df[TIR].mean(axis='columns')

In [11]:
# dodanie reg nadawy koncentratu
FCX = org_df.columns[org_df.columns.str.contains('fcx')].values
df[FCX] = org_df[FCX]

In [12]:
# sumaryczna moc cieplna
NIR = org_df.columns[org_df.columns.str.contains('nir')].values
df[NIR] = org_df[NIR]

In [13]:
# dodanie temp żużla
df["TEMP_ZUZ"] = org_df["temp_zuz"]

In [14]:
df = df.dropna(axis=0)
df.shape

(11377, 8)

In [15]:
df

Unnamed: 0,TIX1,TIR,001fcx00211.pv,001fcx00221.pv,001fcx00231.pv,001fcx00241.pv,001nir0szr0.daca.pv,TEMP_ZUZ
2020-09-30 22:00:00+00:00,418.990965,29.086390,56.729077,54.724422,11.966905,22.493207,14.362428,1297.0
2020-09-30 23:00:00+00:00,418.911544,29.000256,55.309363,55.327746,11.544775,22.702118,14.978720,1295.0
2020-10-01 00:00:00+00:00,418.820059,29.021386,54.703949,54.957514,11.053689,20.013150,15.199544,1303.0
2020-10-01 00:30:00+00:00,418.777284,29.064571,55.000000,55.224717,10.980221,19.902115,14.815123,1302.0
2020-10-01 01:00:00+00:00,418.735368,29.092727,55.886746,54.869892,11.057164,21.069357,14.617263,1303.0
...,...,...,...,...,...,...,...,...
2022-01-31 18:00:00+00:00,418.284737,27.694067,59.180029,58.900365,10.019018,32.147630,15.607105,1300.0
2022-01-31 19:00:00+00:00,418.138199,27.760225,58.895968,58.911688,10.013093,31.719701,14.846545,1305.0
2022-01-31 20:00:00+00:00,418.012402,27.761558,59.073501,59.593765,10.064700,30.158128,15.967954,1304.0
2022-01-31 21:00:00+00:00,417.939267,27.704117,57.774647,58.738514,9.989189,28.000376,16.147269,1308.0


## Split data

In [16]:
train, val, test = split(df)

In [17]:
X_train, X_val, X_test = train.drop(["TEMP_ZUZ"], axis=1), val.drop(["TEMP_ZUZ"], axis=1), test.drop(["TEMP_ZUZ"], axis=1)
y_train, y_val, y_test = train["TEMP_ZUZ"], val["TEMP_ZUZ"], test["TEMP_ZUZ"]

## Ridge Regression

In [18]:
ridge_model = Pipeline([('normalization', preprocessing.StandardScaler()),
                        ('ridge', linear_model.Ridge(alpha=1, solver='cholesky'))])
ridge_model.fit(X_train, y_train)

Pipeline(steps=[('normalization', StandardScaler()),
                ('ridge', Ridge(alpha=1, solver='cholesky'))])

In [19]:
metrics.mean_squared_error(y_val, ridge_model.predict(X_val))

74.98678833682321

## Polynomial Regression

In [20]:
poly_model = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=2)),
                      ('linear', linear_model.LinearRegression(fit_intercept=False))])

In [21]:
poly_model = poly_model.fit(X_train, y_train)

In [22]:
metrics.mean_squared_error(y_val, poly_model.predict(X_val))

68.24177515684886

## SGD Regressor

In [23]:
sgd_model1 = Pipeline([('normalization', preprocessing.StandardScaler()),
                      ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l2', learning_rate='adaptive'))])

In [24]:
sgd_model1 = sgd_model1.fit(X_train, y_train)

In [25]:
metrics.mean_squared_error(y_val, sgd_model1.predict(X_val))

75.02119569744599

In [26]:
sgd_model2 = Pipeline([('normalization', preprocessing.StandardScaler()),
                      ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l1', learning_rate='adaptive'))])
sgd_model2 = sgd_model2.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_model2.predict(X_val))

74.97402700988214

In [27]:
sgd_model3 = Pipeline([('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l1', learning_rate='adaptive'))])
sgd_model3 = sgd_model3.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_model3.predict(X_val))

1.0997429236078328e+18

## SGD + Poly Regression

In [28]:
sgd_poly_model1 = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=2)),
                            ('normalization', preprocessing.StandardScaler()),
                            ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l2', learning_rate='adaptive'))])
sgd_poly_model1 = sgd_poly_model1.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_poly_model1.predict(X_val))

66.92055933988249

In [29]:
sgd_poly_model2 = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=3)),
                            ('normalization', preprocessing.StandardScaler()),
                            ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l2', learning_rate='adaptive'))])
sgd_poly_model2 = sgd_poly_model1.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_poly_model1.predict(X_val))

70.3055041626263

In [30]:
sgd_poly_model2 = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=4)),
                            ('normalization', preprocessing.StandardScaler()),
                            ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l2', learning_rate='adaptive'))])
sgd_poly_model2 = sgd_poly_model1.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_poly_model1.predict(X_val))

69.68376229953977

## Poly + Ridge Regression

In [65]:
ridge_poly_model1 = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=3)),
                            ('normalization', preprocessing.StandardScaler()),
                            ('ridge', linear_model.Ridge(alpha=.5, solver='cholesky'))])
ridge_poly_model1 = ridge_poly_model1.fit(X_train, y_train)
metrics.mean_squared_error(y_val, ridge_poly_model1.predict(X_val))

63.81028513099203

# Using all features, without average

### Split data

In [31]:
all_df = org_df.dropna()
all_df

Unnamed: 0,001fcx00211.pv,001fcx00221.pv,001fcx00231.pv,001fcx00241.pv,001fir01307.daca.pv,001fir01308.daca.pv,001fir01309.daca.pv,001fir01310.daca.pv,001fir01311.daca.pv,001fir01312.daca.pv,...,001uxm0rf02.daca.pv,001uxm0rf03.daca.pv,037tix00254.daca.pv,037tix00264.daca.pv,prazonka_fe,prazonka_s,prob_corg,prob_fe,prob_s,temp_zuz
2020-09-30 22:00:00+00:00,56.729077,54.724422,11.966905,22.493207,108.702362,112.280388,110.013796,108.761583,110.604901,104.500427,...,92.024078,92.405281,24.665309,24.526161,4.48,8.98,8.60,4.55,9.87,1297.0
2020-09-30 23:00:00+00:00,55.309363,55.327746,11.544775,22.702118,108.378922,112.237887,109.862477,109.128895,110.520359,104.410075,...,92.033540,92.402722,24.585718,24.448561,4.48,8.98,8.60,4.55,9.87,1295.0
2020-10-01 00:00:00+00:00,54.703949,54.957514,11.053689,20.013150,109.861116,112.517942,110.237388,109.214988,110.434876,104.657074,...,92.056562,92.400504,24.564115,24.484372,4.48,8.98,8.60,4.55,9.87,1303.0
2020-10-01 00:30:00+00:00,55.000000,55.224717,10.980221,19.902115,109.585113,112.805842,110.151395,108.958777,110.694621,104.680675,...,92.068073,92.403063,24.610243,24.517869,4.48,8.98,8.60,4.55,9.87,1302.0
2020-10-01 01:00:00+00:00,55.886746,54.869892,11.057164,21.069357,109.757104,112.602264,110.165194,109.178997,110.596014,104.695973,...,92.076607,92.405623,24.605002,24.488603,4.30,7.84,8.61,4.58,9.74,1303.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-31 18:00:00+00:00,59.180029,58.900365,10.019018,32.147630,103.049852,98.548160,102.593229,102.306346,100.762990,97.234765,...,98.130381,98.257987,24.401280,24.226479,6.38,9.12,7.74,6.06,11.26,1300.0
2022-01-31 19:00:00+00:00,58.895968,58.911688,10.013093,31.719701,102.570421,98.549435,102.571353,102.108130,100.757045,97.183075,...,98.131138,98.234942,24.301374,24.145683,6.38,9.12,7.74,6.06,11.26,1305.0
2022-01-31 20:00:00+00:00,59.073501,59.593765,10.064700,30.158128,102.197750,98.577139,102.375469,102.194845,100.746604,97.138241,...,98.130569,98.218519,24.326187,24.222134,6.38,9.12,7.74,6.06,11.26,1304.0
2022-01-31 21:00:00+00:00,57.774647,58.738514,9.989189,28.000376,102.266821,98.477571,102.374577,102.107050,100.649014,97.129910,...,98.130687,98.221079,24.290438,24.144540,6.38,9.12,7.74,6.06,11.26,1308.0


In [32]:
train, val, test = split(all_df)

In [34]:
X_train, X_val, X_test = train.drop(["temp_zuz"], axis=1), val.drop(["temp_zuz"], axis=1), test.drop(["temp_zuz"], axis=1)
y_train, y_val, y_test = train["temp_zuz"], val["temp_zuz"], test["temp_zuz"]

### Polynomial Regression

In [35]:
poly_model_all = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=2)),
                          ('linear', linear_model.LinearRegression(fit_intercept=False))])
poly_model_all = poly_model_all.fit(X_train, y_train)
metrics.mean_squared_error(y_val, poly_model_all.predict(X_val))

3668115.416767741

### Poly + SGD Regression

In [37]:
sgd_poly_model_all = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=2)),
                            ('normalization', preprocessing.StandardScaler()),
                            ('sgd', linear_model.SGDRegressor(max_iter=100000, tol=1e-3, penalty='l2', learning_rate='adaptive'))])
sgd_poly_model_all = sgd_poly_model_all.fit(X_train, y_train)
metrics.mean_squared_error(y_val, sgd_poly_model_all.predict(X_val))

8.857688618013848e+22

Wniosek: zdecydowanie nie chcemy brać wszystkich feauterów