In [1]:
# Step 1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as OLS
from sklearn.metrics import mean_squared_error
import numpy as np
import helpers.activity_901_helper as helper


In [2]:
# Step 2
power_data = pd.read_csv('../data/power_plant.csv')
print(power_data.shape)
power_data.head(5)

(9568, 5)


Unnamed: 0,AT,V,AP,RH,EP
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


In [3]:
# Step 3
seed_int = 42
train_f = 0.8
val_f = 0.1
test_f = 0.1

labels = [
    'Train',
    'Value',
    'Test',
]

rows = [helper.create_rand_series(index=list(power_data.index), raw_data=power_data, fraction=train_f, seed=seed_int)]
rows.append(helper.create_rand_series(index=list(power_data.drop(rows[0], axis=0).index), raw_data=power_data, fraction=val_f, seed=seed_int))
rows.append(pd.Series(power_data.drop(pd.concat(rows), axis=0).index))

rand_data = [] # Rand
for i in range(len(rows)):
    rand_data.append(power_data.iloc[rows[i], :])
    helper.print_shape(labels[i], rand_data[i].shape)


Train: (7654, 5)
Value: (956, 5)
Test: (958, 5)


In [4]:
# Step 4
tts_data = [] # train test split data
for split_data in train_test_split(power_data, train_size=train_f, random_state=seed_int):
        tts_data.append(split_data)

vd = tts_data[1] # grabbing the value data as it got processed this round
tts_data.pop(1) # removing the extra value since we'll be changing it up after the next round

for split_data in train_test_split(vd, train_size=.5, random_state=seed_int):

    tts_data.append(split_data)

for i in range(len(labels)):
    print(len(tts_data[i]))
    helper.print_shape(labels[i], tts_data[i].shape)

7654
Train: (7654, 5)
957
Value: (957, 5)
957
Test: (957, 5)


In [5]:
# Step 5
totals = [0,0]
for i in range(len(rand_data)):
    totals[0] += len(rand_data[i])
    totals[1] += len(tts_data[i])

if totals[0] != totals[1]:
    raise Exception('The values were not equal!')
else:
    print('The values were equal!')


The values were equal!


In [6]:
# Step 6
scaler = StandardScaler()
scaler.fit(rand_data[0].iloc[:, :-1])
scaler_data_x = []
scaler_data_y = []
for data in rand_data:
    scaler_data_x.append(scaler.transform(data.iloc[:, :-1]))
    scaler_data_y.append(data['EP'])


In [7]:
# Step 7
linear_model = OLS()
linear_model.fit(scaler_data_x[0], scaler_data_y[0])

In [8]:
# Step 8
for i in range(len(scaler_data_x)):
    sx = scaler_data_x[i]
    sy = scaler_data_y[i]
    print(f'{labels[i]} Score: {linear_model.score(sx, sy)}')
    print(f'{labels[i]} RMSE: {mean_squared_error(linear_model.predict(sx), sy)}')

Train Score: 0.9287072840354756
Train RMSE: 20.732519659228682
Value Score: 0.9242319850572253
Value RMSE: 21.78122371621954
Test Score: 0.9324697715061053
Test RMSE: 20.060589209291425
