In [None]:
#Pulling a dataset from a csv file on a local drive
#The data records the energy output of a power plant given temperatures and other features

!pip install pandas --quiet
import pandas as pd
file_path = 'insert filepath here'

#Reading the data into a dataframe
powerplant_df = pd.read_csv('insert filepath here')
powerplant_df

In [None]:
#Reviewing the data types: column dtypes, non-null counts and memory usage
powerplant_df.info()

In [None]:
#Reviewing summary statistics of every numeric column (count, mean, std, min, quartiles, max)
powerplant_df.describe()

In [None]:
#Importing graphics libraries
!pip install plotly matplotlib seaborn --quiet
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Global plot styling: apply the seaborn grid theme first, then set the matplotlib
#defaults that every later static figure in this notebook picks up
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex colour with zero alpha, i.e. transparent
})


In [None]:
#Histogram for Average Temperatures
#The summary is printed explicitly: a bare expression that is not the last line of a
#cell is evaluated and then silently discarded, so it was never shown before.
print(powerplant_df.AT.describe())

fig = px.histogram(powerplant_df,
                   x='AT',
                   marginal='box',  # box plot drawn in the margin above the histogram
                   color_discrete_sequence=['red'],
                   title='Distribution of Average Temperature')
fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

                    

In [None]:
#Histogram for Vacuum values
#The summary is printed explicitly: a bare expression that is not the last line of a
#cell is evaluated and then silently discarded, so it was never shown before.
print(powerplant_df.V.describe())

fig = px.histogram(powerplant_df,
                   x='V',
                   marginal='box',  # box plot drawn in the margin above the histogram
                   color_discrete_sequence=['green'],
                   title='Distribution of Vacuum')
fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
#Histogram for Ambient Pressure values
#The summary is printed explicitly: a bare expression that is not the last line of a
#cell is evaluated and then silently discarded, so it was never shown before.
print(powerplant_df.AP.describe())

fig = px.histogram(powerplant_df,
                   x='AP',
                   marginal='box',  # box plot drawn in the margin above the histogram
                   color_discrete_sequence=['blue'],
                   title='Distribution of Ambient Pressure')
fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
#Histogram for Relative Humidity values
#The summary is printed explicitly: a bare expression that is not the last line of a
#cell is evaluated and then silently discarded, so it was never shown before.
print(powerplant_df.RH.describe())

fig = px.histogram(powerplant_df,
                   x='RH',
                   marginal='box',  # box plot drawn in the margin above the histogram
                   color_discrete_sequence=['purple'],
                   title='Distribution of Relative Humidity')
fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
#Histogram for Energy Output values
#The summary is printed explicitly: a bare expression that is not the last line of a
#cell is evaluated and then silently discarded, so it was never shown before.
print(powerplant_df.PE.describe())

fig = px.histogram(powerplant_df,
                   x='PE',
                   marginal='box',  # box plot drawn in the margin above the histogram
                   color_discrete_sequence=['orange'],
                   title='Distribution of Net Hourly Electrical Output')
fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
#Scatterplot: energy output (x-axis) against average temperature (y-axis);
#relative humidity is added to the hover tooltip of each point
fig = px.scatter(
    powerplant_df,
    x='PE',
    y='AT',
    hover_data=['RH'],
    opacity=0.8,
    title='Energy Output vs Temperature',
)
fig.update_traces(marker={'size': 5})
fig.show()

In [None]:
#Scatterplot Energy Output vs Vacuum
#Energy output on the x-axis, vacuum on the y-axis; relative humidity shown on hover
fig = px.scatter(powerplant_df,
                 x='PE',
                 y='V',
                 opacity=0.8,
                 hover_data=['RH'],
                 title='Energy Output vs Vacuum')  # title spelling corrected (was 'Vaccum')
fig.update_traces(marker_size=5)
fig.show()

In [None]:
#Scatterplot: energy output (x-axis) against relative humidity (y-axis);
#vacuum is added to the hover tooltip of each point
fig = px.scatter(
    powerplant_df,
    x='PE',
    y='RH',
    hover_data=['V'],
    opacity=0.8,
    title='Energy Output vs Humidity',
)
fig.update_traces(marker={'size': 5})
fig.show()

In [None]:
#Scatterplot: relative humidity (x-axis) against average temperature (y-axis);
#vacuum is added to the hover tooltip of each point
fig = px.scatter(
    powerplant_df,
    x='RH',
    y='AT',
    hover_data=['V'],
    opacity=0.8,
    title='Humidity vs Temperature',
)
fig.update_traces(marker={'size': 5})
fig.show()

In [None]:
#Reviewing correlations
powerplant_df.PE.corr(powerplant_df.AT)


In [None]:
powerplant_df.PE.corr(powerplant_df.V)


In [None]:
powerplant_df.corr()


In [None]:
#Summary heatmap of the pairwise correlation matrix, each cell annotated with its value
correlation_matrix = powerplant_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='Reds')
plt.title('Correlation Matrix');

In [None]:
#Approximating a linear model btw AT and PE manually (based on above scatterplot)
#ATs is the AT (temperature) column; it is the single input of the manual model
ATs = powerplant_df.AT

def estimate_output(AT, w, b):
    """Evaluate the straight-line model ``w*AT + b``.

    AT: input value(s); a scalar or any object supporting arithmetic with
        scalars (the notebook passes both a number and a pandas Series).
    w:  slope applied to AT.
    b:  intercept added after scaling.

    Returns the estimate, with the same shape/type that the arithmetic on
    AT produces.
    """
    scaled = w * AT
    return scaled + b

#Hand-picked slope and intercept for the manual model
w, b = -2.5, 505

#Manual estimate for every temperature in the dataset
estimated_output = estimate_output(ATs, w, b)

#Estimate on the x-axis and temperature on the y-axis, the same orientation as the
#Energy Output vs Temperature scatterplot
plt.plot(estimated_output, ATs, 'r-o')
plt.xlabel('Estimated Output')
plt.ylabel('Temperature');


In [None]:
#Generate estimated values based on manual model
#Single estimate for a temperature of 30, using the hand-picked w and b
estimate_output(30, w, b)

In [None]:
#Estimates computed earlier for every row of the dataset
estimated_output

In [None]:
#Plot manual model against the data: the red line is the manual estimate,
#the scattered points are the observed energy output
target = powerplant_df.PE

ax = plt.gca()
ax.plot(estimated_output, ATs, 'r', alpha=0.9)
ax.scatter(target, ATs, s=8, alpha=0.8)
ax.set_xlabel('Energy Output')
ax.set_ylabel('Temperature')
ax.legend(['Estimate', 'Actual']);


In [None]:
# Review fit manually
# targets: observed energy output (PE column); predictions: estimates of the manual model
targets = powerplant_df.PE
predictions = estimated_output
predictions

In [None]:
# Observed values, for comparison with the manual predictions
targets


In [None]:
#Install numpy and calculate RMSE

!pip install numpy ==quiet
import numpy as np
def rmse(targets, predictions):
    """Root-mean-square error between observed and predicted values.

    targets, predictions: array-likes that support element-wise subtraction
    with each other (NumPy arrays or pandas Series in this notebook).

    Returns the square root of the mean squared difference.
    """
    residuals = targets - predictions
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

#RMSE of the manual model's predictions against the observed energy output
rmse(targets, predictions)

In [None]:
#Install scikit-learn and fit linear regression with AT as only variable for ALL data

!pip install scikit-learn --quiet
from sklearn.linear_model import LinearRegression

model = LinearRegression()

help(model.fit)

inputs = powerplant_df[['AT']]
targets = powerplant_df.PE
print('inputs_shape: ', inputs.shape);
print('targets_shape: ', targets.shape);


In [None]:

#Check the container type of the model inputs
type(inputs)

In [None]:
#Fit the linear regression on the inputs and targets defined above
model.fit(inputs, targets)


In [None]:
#Predict the energy output for three example temperatures (30, 20 and 10).
#The model was fitted on a DataFrame with an 'AT' column, so the query is passed as a
#DataFrame with the same column name; a bare NumPy array makes scikit-learn warn that
#the input does not have valid feature names.
model.predict(pd.DataFrame({'AT': [30, 20, 10]}))

In [None]:
#Predictions of the fitted model for every row of the inputs it was trained on
predictions = model.predict(inputs)
predictions

In [None]:
inputs

In [None]:
targets


In [None]:
#RMSE of the fitted model, measured on the same data it was fitted on
rmse(targets, predictions)

In [None]:
#w: fitted coefficient(s) of the model, one per input column
model.coef_





In [None]:
#b: fitted intercept of the model
model.intercept_

In [None]:
#Use all variables to explain dependent variable with linear regression
#NOTE: model, inputs and targets are re-bound here and replace the single-feature versions
model = LinearRegression()

#Tip: run help(model.fit) interactively to read the fit() documentation; the call is
#not kept in the notebook because it floods the cell output with the full docstring.

inputs = powerplant_df[['AT','RH','V', 'AP']]
targets = powerplant_df.PE
print('inputs_shape: ', inputs.shape)
print('targets_shape: ', targets.shape)

In [None]:
#Fit the linear regression on the inputs and targets defined above
model.fit(inputs, targets)


In [None]:
#Predict on the same rows the model was fitted on and measure the RMSE
predictions = model.predict(inputs)

rmse(targets, predictions)

In [None]:
#w: fitted coefficients, one per input column, in the column order of inputs
model.coef_

In [None]:
#b: fitted intercept
model.intercept_

In [None]:
#Use train_test_split to hold out 20% of the rows and fit on the remaining 80%,
#so that predictive performance can be judged on data the model has not seen
from sklearn.model_selection import train_test_split

y = powerplant_df['PE']
X = powerplant_df[['AT', 'V', 'RH', 'AP']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

len(X_train)

In [None]:
#Second linear regression, to be fitted on the training split only
model_2 = LinearRegression()

#Tip: run help(model_2.fit) interactively to read the fit() documentation; the call is
#not kept in the notebook because it floods the cell output with the full docstring.

inputs_2 = X_train
targets_2 = y_train
print('inputs_2shape: ', inputs_2.shape)
print('targets_2shape: ', targets_2.shape)

In [None]:
#Fit on the training split
model_2.fit(inputs_2, targets_2)

In [None]:
#RMSE on the training split itself (not on held-out data)
predictions_2 = model_2.predict(inputs_2)

rmse(targets_2, predictions_2)

In [None]:
#Display the training-split predictions
model_2.predict(inputs_2)


In [None]:
inputs_3 = X_test
targets_3 = y_test

predictions_3 = model_2.predict(inputs_3)
mean_absolute_error(predictions_3, targets_3)


In [None]:
model_2.score(x_test, y_test)

In [None]:
r2_score(predictions_3, targets_3)

In [None]:
#Comparing linear regressions using MSE: 80% of the data vs ALL training data
#Both values are measured on the rows each model was fitted on

from sklearn.metrics import mean_squared_error

#Model fitted on the 80% training split, scored on that same split
mse_2 = mean_squared_error(targets_2, predictions_2)
print("Mean Squared Error 2:", mse_2)

In [None]:
#targets/predictions here are the ones from the all-features model fitted on all rows
mse = mean_squared_error(targets, predictions)
print("Mean Squared Error:", mse)

In [None]:
powerdata = powerplant_df

In [None]:
powerdata.head(10)

In [None]:
#Set up data for Random Forest Regression

X = powerdata.drop(['PE'], axis=1)

In [None]:
y = powerdata['PE']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=19)

In [None]:
#Import and run the Random Forest package

from sklearn.ensemble import RandomForestRegressor

In [None]:
#Fixed random_state so that repeated runs build the same forest
rfr = RandomForestRegressor(random_state=13)

In [None]:
#Fit on the training split
rfr.fit(X_train, y_train)

In [None]:
#Predict on the held-out test split
y_predictions = rfr.predict(X_test)

In [None]:
#Import metrics and run MSE to compare to Linear Regression

from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
mean_absolute_error(y_predictions, y_test)



In [None]:
mean_squared_error(y_predictions, y_test)
#MSE is lower for Random Forest than for Linear Regression as expected

In [None]:
r2_score(y_predictions, y_test)
#As compared to the Linear Regression score of 93%, r2 for Random Forest is 95%

In [None]:
#The r2 score for the Random Forest Regression is 0.025 higher than for the Linear Regression.

#In both cases, the evaluation is made on test data, not the training data set.
