# Stage B Quiz Submission

## Import Statements and file reading

In [96]:
# Import all base libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import machine learning tools.
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



# Load the dataset from a CSV file located in the working directory.
DATA_PATH = "energydata_complete.csv"
df = pd.read_csv(DATA_PATH)

## Solution to Question 12

In [97]:
# Use T2 as the single predictor and T6 as the response.
# The predictor is reshaped into a one-column 2-D array because there is only one feature.
x = df['T2'].to_numpy().reshape(-1, 1)
y = df['T6']

# Hold out 30% of the rows for evaluation and fit the model on the remainder.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
model = LinearRegression().fit(x_train, y_train)

# Score the held-out predictions with R squared, reported to 2dp.
y_pred = model.predict(x_test)
measure = r2_score(y_test, y_pred)

round(measure, 2)

0.65

## Solution to Question 13

In [98]:
# Remove the 'date' and 'lights' columns, then min-max scale every remaining column.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split
# below, so the test rows influence the scaling.
new_df = df.drop(columns=['date', 'lights'])
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(
    scaler.fit_transform(new_df),
    columns=new_df.columns,
)

# 'Appliances' is the target; every other scaled column is a feature.
target = normalized_df['Appliances']
features = normalized_df.drop(columns=['Appliances'])

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Fit the linear model on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [99]:
# Mean absolute error of the held-out predictions, rounded to 2dp.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
round(mae, 2)

0.05

## Solution to Question 14

In [100]:
# Implement a Residual Sum of Squares function (RSS).
def rss_score(y_true, y_pred):
    """Return the residual sum of squares between observed and predicted values.

    Both inputs are converted to NumPy arrays before subtracting, so the
    residuals are computed position by position. Without this, two pandas
    objects would be aligned on their index labels first, which silently
    pairs up the wrong rows (or yields NaN) when the indexes differ, e.g. a
    shuffled test split compared against freshly indexed predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    numpy scalar
        Sum of the squared differences ``(y_true - y_pred) ** 2``.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.sum(residuals ** 2)

# Residual sum of squares of the held-out predictions, rounded to 2dp.
rss = rss_score(y_true=y_test, y_pred=y_pred)
round(rss, 2)

45.35

## Solution to Question 15

In [101]:
# RMSE is the square root of the mean squared error of the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the value to 3dp.
round(rmse, 3)

0.088

## Solution to Question 16

In [102]:
# R squared (coefficient of determination) of the held-out predictions, rounded to 2dp.
rsquare = r2_score(y_true=y_test, y_pred=y_pred)
round(rsquare, 2)

0.15

## Solution to Question 17

In [103]:
# Label each learned coefficient with its feature name and order them ascending by weight.
weight_series = pd.Series(model.coef_, index=features.columns).sort_values()

# After sorting, the first label has the smallest weight and the last label the largest.
low = weight_series.index[0]
high = weight_series.index[-1]
print("Lowest weight: " + low)
print("Highest weight: " + high)


Lowest weight: RH_2
Highest weight: RH_1


## Solution to Question 18

In [104]:
# Train the ridge model and obtain predictions.
ridge_model = Ridge(alpha=0.4)
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Calculate the RMSE from the ridge model's own MSE. Using the linear model's
# `mse` here would make the comparison below trivially equal.
mse_ridge = mean_squared_error(y_test, y_pred)
rmse_ridge = np.sqrt(mse_ridge)

# Compare against the linear-regression RMSE (`rmse`, computed for Question 15)
# at the same 3dp precision it is reported at. A conditional is used instead of
# an assert so that a difference prints an answer rather than raising
# AssertionError.
# 'Yes' if there is a difference, 'No' if they are the same.
print("No" if round(rmse_ridge, 3) == round(rmse, 3) else "Yes")

No


## Solution to Question 19

In [105]:
# Fit the lasso regressor and label each learned coefficient with its feature name.
lasso_model = Lasso(alpha=0.001).fit(X_train, y_train)
weight_series = pd.Series(lasso_model.coef_, index=features.columns)

# Count the features whose weight is not exactly zero.
nonzero_mask = weight_series != 0
len(weight_series[nonzero_mask])

4

## Solution to Question 20

In [106]:
# Predict the held-out rows with the lasso model.
y_pred = lasso_model.predict(X_test)

# RMSE is the square root of the mean squared error of those predictions.
mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_lasso = np.sqrt(mse_lasso)

# Report the value to 3dp.
round(rmse_lasso, 3)

0.094