In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import klib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

**Attribute Information:**

Date, time year-month-day hour:minute:second

Appliances, energy use in Wh 

lights, energy use of light fixtures in the house in Wh

T1, Temperature in kitchen area, in Celsius

RH_1, Humidity in kitchen area, in %

T2, Temperature in living room area, in Celsius

RH_2, Humidity in living room area, in %

T3, Temperature in laundry room area

RH_3, Humidity in laundry room area, in %

T4, Temperature in office room, in Celsius

RH_4, Humidity in office room, in %

T5, Temperature in bathroom, in Celsius

RH_5, Humidity in bathroom, in %

T6, Temperature outside the building (north side), in Celsius

RH_6, Humidity outside the building (north side), in %

T7, Temperature in ironing room , in Celsius

RH_7, Humidity in ironing room, in %

T8, Temperature in teenager room 2, in Celsius

RH_8, Humidity in teenager room 2, in %

T9, Temperature in parents room, in Celsius

RH_9, Humidity in parents room, in %

To, Temperature outside (from Chievres weather station), in Celsius

Pressure (from Chievres weather station), in mm Hg

RH_out, Humidity outside (from Chievres weather station), in %

Wind speed (from Chievres weather station), in m/s

Visibility (from Chievres weather station), in km

Tdewpoint (from Chievres weather station), Â°C

rv1, Random variable 1, nondimensional

rv2, Random variable 2, nondimensional

In [2]:
d = pd.read_csv('energydata_complete.csv')
d

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [3]:
dic = {'T1':'t_kitchen','RH_1':'rh_kitchen','T2':'t_living','RH_2':'rh_living','T3':'t_laundry','RH_3':'rh_laundry',
       'T4':'t_office','RH_4':'rh_office','T5':'t_bath','RH_5':'rh_bath','T6':'t_north','RH_6':'rh_north','T7':'t_iron',
       'RH_7':'rh_iron','T8':'t_teen','RH_8':'rh_teen','T9':'t_parent','RH_9':'rh_parent',}

d.rename(columns=dic,inplace=True)
d

Unnamed: 0,date,Appliances,lights,t_kitchen,rh_kitchen,t_living,rh_living,t_laundry,rh_laundry,t_office,...,t_parent,rh_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [4]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   t_kitchen    19735 non-null  float64
 4   rh_kitchen   19735 non-null  float64
 5   t_living     19735 non-null  float64
 6   rh_living    19735 non-null  float64
 7   t_laundry    19735 non-null  float64
 8   rh_laundry   19735 non-null  float64
 9   t_office     19735 non-null  float64
 10  rh_office    19735 non-null  float64
 11  t_bath       19735 non-null  float64
 12  rh_bath      19735 non-null  float64
 13  t_north      19735 non-null  float64
 14  rh_north     19735 non-null  float64
 15  t_iron       19735 non-null  float64
 16  rh_iron      19735 non-null  float64
 17  t_teen       19735 non-null  float64
 18  rh_teen      19735 non-null  float64
 19  t_pa

In [5]:
d.describe()

Unnamed: 0,Appliances,lights,t_kitchen,rh_kitchen,t_living,rh_living,t_laundry,rh_laundry,t_office,rh_office,...,t_parent,rh_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [6]:
d.columns

Index(['date', 'Appliances', 'lights', 't_kitchen', 'rh_kitchen', 't_living',
       'rh_living', 't_laundry', 'rh_laundry', 't_office', 'rh_office',
       't_bath', 'rh_bath', 't_north', 'rh_north', 't_iron', 'rh_iron',
       't_teen', 'rh_teen', 't_parent', 'rh_parent', 'T_out', 'Press_mm_hg',
       'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

----

**TAG-LINE CODE**

In [7]:
model = LinearRegression()

In [8]:
# From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) 
# and the temperature outside the building (y = T6). What is the Root Mean Squared error in three D.P?

t = d.iloc[:,5].to_numpy()
f = d.iloc[:,13].to_numpy()

In [9]:
t

array([19.2       , 19.2       , 19.2       , ..., 25.62857143,
       25.414     , 25.26428571])

In [10]:
f

array([ 7.02666667,  6.83333333,  6.56      , ..., 23.62666667,
       22.43333333, 21.02666667])

In [11]:
f_train,f_test,t_train,t_test = train_test_split(f,t,test_size=0.3,random_state=42)

In [12]:
model.fit(f_train.reshape(-1,1),t_train)

In [13]:
t_pred = model.predict(f_test.reshape(-1,1))
t_pred

array([18.40913116, 18.79295557, 17.98673213, ..., 20.83109609,
       21.86766908, 21.66710847])

In [14]:
np.sqrt(mean_squared_error(t_test,t_pred))

1.3084508148931222

----

In [15]:
# Remove the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a 
# random state of 42 (for reproducibility). Normalize the dataset using the MinMaxScaler (Hint: Use the MinMaxScaler fit_transform 
# and transform methods on the train and test set respectively). Run a multiple linear regression using the training set. Answer the 
# following questions:

d.drop(['date','lights'],axis=1,inplace=True)
d.head()

Unnamed: 0,Appliances,t_kitchen,rh_kitchen,t_living,rh_living,t_laundry,rh_laundry,t_office,rh_office,t_bath,...,t_parent,rh_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


Normalization of the data

In [16]:
scaler = MinMaxScaler()

In [17]:
df =  pd.DataFrame(scaler.fit_transform(d), columns=d.columns)
df.head()

Unnamed: 0,Appliances,t_kitchen,rh_kitchen,t_living,rh_living,t_laundry,rh_laundry,t_office,rh_office,t_bath,...,t_parent,rh_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


Dividing into target and features

In [18]:
x = df.iloc[:,1:]
x.head()

Unnamed: 0,t_kitchen,rh_kitchen,t_living,rh_living,t_laundry,rh_laundry,t_office,rh_office,t_bath,rh_bath,...,t_parent,rh_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [19]:
y = df.iloc[:,0]
y.head()

0    0.046729
1    0.046729
2    0.037383
3    0.037383
4    0.046729
Name: Appliances, dtype: float64

Splitting the data into train and test sets

In [20]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [21]:
model = LinearRegression()

Training the Model

In [22]:
model.fit(x_train,y_train)

Predicting using trained model

In [23]:
y_train_pred = model.predict(x_train)
y_train_pred

array([0.03731062, 0.08431463, 0.02815965, ..., 0.04626765, 0.08724703,
       0.07305445])

Calculating the Performance Metrics

In [24]:
# What is the Mean Absolute Error (in three decimal places) for the  training set?
mean_absolute_error(y_train,y_train_pred)

0.050225351770483914

In [25]:
# What is the Root Mean Squared Error (in three decimal places) for the training set?
np.sqrt(mean_squared_error(y_train,y_train_pred))

0.08898663509268888

In [26]:
y_pred = model.predict(x_test)
y_pred

array([0.03321002, 0.24412716, 0.03411734, ..., 0.06837578, 0.10030044,
       0.05729594])

In [27]:
# What is the Mean Absolute Error (in three decimal places) for test set?
mean_absolute_error(y_test,y_pred)

0.05013189436664744

In [28]:
# What is the Root Mean Squared Error (in three decimal places) for test set?
np.sqrt(mean_squared_error(y_test,y_pred))

0.08751213329679851

Q) Did the Model above overfit to the training set

A) No, because the error is with training set is nearly equal to the testing set.

------

In [29]:
# Train a ridge regression model with default parameters. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

ridge = Ridge(alpha=0.5)

In [30]:
ridge.fit(x_train,y_train)

In [31]:
y_rig_pred = ridge.predict(x_test)
y_rig_pred 

array([0.03322435, 0.23958056, 0.03476642, ..., 0.06881236, 0.100253  ,
       0.05880823])

In [32]:
np.sqrt(mean_squared_error(y_test,y_rig_pred))

0.08754118590838057

-------

In [33]:
# What is the new RMSE with the Lasso Regression on the test set?

lasso = Lasso(alpha=0.001)
lasso.fit(x_train,y_train)

In [37]:
y_las_pred = lasso.predict(x_test)
y_las_pred

array([0.07370267, 0.08143458, 0.07716072, ..., 0.07792848, 0.09034412,
       0.08359255])

In [38]:
np.sqrt(mean_squared_error(y_test,y_las_pred))

0.09358170467245137

----

In [34]:
# Train a lasso regression model with default value and obtain the new feature weights with it.
# How many of the features have non-zero feature weights?


def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df


In [35]:
linear_model_weights = get_weights_df(model, x_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge, x_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso, x_train, 'Lasso_weight')

In [36]:
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,rv2,-101553300000.0,0.000743,-0.0
1,rh_living,-0.4565919,-0.401134,-0.0
2,T_out,-0.321877,-0.250765,0.0
3,t_living,-0.2361085,-0.19388,0.0
4,t_parent,-0.1899052,-0.188584,-0.0
5,rh_teen,-0.1575797,-0.156596,-0.00011
6,RH_out,-0.07770147,-0.050541,-0.049557
7,rh_iron,-0.04461393,-0.046291,-0.0
8,rh_parent,-0.03979075,-0.041701,-0.0
9,t_bath,-0.01566763,-0.020727,-0.0


------