# Baseline Model

This script applies a linear regression model the data driven feature set, given a baseline preformance for comparison.

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json
import matplotlib.pyplot as plt

In [2]:
# set variables from config file
config_path = os.path.abspath('..')[:-7]

with open(config_path + '/config.json', 'r') as f:
    config = json.load(f)

processing_path = config['DEFAULT']['processing_path']
epc_train_clean_fname = config['DEFAULT']['epc_train_clean_fname']
epc_test_clean_fname = config['DEFAULT']['epc_test_clean_fname']
epc_train_dd_fname = config['DEFAULT']['epc_train_dd_fname']
epc_test_dd_fname = config['DEFAULT']['epc_test_dd_fname']

In [3]:
epc_train = pd.read_csv(os.path.join(processing_path,epc_train_dd_fname),header = 0,delimiter = ',')
epc_test = pd.read_csv(os.path.join(processing_path,epc_test_dd_fname),header = 0,delimiter = ',')

In [4]:
epc_train.drop(['BUILDING_REFERENCE_NUMBER'],axis=1,inplace=True)
epc_test.drop(['BUILDING_REFERENCE_NUMBER'],axis=1,inplace=True)

### one hot encode categorical values

In [7]:
for col in ['MAINS_GAS_FLAG','HEAT_LOSS_CORRIDOOR','built_form','energy_tariff','floor_description','floor_level',
            'glazed_type','hotwater_description','lighting_description','mainheat_controls','property_type',
            'roof_description','transaction_type','walls_description','window_description','locality','extension',
            'floor_height','habitable_rooms','open_fireplaces']:
    print(col)
    for_dummy = epc_train.pop(col)
    epc_train = pd.concat([epc_train, pd.get_dummies(for_dummy, prefix=col)], axis=1)

MAINS_GAS_FLAG
HEAT_LOSS_CORRIDOOR
built_form
energy_tariff
floor_description
floor_level
glazed_type
hotwater_description
lighting_description
mainheat_controls
property_type
roof_description
transaction_type
walls_description
window_description
locality
extension
floor_height
habitable_rooms
open_fireplaces


In [8]:
for col in ['MAINS_GAS_FLAG','HEAT_LOSS_CORRIDOOR','built_form','energy_tariff','floor_description','floor_level',
            'glazed_type','hotwater_description','lighting_description','mainheat_controls','property_type',
            'roof_description','transaction_type','walls_description','window_description','locality','extension',
            'floor_height','habitable_rooms','open_fireplaces']:
    print(col)
    for_dummy = epc_test.pop(col)
    epc_test = pd.concat([epc_test, pd.get_dummies(for_dummy, prefix=col)], axis=1)

MAINS_GAS_FLAG
HEAT_LOSS_CORRIDOOR
built_form
energy_tariff
floor_description
floor_level
glazed_type
hotwater_description
lighting_description
mainheat_controls
property_type
roof_description
transaction_type
walls_description
window_description
locality
extension
floor_height
habitable_rooms
open_fireplaces


In [9]:
#Extract the target and features
target_train = epc_train['CURRENT_ENERGY_EFFICIENCY']
inputs_train = epc_train.drop('CURRENT_ENERGY_EFFICIENCY',axis=1)
target_test = epc_test['CURRENT_ENERGY_EFFICIENCY']
inputs_test = epc_test.drop('CURRENT_ENERGY_EFFICIENCY',axis=1)

### scale numeric values 

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

In [11]:
def scale_numeric(df,col):
    
    ''' 
    Fits a scaler called scaler to the specified column
    Parameters
      df: a dataframe
      col: numeric variable to scale
    Returns a dataframe
    '''
    
    null_index = df[col].isnull()
    df.loc[~null_index, [col]] = scaler.fit_transform(df.loc[~null_index, [col]])
    
    return df

In [12]:
inputs_train = scale_numeric(inputs_train,'TOTAL_FLOOR_AREA')
inputs_test = scale_numeric(inputs_test,'TOTAL_FLOOR_AREA')

### fill missing values

In [13]:
epc_train[['TOTAL_FLOOR_AREA']].isnull().sum().sort_values(ascending = False) / epc_train.shape[0]

TOTAL_FLOOR_AREA    0.0
dtype: float64

### Train the model

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
#Initailise
reg = LinearRegression()
#train
reg.fit(inputs_train,target_train)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
#View the coefficients
print('Coefficients: \n', reg.coef_)

#View the intercept
print(reg.intercept_)

Coefficients: 
 [-1.37916981e+00 -5.28356088e-01  7.91677918e+00  1.29980232e+01
  5.26393175e+00  6.96927031e+00  9.59837976e-01  4.39327574e+00
 -1.97265367e+00  1.22193902e+00  1.23333871e+01  4.03000395e-01
  3.32764767e+00  6.41728029e+00  6.75564926e+00  6.17004795e-01
 -1.51612704e+00  1.51408500e+00 -3.24521026e+00  4.06227611e+00
 -6.89821161e-01 -6.16695082e-01  2.01034444e+00  3.68061500e+00
  1.37972693e+00  1.24310303e-01  4.45646816e+00  1.79486639e+11
  1.79486639e+11  1.79486639e+11  9.96339511e+00  8.63806331e+00
  1.13528280e+01 -7.01001062e+00 -1.59963532e+01 -7.23443257e+00
 -6.07975551e+00 -9.85786021e+00 -2.73126887e+01  1.07144736e+12
  1.07144736e+12  1.07144736e+12  1.07144736e+12  3.22093667e+12
  3.22093667e+12  3.22093667e+12  3.22093667e+12  3.22093667e+12
  3.22093667e+12  3.22093667e+12 -1.02127490e+00  1.90714604e+00
 -1.17647244e+00  1.77716141e-01 -9.66141668e+11 -9.66141668e+11
 -9.66141668e+11 -9.66141668e+11 -9.66141668e+11 -9.66141668e+11
 -5.18066

In [17]:
#Get performance on training data
predict_train = reg.predict(inputs_train)
train_r_squared = r2_score(target_train,predict_train)
train_mse = mean_squared_error(target_train,predict_train)
print('Variance score: %.4f' % train_r_squared)
print("Mean squared error: %.4f" % train_mse)

Variance score: 0.7131
Mean squared error: 72.9379


In [18]:
#Get performance on test data
predict_test = reg.predict(inputs_test)
test_r_squared = r2_score(target_test,predict_test)
test_mse = mean_squared_error(target_test,predict_test)
print('Variance score: %.4f' % test_r_squared)
print("Mean squared error: %.4f" % test_mse)

Variance score: 0.7110
Mean squared error: 72.6514
