# Model Fitting

## Imports

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# To save dict
import pickle
# Dataset
import sklearn
# function to shuffle dataset and divide into test and train set
from sklearn.model_selection import train_test_split
# Scaling
from sklearn.preprocessing import StandardScaler
# Model
from sklearn.linear_model import LinearRegression
# Model functions:
#    reg = LinearRegression().fit(x_train, y_train)                Fits the model to (x_train, y_train)
#    reg.score(x_train, y_train), reg.score(x_test, y_test)        Get R^2 of the model
#    reg.coef_, reg.intercept_                                     Get coefficients
#    reg.predict(x_test)                                           Predict results from model
print('SkLearn version is {}'.format(sklearn.__version__))

SkLearn version is 0.23.2


## Load cleaned Dataset

In [2]:
# Read the cleaned King County table from the CSV in the working directory
h_price = pd.read_csv(filepath_or_buffer="King_County_cleaned.csv")

## Metric
Use __MAPE__ (mean absolute percentage error) as the metric, because this is a regression problem and MAPE expresses the error relative to the true price.\
$mape = \overline{ape} \text{  with  } ape=\left|\frac{y - \hat{y}}{y}\right|$

In [3]:
# MAPE mean absolute percentage error
def mape(a, b):
    """Mean absolute percentage error between true and predicted values.

    Parameters
    ----------
    a : array-like
        True values (y_true).
    b : array-like
        Predicted values (y_pred). If it has the same number of elements as
        ``a`` but a different shape (e.g. ``(m,)`` vs ``(m, 1)``), it is
        reshaped to ``a``'s shape so the two are compared element by element.

    Returns
    -------
    float
        Mean of ``|a - b| / |a|`` over the entries where ``a != 0``, as a
        fraction (multiply by 100 for percent). ``nan`` when ``a`` has no
        non-zero entry.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # avoid accidental (m, 1) x (m,) broadcasting into an (m, m) error matrix
    if a.shape != b.shape and a.size == b.size:
        b = b.reshape(a.shape)
    nonzero = a != 0  # percentage error is undefined where the true value is 0
    if not nonzero.any():
        return np.nan
    # mask BEFORE dividing (no divide-by-zero warnings) and divide by |a| so
    # negative true values cannot produce negative "absolute" errors
    return (np.abs(a - b)[nonzero] / np.abs(a[nonzero])).mean()

## Calculate all R$^2$
Fit a one-feature linear regression on the price for each numerical feature and compare the resulting R$^2$ values!

In [4]:
# Target: the sale price
y = h_price.price
# Numerical columns to evaluate one at a time
numeric_features = [
    'bedrooms', 'bathrooms', 'm2_living', 'm2_lot', 'floors',
    'condition', 'grade', 'm2_above', 'yr_built', 'zipcode',
    'lat', 'long', 'm2_living15', 'm2_lot15', 'date_m',
]
r2_by_feature = []
for feature in numeric_features:
    # LinearRegression wants a 2-D design matrix: one column, one row per house
    X = h_price[feature].to_numpy().reshape(-1, 1)
    reg = LinearRegression().fit(X, y)
    r2_by_feature.append([feature, reg.score(X, y)])
# Report from highest to lowest R^2
r2_by_feature.sort(key=lambda pair: pair[1], reverse=True)
for name, r2 in r2_by_feature:
    print('R^2 for feature {} is {:.2f}'.format(name, r2))

R^2 for feature m2_living is 0.49
R^2 for feature grade is 0.45
R^2 for feature m2_above is 0.37
R^2 for feature m2_living15 is 0.34
R^2 for feature bathrooms is 0.28
R^2 for feature bedrooms is 0.10
R^2 for feature lat is 0.09
R^2 for feature floors is 0.07
R^2 for feature m2_lot is 0.01
R^2 for feature m2_lot15 is 0.01
R^2 for feature yr_built is 0.00
R^2 for feature zipcode is 0.00
R^2 for feature condition is 0.00
R^2 for feature long is 0.00
R^2 for feature date_m is 0.00


* m2_living, grade, m2_above, m2_living15, bathrooms = __high R$^2$__
* bedrooms, lat, floors = __low R$^2$__
* zipcode, condition, date_m: test again with dummies (also grade, because its relation to price looks exponential rather than linear)

## Dummy test
Repeat the R$^2$ test with the features encoded as dummy variables!

In [5]:
# R^2 of a price regression on each feature after one-hot (dummy) encoding
l = []
# get result
y = h_price.price
# for given features
for feature in ['zipcode', 'condition', 'date_m', 'grade', 'view', 'yr_renovated']:
    # drop_first must be the boolean True: the string 'True' only worked because
    # any non-empty string is truthy (so 'False' would have dropped a level too).
    # Dropping one level avoids a dummy set that is collinear with the intercept.
    X = pd.get_dummies(h_price[feature], drop_first=True)
    reg = LinearRegression().fit(X, y)
    l.append([feature, reg.score(X, y)])
# sort and print results, best R^2 first
l = sorted(l, key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

R^2 for feature grade is 0.52
R^2 for feature zipcode is 0.41
R^2 for feature view is 0.17
R^2 for feature yr_renovated is 0.03
R^2 for feature condition is 0.01
R^2 for feature date_m is 0.00


* grade fits better as dummies than as a number, because its relation to price is not linear
* zipcode also gives a very good fit as dummies
* view gives a moderate fit
* condition, date_m, yr_renovated explain almost none of the price variance

## Function to preprocess data
Create train and test sets that are standard-scaled!

In [6]:
# For every categorical feature, remember the complete list of dummy column
# names found in the full dataset, so that a smaller sample (even a single
# row) can later be expanded to the same columns.
categorical_features = ['zipcode', 'condition', 'date_m', 'grade', 'view', 'yr_renovated']
all_dummies = {
    name: pd.get_dummies(h_price[name], prefix=name).columns.values
    for name in categorical_features
}
# Persist the mapping to disk
with open('dummies.pkl', 'wb') as out_file:
    pickle.dump(all_dummies, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
def get_train_test_sets(features, features_dummies, dataFrame,
                        test_size=0.3, random_state=42, dummies_path='dummies.pkl'):
    """Build standard-scaled train/test feature matrices and price targets.

    Parameters
    ----------
    features : list of str
        Columns of ``dataFrame`` used as they are.
    features_dummies : list of str
        Columns of ``dataFrame`` that are one-hot encoded. Each one is expanded
        to the dummy columns stored for it in the pickled dict at
        ``dummies_path``.
    dataFrame : pandas.DataFrame
        Data holding the feature columns and a ``price`` column. It is not
        modified.
    test_size : float, default 0.3
        Share of the rows put into the test set.
    random_state : int, default 42
        Seed for the shuffled split.
    dummies_path : str, default 'dummies.pkl'
        Pickle file mapping a feature name to its full list of dummy column
        names.

    Returns
    -------
    x_train, x_test : numpy.ndarray
        Feature matrices, scaled with a StandardScaler fitted on the training
        rows only.
    y_train, y_test : numpy.ndarray
        Prices as column vectors of shape (rows, 1).
    """
    # NOTE: pickle.load can execute arbitrary code; only point dummies_path at
    # a file written by this notebook.
    with open(dummies_path, 'rb') as f:
        all_dummies = pickle.load(f)

    # reset_index returns a new frame, so the caller's data is untouched, and
    # the positional index lets the dummy frames line up row by row below
    frame = dataFrame.reset_index(drop=True)
    columns = list(features)
    encoded = []
    for feat_dum in features_dummies:
        dummies = pd.get_dummies(frame[feat_dum], prefix=feat_dum, dtype=float)
        # Force the stored column layout: categories absent from this sample
        # become all-zero columns, categories unknown to the stored layout are
        # dropped instead of silently adding extra features.
        dummies = dummies.reindex(columns=all_dummies[feat_dum], fill_value=0)
        columns.extend(dummies.columns)
        encoded.append(dummies)
    frame = pd.concat([frame] + encoded, axis=1, sort=False)

    # numeric matrix regardless of the dtype pandas uses for dummies
    X = frame[columns].to_numpy(dtype=float)
    y = frame['price'].to_numpy().reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True)

    # fit the scaler on the training rows only, then apply it to both parts
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test

## Intuitive Model
* use all values from above with good R$^2$!

In [8]:
# Numerical predictors, used as they are
num_features = [
    'm2_living',
    'm2_above',
    'bathrooms',
]
# Categorical predictors, expanded to dummy columns
cat_features = [
    'grade',
    'zipcode',
    'view',
]

#### Get train and test set

In [9]:
# Scaled train/test split for the chosen feature lists
X_train, X_test, y_train, y_test = get_train_test_sets(
    features=num_features,
    features_dummies=cat_features,
    dataFrame=h_price,
)

#### Create model and fit to training set

In [10]:
# Least-squares fit on the training split
linReg = LinearRegression()
linReg.fit(X_train, y_train);

#### Predict Accuracies

In [11]:
# Predict both splits, then report the relative error of each
y_train_pred, y_test_pred = linReg.predict(X_train), linReg.predict(X_test)
train_mape = mape(y_train, y_train_pred)
test_mape = mape(y_test, y_test_pred)
print('Mape for train set: {:.1f}%'.format(train_mape*100))
print('Mape for test set:  {:.1f}%'.format(test_mape*100))

Mape for train set: 17.8%
Mape for test set:  18.0%


* values for train / test set are almost the same -> __no overfitting__, no regularization needed
* test with fewer features

### Model with fewer features

In [12]:
# Reduced set: living area is the only numerical predictor left
num_features = [
    'm2_living',
]
# Categorical predictors, expanded to dummy columns
cat_features = [
    'grade',
    'zipcode',
    'view',
]

#### Update train test set

In [13]:
# Rebuild the scaled split with the reduced feature lists
X_train, X_test, y_train, y_test = get_train_test_sets(
    features=num_features,
    features_dummies=cat_features,
    dataFrame=h_price,
)

#### Create model and fit to training set

In [14]:
# Refit the linear model on the reduced training matrix
linReg = LinearRegression()
linReg.fit(X_train, y_train);

#### Predict Accuracies

In [15]:
# Predictions of the reduced model on both splits
y_train_pred = linReg.predict(X_train)
y_test_pred = linReg.predict(X_test)
# Error as a fraction; converted to percent when printed
mape_train = mape(y_train, y_train_pred)
mape_test = mape(y_test, y_test_pred)
print('Mape for train set: {:.1f}%'.format(mape_train*100))
print('Mape for test set:  {:.1f}%'.format(mape_test*100))

Mape for train set: 17.7%
Mape for test set:  18.0%


* features __bathrooms__, __m2_above__ can be __neglected__, the result is the same

## Try logarithmic price values
Prices are unevenly distributed over a large range, so try fitting the logarithm of the price.

In [16]:
# Same reduced feature set as in the previous model
num_features = ['m2_living']
cat_features = ['grade', 'zipcode', 'view']

# Scaled train/test split
X_train, X_test, y_train, y_test = get_train_test_sets(
    features=num_features,
    features_dummies=cat_features,
    dataFrame=h_price,
)

# Train on log(price) instead of the raw price
y_train_log = np.log(y_train)
linReg = LinearRegression()
linReg.fit(X_train, y_train_log)

# The model outputs log-prices; exponentiate to get back to normal prices
y_train_pred = np.exp(linReg.predict(X_train))
y_test_pred = np.exp(linReg.predict(X_test))

# Score against the untransformed prices so the numbers are comparable to the earlier models
train_mape = mape(y_train, y_train_pred)
test_mape = mape(y_test, y_test_pred)
print('Mape for train set: {:.1f}%'.format(train_mape*100))
print('Mape for test set:  {:.1f}%'.format(test_mape*100))

Mape for train set: 14.7%
Mape for test set:  15.0%


Model __error (MAPE) decreased__ by __3 percentage points__ (test set: 18.0% -> 15.0%)!!! __Use logarithmic price__!!

# Result
Final model gives a __MAPE of 15%__ on the test set.\
The __features__ used are:
* m2_living
* grade (as dummy)
* zipcode (as dummy)
* view (as dummy)

The __price__ is converted to __logarithmic scale__, so that very high house prices have less influence on the fit!