# Model Fitting

## Load Dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# To save dict
import pickle

In [2]:
# load dataset
h_price = pd.read_csv("King_County_House_prices_dataset.csv")#, na_values=['?'])

# Edit Dataset

In [3]:
### Change date to useful data eg date, month, year
h_price['date_m'] = np.array([i.split('/')[0] for i in h_price.date.values]).astype(int)
h_price['date_d'] = np.array([i.split('/')[1] for i in h_price.date.values]).astype(int)
h_price['date_y'] = np.array([i.split('/')[2] for i in h_price.date.values]).astype(int)

In [4]:
# 33 bedrooms, coordinates show small house, outlier, not useful for analysis
h_price.bedrooms.argmax()
h_price[15856:15857]
##### remove false bedrooms
h_price = h_price.drop(h_price.index[h_price.bedrooms.argmax()])
#h_price.sort_values('bedrooms', ascending=False).head(3)

In [5]:
#h_price.plot.scatter(x='sqft_living', y='price');
# linear dependence, one obvious outlier
h_price = h_price.drop(h_price.index[h_price.sqft_living.argmax()])

## Functions / Imports

In [6]:
### All dummies
all_dummies = {}
for i in ['zipcode', 'condition', 'date_m', 'grade', 'waterfront', 'view', 'yr_renovated']:
    tmp = pd.get_dummies(h_price[i], prefix=i)
    all_dummies[i] = tmp.columns.values
# save dict
with open('dummies.pkl', 'wb') as f:
    pickle.dump(all_dummies, f, pickle.HIGHEST_PROTOCOL)

In [7]:
# Dataset
import sklearn
# function to shuffle dataset and divide into test and train set
from sklearn.model_selection import train_test_split
# Model
from sklearn.linear_model import LinearRegression
# Model with regularization
from sklearn.linear_model import Ridge
print('SkLearn version is {}'.format(sklearn.__version__))

SkLearn version is 0.23.2


In [142]:
def get_train_test_sets(features, dataFrame, res):
    ''' input list of features: list of strings
        input dataframe
        res: feature of interest'''
    X = dataFrame[features].values
    
    # reshape array to nFeatures, m
    if len(X.shape) == 1:
        X = X.reshape(X.shape[0], 1)
    #print(X.shape)

    y = dataFrame[res].values
    y = y.reshape(y.shape[0], 1)
    
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    
    dataset = {'x_train': x_train,
               'x_test': x_test,
               'y_train': y_train,
               'y_test': y_test}
    
    return dataset

def get_train_test_sets_wdummies(features, features_dummies, dataFrame, res):
    ''' input list of features: list of strings
        input list of dummy features: list of strings
        input dataframe
        res: feature of interest'''
    # load dict
    with open('dummies.pkl', 'rb') as f:
        all_dummies = pickle.load(f)
    ## Add dummies to features / data frame
    copy = dataFrame.copy()
    # Reset index to concat
    copy = copy.reset_index(drop=True)
    for feat_dum in features_dummies:
        df = pd.get_dummies(copy[feat_dum], prefix=feat_dum)
        # Add all missing dummy values
        dummies = all_dummies[feat_dum]
        # initializing dummies with zeros
        df_all = pd.DataFrame(0, index=np.arange(df.shape[0]), columns = dummies)
        # Add all existing ones in correct order -> loop over full one
        for feat in df.columns.values:
            df_all[feat] = df[feat]
        #df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True)
        features = features + list(df_all.columns.values)
        copy = pd.concat([copy, df_all], axis=1, sort=False)

    X = copy[features].values
    
    # reshape array to nFeatures, m
    if len(X.shape) == 1:
        X = X.reshape(X.shape[0], 1)
    #print(X.shape)

    y = dataFrame[res].values
    y = y.reshape(y.shape[0], 1)
    
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    
    dataset = {'x_train': x_train,
               'x_test' : x_test,
               'y_train': y_train,
               'y_test' : y_test}
    
    return dataset

# MAPE mean absolute percentage error
def mape(a,b):
    """ calculate MAPE, input (y_true, y_pred)"""
    mask = a != 0
    return (np.fabs(a-b)/a)[mask].mean()

def get_mape_from_dataset(dataset):
    ''' input dataset, fit model to train_set, get mape of model for train, test_set'''
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    
    return mape(y_train, reg.predict(x_train)), mape(y_test, reg.predict(x_test))

def get_combinations(full_list):
    """ return possible combinations of features in list"""
    import itertools
    combinations = []
    # append all values
    full_list = list(full_list)
    if 'price' in full_list:
        full_list.remove('price')
    combinations.append(full_list)
    for i in range(1, len(full_list)):
        possib = list(itertools.combinations(full_list, r=i))
        for x in possib:
            combinations.append(list(x))
    print('There are {} combinations'.format(len(combinations)))
    
    return combinations

# Model functions:
#    reg = LinearRegression().fit(x_train, y_train)                Fits the model to (x_train, y_train)
#    reg.score(x_train, y_train), reg.score(x_test, y_test)        Get R^2 of the model
#    reg.coef_, reg.intercept_                                     Get coefficients
#    reg.predict(x_test)                                           Predict results from model


# Test:
dataset = get_train_test_sets(['sqft_living', 'grade'], h_price, 'price')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
reg = LinearRegression().fit(x_train, y_train)
print(reg.score(x_train, y_train), reg.score(x_test, y_test))

0.5370480325728262 0.52769201345116


## Shortly Test some combinations

In [145]:
# Create copy
tmp = h_price.copy()
# Remove dummy features etc
tmp.drop(['waterfront', 'id', 'view', 'date', 'yr_renovated', 'zipcode', 'sqft_basement'], axis=1, inplace=True)

combinations = get_combinations(tmp.columns.values)

count = 0
num = len(combinations)
results = []
for comb in combinations:
    count += 1
    print('{:.2f}% done!'.format(count/num*100), end='\r')
    dataset = get_train_test_sets_wdummies(comb, [], tmp, 'price')
    results.append(get_mape_from_dataset(dataset))
    
comb_dict = {'results': results,
             'combs': combinations}
# save dict
with open('combin.pkl', 'wb') as f:
    pickle.dump(comb_dict, f, pickle.HIGHEST_PROTOCOL)

There are 65535 combinations
100.00% done!

In [153]:
# load dict
with open('combin.pkl', 'rb') as f:
    comb_dict = pickle.load(f)
results = comb_dict['results']
combinations = comb_dict['combs']
# show best result
m = np.argmin(results)
print(results[m])
print(combinations[m])

# Best one:
#0.2689278194731115
#['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'condition', 'grade', 'yr_built', 'lat', 'sqft_living15', 'date_m', 'date_y']

IndexError: list index out of range

## Calculate all R^2

In [11]:
l = []
for feature in ['bedrooms', 'price', 'bathrooms', 'sqft_living',
                'sqft_lot', 'floors', 'condition', 'grade',
                'sqft_above', 'yr_built',
                'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_m',
                'date_d', 'date_y']:
    dataset = get_train_test_sets([feature], h_price, 'price')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature, reg.score(x_train, y_train)])
l = sorted(l,key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

R^2 for feature price is 1.00
R^2 for feature sqft_living is 0.50
R^2 for feature grade is 0.45
R^2 for feature sqft_above is 0.36
R^2 for feature sqft_living15 is 0.35
R^2 for feature bathrooms is 0.28
R^2 for feature bedrooms is 0.10
R^2 for feature lat is 0.09
R^2 for feature floors is 0.06
R^2 for feature sqft_lot is 0.01
R^2 for feature sqft_lot15 is 0.01
R^2 for feature zipcode is 0.00
R^2 for feature yr_built is 0.00
R^2 for feature condition is 0.00
R^2 for feature long is 0.00
R^2 for feature date_m is 0.00
R^2 for feature date_y is 0.00
R^2 for feature date_d is 0.00


* sqft_living, grade, sqft_above, sqft_living15, bathrooms = big corr
* bedrooms, floors = low corr
* zipcode, condition, date_m test with dummies (also grade because it's exponential)

## Dummy test

In [12]:
l = []
for feature in ['zipcode', 'condition', 'date_m', 'grade', 'waterfront', 'view', 'yr_renovated']:
    tmp = h_price.copy()
    df = pd.get_dummies(tmp[feature])
    features = df.columns.values
    tmp = tmp.join(df)
    dataset = get_train_test_sets(features, tmp, 'price')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature, reg.score(x_train, y_train)])
l = sorted(l,key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

R^2 for feature grade is 0.52
R^2 for feature zipcode is 0.41
R^2 for feature view is 0.17
R^2 for feature waterfront is 0.07
R^2 for feature yr_renovated is 0.03
R^2 for feature condition is 0.01
R^2 for feature date_m is 0.00


* grade correlates better as dummy
* zipcode also very good correlation
* view correlation not bad
* waterfront, condition, date_m no single correlation, check in combination

## Try to built values for damaged columns

### .1 View

In [13]:
print('number of missing values is {}'.format(h_price.view.isna().sum()))
copy = h_price.dropna(subset = ['view'])
l = []
for feature in ['bedrooms', 'price', 'bathrooms', 'sqft_living',
                'sqft_lot', 'floors', 'condition', 'grade',
                'sqft_above', 'yr_built',
                'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_m',
                'date_d', 'date_y']:
    dataset = get_train_test_sets([feature], copy, 'view')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature, reg.score(x_train, y_train)])
for feature in ['zipcode', 'condition', 'date_m', 'grade', 'waterfront']:
    tmp = copy.copy()
    df = pd.get_dummies(tmp[feature])
    features = df.columns.values
    tmp = tmp.join(df)
    dataset = get_train_test_sets(features, tmp, 'view')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature + 'dummy', reg.score(x_train, y_train)])
l = sorted(l,key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

number of missing values is 63
R^2 for feature price is 0.16
R^2 for feature waterfrontdummy is 0.15
R^2 for feature sqft_living is 0.08
R^2 for feature sqft_living15 is 0.08
R^2 for feature zipcodedummy is 0.08
R^2 for feature gradedummy is 0.07
R^2 for feature grade is 0.06
R^2 for feature bathrooms is 0.03
R^2 for feature sqft_above is 0.03
R^2 for feature zipcode is 0.01
R^2 for feature bedrooms is 0.01
R^2 for feature long is 0.01
R^2 for feature sqft_lot15 is 0.01
R^2 for feature sqft_lot is 0.00
R^2 for feature yr_built is 0.00
R^2 for feature conditiondummy is 0.00
R^2 for feature condition is 0.00
R^2 for feature floors is 0.00
R^2 for feature date_mdummy is 0.00
R^2 for feature lat is 0.00
R^2 for feature date_y is 0.00
R^2 for feature date_m is 0.00
R^2 for feature date_d is 0.00


In [14]:
copy = h_price.dropna(subset = ['view'])
features = ['price', 'sqft_living']
for dum in ['waterfront', 'grade', 'zipcode']:
    df = pd.get_dummies(copy[dum])
    features = features + list(df.columns.values)
    copy = copy.join(df)

dataset = get_train_test_sets(features, copy, 'view')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
reg = LinearRegression().fit(x_train, y_train)
reg.score(x_test, y_test), reg.score(x_train, y_train)
# score = 1 - (u/v)

(-51384555.98571988, 0.32320516906453256)

### .2 yr_renovated

In [15]:
print('number of missing values is {}'.format(h_price.yr_renovated.isna().sum()))
copy = h_price.dropna(subset = ['yr_renovated'])
l = []
for feature in ['bedrooms', 'price', 'bathrooms', 'sqft_living',
                'sqft_lot', 'floors', 'condition', 'grade',
                'sqft_above', 'yr_built',
                'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_m',
                'date_d', 'date_y']:
    dataset = get_train_test_sets([feature], copy, 'yr_renovated')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature, reg.score(x_train, y_train)])
for feature in ['zipcode', 'condition', 'date_m', 'grade', 'waterfront']:
    tmp = copy.copy()
    df = pd.get_dummies(tmp[feature])
    features = df.columns.values
    tmp = tmp.join(df)
    dataset = get_train_test_sets(features, tmp, 'yr_renovated')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature + 'dummy', reg.score(x_train, y_train)])
l = sorted(l,key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

number of missing values is 3842
R^2 for feature yr_built is 0.05
R^2 for feature zipcodedummy is 0.03
R^2 for feature price is 0.02
R^2 for feature waterfrontdummy is 0.01
R^2 for feature zipcode is 0.00
R^2 for feature long is 0.00
R^2 for feature conditiondummy is 0.00
R^2 for feature sqft_living is 0.00
R^2 for feature condition is 0.00
R^2 for feature bathrooms is 0.00
R^2 for feature gradedummy is 0.00
R^2 for feature date_mdummy is 0.00
R^2 for feature sqft_above is 0.00
R^2 for feature lat is 0.00
R^2 for feature date_y is 0.00
R^2 for feature grade is 0.00
R^2 for feature bedrooms is 0.00
R^2 for feature sqft_lot is 0.00
R^2 for feature date_d is 0.00
R^2 for feature date_m is 0.00
R^2 for feature floors is 0.00
R^2 for feature sqft_lot15 is 0.00
R^2 for feature sqft_living15 is 0.00


In [16]:
copy = h_price.dropna(subset = ['yr_renovated'])
features = ['yr_built', 'price']
for dum in ['zipcode']:
    df = pd.get_dummies(copy[dum])
    features = features + list(df.columns.values)
    copy = copy.join(df)

dataset = get_train_test_sets(features, copy, 'yr_renovated')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
reg = LinearRegression().fit(x_train, y_train)
reg.score(x_test, y_test), reg.score(x_train, y_train)
# score = 1 - (u/v)

(0.04828185421380182, 0.09140023869944625)

### .3 Waterfront

In [17]:
print('number of missing values is {}'.format(h_price.view.isna().sum()))
copy = h_price.dropna(subset = ['waterfront'])
l = []
for feature in ['bedrooms', 'price', 'bathrooms', 'sqft_living',
                'sqft_lot', 'floors', 'condition', 'grade',
                'sqft_above', 'yr_built',
                'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_m',
                'date_d', 'date_y']:
    dataset = get_train_test_sets([feature], copy, 'waterfront')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature, reg.score(x_train, y_train)])
for feature in ['zipcode', 'condition', 'date_m', 'grade']:
    tmp = copy.copy()
    df = pd.get_dummies(tmp[feature])
    features = df.columns.values
    tmp = tmp.join(df)
    dataset = get_train_test_sets(features, tmp, 'waterfront')
    #print(feature)
    x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
    reg = LinearRegression().fit(x_train, y_train)
    l.append([feature + 'dummy', reg.score(x_train, y_train)])
l = sorted(l,key=lambda x: x[1], reverse=True)
for i in l:
    print('R^2 for feature {} is {:.2f}'.format(i[0], i[1]))

number of missing values is 63
R^2 for feature price is 0.09
R^2 for feature zipcodedummy is 0.04
R^2 for feature gradedummy is 0.03
R^2 for feature sqft_living is 0.01
R^2 for feature grade is 0.01
R^2 for feature sqft_living15 is 0.01
R^2 for feature sqft_above is 0.01
R^2 for feature bathrooms is 0.01
R^2 for feature long is 0.00
R^2 for feature sqft_lot15 is 0.00
R^2 for feature floors is 0.00
R^2 for feature zipcode is 0.00
R^2 for feature conditiondummy is 0.00
R^2 for feature date_mdummy is 0.00
R^2 for feature sqft_lot is 0.00
R^2 for feature yr_built is 0.00
R^2 for feature condition is 0.00
R^2 for feature date_d is 0.00
R^2 for feature date_y is 0.00
R^2 for feature lat is 0.00
R^2 for feature date_m is 0.00
R^2 for feature bedrooms is 0.00


In [18]:
copy = h_price.dropna(subset = ['waterfront'])
features = ['price']
for dum in ['grade', 'zipcode']:
    df = pd.get_dummies(copy[dum])
    features = features + list(df.columns.values)
    copy = copy.join(df)

dataset = get_train_test_sets(features, copy, 'waterfront')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
reg = LinearRegression().fit(x_train, y_train)
reg.score(x_test, y_test), reg.score(x_train, y_train)

(0.130713515826837, 0.21508877582753627)

#### Doesn't work becuase limited to linear regression!

## Intuitive Model
* use all values from above with good R^2

In [143]:
features = ['sqft_living', 'sqft_above', 'bathrooms']
dummies = ['grade', 'zipcode', 'view', 'waterfront']#, 'yr_renovated']

dataset = get_train_test_sets_wdummies(features, dummies, h_price, 'price')
get_mape_from_dataset(dataset)
# No overfit, no regularization needed

(0.17703738936226002, 0.17785674250188935)

In [144]:
# Test with filling nans in waterfront to 0
copy = h_price.copy()
copy.waterfront = copy.waterfront.fillna(0)
features = ['sqft_living', 'sqft_above', 'bathrooms']
dummies = ['grade', 'zipcode', 'view', 'waterfront']#, 'yr_renovated']

dataset = get_train_test_sets_wdummies(features, dummies, copy, 'price')
get_mape_from_dataset(dataset)

(0.1770413435352387, 0.1778537428504051)

## Remove Outliers by STD

In [139]:
# Loop over all features
price_std = h_price.copy()
x = 2.5
for i in ['price', 'sqft_living']:
    std = price_std[i].std()
    mean = price_std[i].mean()
    print('{} {}'.format(i, price_std[abs(price_std[i] - mean) > std*x][i].count()))
    price_std = price_std[abs(price_std[i] - mean) < std*x]
print(price_std.shape)

price 565
sqft_living 405
(20625, 24)


In [141]:
features = ['sqft_living']#, 'bathrooms']
dummies = ['grade', 'zipcode', 'view']

dataset = get_train_test_sets_wdummies(features, dummies, price_std, 'price')

x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
reg = LinearRegression().fit(x_train, y_train)
dataset = get_train_test_sets_wdummies(features, dummies, h_price, 'price')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
mape(y_test, reg.predict(x_test))
#reg.score(x_test, y_test)

0.16050541083560943

In [None]:
# Results for different features
#features = ['sqft_living', 'sqft_above', 'bathrooms']
#dummies = ['grade', 'zipcode', 'view', 'waterfront']
#0.16023365212599314

## Richmans- / Poormansmodel

In [109]:
class richpoor:
    def __init__(self):
        # standard settings
        self.rp_coeff = 1030000
        self.rp_range = 15000
        self.features = ['sqft_living', 'sqft_above', 'bathrooms']
        self.dummies = ['grade', 'zipcode', 'view']
    def fit_rp(self, dataFrame):
        """ fit model to decide rich or poor """
        dataset = get_train_test_sets_wdummies(self.features, self.dummies, dataFrame, 'price')
        x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']

        rp_model = LinearRegression().fit(x_train, y_train)
        self.rp_model = rp_model
        
        df_rich = dataFrame[dataFrame.price > self.rp_coeff - self.rp_range]
        if df_rich.shape[0] < 2:
            print("range or coeff is set badly")

        dataset = get_train_test_sets_wdummies(self.features, self.dummies, df_rich, 'price')
        x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']

        model_rich = LinearRegression().fit(x_train, y_train)
        self.model_rich = model_rich
        
        df_poor = dataFrame[dataFrame.price < self.rp_coeff + self.rp_range]
        if df_poor.shape[0] < 2:
            print("range or coeff is set badly")

        dataset = get_train_test_sets_wdummies(self.features, self.dummies, df_poor, 'price')
        x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']

        model_poor = LinearRegression().fit(x_train, y_train)
        self.model_poor = model_poor
        
    def predict(self, x_test):

        # Predict if rich or poor
        y_pred = self.rp_model.predict(x_test)
        mask_rich = (y_pred > self.rp_coeff)
        mask_poor = (y_pred <= self.rp_coeff)

        # Calculate all predicts
        y_poor = self.model_poor.predict(x_test)
        y_rich = self.model_rich.predict(x_test)

        # Get the correct values from both sets
        y_pred = y_rich * mask_rich + y_poor * mask_poor
        
        return y_pred

In [64]:
# Get datasets
dataset = get_train_test_sets_wdummies(features, dummies, h_price, 'price')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']

In [65]:
model = richpoor()
model.fit_rp(h_price)
mape(y_test, model.predict(x_test))

0.16039578384439485

In [80]:
# Test different settings
model = richpoor()
l = []
count = 0
#range1 = list(range(int(h_price.price.min())+10000, int(h_price.price.max())-1000000, 100000))
#range2 = [100, 300, 500, 1000, 3000, 5000, 10000, 100000, 500000, 1000000, 3000000]
#### best mape for coefficients (1088000, 10000) is 0.15616
range1 = list(range(900000, 2000000, 10000))
range2 = [5000, 15000, 1000]
num = len(range1) * len(range2)
for rp_coeff in range1:
    for rp_range in range2:
        count += 1
        print('{:.2f}% done!'.format(count/num*100), end='\r')
        model.rp_coeff = rp_coeff
        model.rp_range = rp_range
        model.fit_rp(h_price)
        mp = mape(y_test, model.predict(x_test))
        l.append([(rp_coeff, rp_range), mp])

100.00% done!

In [81]:
# Get best model parameters
l = sorted(l,key=lambda x: x[1])#, reverse=True)
for i in l:
    print('mape for coefficients {} is {:.5f}'.format(i[0], i[1]))

mape for coefficients (1030000, 15000) is 0.15552
mape for coefficients (1170000, 5000) is 0.15600
mape for coefficients (1170000, 1000) is 0.15600
mape for coefficients (1180000, 5000) is 0.15612
mape for coefficients (1180000, 1000) is 0.15612
mape for coefficients (1160000, 15000) is 0.15612
mape for coefficients (940000, 1000) is 0.15613
mape for coefficients (1080000, 15000) is 0.15616
mape for coefficients (1140000, 15000) is 0.15621
mape for coefficients (1190000, 15000) is 0.15623
mape for coefficients (1100000, 5000) is 0.15630
mape for coefficients (1100000, 1000) is 0.15630
mape for coefficients (1120000, 5000) is 0.15636
mape for coefficients (1120000, 1000) is 0.15636
mape for coefficients (1020000, 5000) is 0.15640
mape for coefficients (1020000, 1000) is 0.15640
mape for coefficients (1150000, 5000) is 0.15640
mape for coefficients (1150000, 1000) is 0.15640
mape for coefficients (950000, 1000) is 0.15640
mape for coefficients (1090000, 5000) is 0.15642
mape for coeffici

In [96]:
# Loop over all features
price_std = h_price.copy()
x = 2.5
for i in ['price', 'sqft_living']:
    std = price_std[i].std()
    mean = price_std[i].mean()
    print('{} {}'.format(i, price_std[abs(price_std[i] - mean) > std*x][i].count()))
    price_std = price_std[abs(price_std[i] - mean) < std*x]
print(price_std.shape)

price 565
sqft_living 405
(20625, 24)


In [110]:
model = richpoor()
model.fit_rp(price_std)

features = ['sqft_living', 'sqft_above', 'bathrooms']
dummies = ['grade', 'zipcode', 'view']
model.features = features
model.dummies = dummies
dataset = get_train_test_sets_wdummies(features, dummies, h_price, 'price')
x_train, y_train, x_test, y_test = dataset['x_train'], dataset['y_train'], dataset['x_test'], dataset['y_test']
mape(y_test, model.predict(x_test))

0.15509822965282283