# 4 Pre-Processing and Training Data<a id='4_Pre-Processing_and_Training_Data'></a>

## 4.3 Imports<a id='4.3_Imports'></a>

In [3]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

import datetime

#from library.sb_utils import save_file

## 4.4 Load Data<a id='4.4_Load_Data'></a>

In [4]:
# Load the feature-engineered house data from the working directory.
# The preview shows an 'Unnamed: 0' column, i.e. the CSV appears to carry a saved index.
FEATURES_CSV = 'kc_house_data_features.csv'
house_data = pd.read_csv(filepath_or_buffer=FEATURES_CSV)

# Preview the first five rows.
house_data.head(5)

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
date,20141013T000000,20141209T000000,20150225T000000,20141209T000000,20150218T000000
price,221900.0,538000.0,180000.0,604000.0,510000.0
bedrooms,3,3,2,4,3
bathrooms,1.0,2.25,1.0,3.0,2.0
sqft_living,1180,2570,770,1960,1680
sqft_lot,5650,7242,10000,5000,8080
floors,1.0,2.0,1.0,1.0,1.0
waterfront,0,0,0,0,0
view,0,0,0,0,0


## 4.6 Train/Test Split<a id='4.6_Train/Test_Split'></a>

In [12]:
# Predictor columns passed to the model; 'price' is kept as a one-column DataFrame.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_living15', 'sqft_lot15',
]
features_frame = house_data[feature_columns]
price_frame = house_data[['price']]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features_frame,
    price_frame,
    test_size=0.15,
    random_state=1,
)

n_test_rows, n_train_rows = X_test.shape[0], X_train.shape[0]
print("number of test samples:", n_test_rows)
print("number of training samples:", n_train_rows)

number of test samples: 3240
number of training samples: 18356


In [13]:
# Sanity check: (rows, columns) of the feature matrices for the train and test splits.
(X_train.shape, X_test.shape)

((18356, 11), (3240, 11))

In [14]:
# Sanity check: (rows, columns) of the target frames for the train and test splits.
(Y_train.shape, Y_test.shape)

((18356, 1), (3240, 1))

In [16]:
# NOTE: despite the `std_` prefix, these names hold min-max scaled values
# (MinMaxScaler), not standardized ones. The names are kept because the
# model-fitting cell reads `X_train_std`.
std_scaler = MinMaxScaler()

# Fit on the training split only, so no information from the test split
# leaks into the scaling parameters, then rescale the training features.
std_scaler.fit(X_train)
X_train_std = std_scaler.transform(X_train)

# Reuse the already-fitted scaler on the test split (no refit).
X_test_std = std_scaler.transform(X_test)

# The scaler returns a plain array; wrap it in a DataFrame with the
# original feature names so the preview is readable.
X_test_std_df = pd.DataFrame(data=X_test_std, columns=X_train.columns)

X_test_std_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_living15,sqft_lot15
0,0.1,0.166667,0.064541,0.004762,0.0,0.0,0.0,0.5,0.3,0.120633,0.008881
1,0.2,0.3,0.18527,0.007447,0.4,0.0,0.0,0.5,0.5,0.256582,0.011125
2,0.3,0.266667,0.21716,0.012879,0.4,0.0,0.0,0.5,0.6,0.487179,0.024271
3,0.3,0.266667,0.245254,0.012887,0.4,0.0,0.0,0.75,0.8,0.518155,0.02207
4,0.1,0.266667,0.069096,0.000181,0.4,0.0,0.0,0.5,0.5,0.15333,0.002583


In [25]:

# Ordinary least-squares regression of price on the scaled training features.
linear_model = LinearRegression()

# Last expression of the cell: fit() returns the estimator, so it is displayed.
linear_model.fit(X=X_train_std, y=Y_train)

In [26]:
# Report the fitted linear model's parameters.
# The feature names come from `X_train` (capital X), the frame produced by the
# train/test split. The previous lowercase `x_train` is not defined anywhere in
# the visible notebook and raises NameError on a fresh kernel.
# Coefficients are printed in the same order as the listed columns, because the
# model was fit on the scaled array built from `X_train`.
print(f'Intercept: {linear_model.intercept_}')
print(f'Columns: {X_train.columns}')
print(f'Coefficients: {linear_model.coef_}')

Intercept: [-295986.22716152]
Columns: Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_living15',
       'sqft_lot15'],
      dtype='object')
Coefficients: [[-396806.07949756  -72629.40927101 2684345.70817188   59799.2678725
   -36785.52907598  590116.21052011  246332.29751904  219444.13526584
  1000974.88080029  -11127.0328719  -730178.84785382]]
