In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [6]:
# Explicit per-column dtypes handed to pd.read_csv for the kc_house_*.csv files,
# grouped here by target type.
dtype_dict = {
    # parsed as text
    'id': str, 'date': str, 'zipcode': str, 'floors': str,
    # parsed as integers
    'waterfront': int, 'view': int, 'condition': int, 'grade': int,
    'sqft_above': int, 'sqft_basement': int, 'sqft_lot': int,
    'yr_built': int, 'yr_renovated': int,
    # parsed as floats
    'price': float, 'bedrooms': float, 'bathrooms': float,
    'sqft_living': float, 'sqft_living15': float, 'sqft_lot15': float,
    'lat': float, 'long': float,
}

# Full data set; the 'date' column is additionally parsed into datetimes.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
    parse_dates=['date'],
)
# add the new features
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add four engineered feature columns to ``df`` and return it.

    The frame is modified in place (the columns are assigned directly on
    ``df``) and the same object is returned, so both ``add_features(df)`` and
    ``df = add_features(df)`` leave ``df`` augmented.

    Columns added:
        bedrooms_squared: ``bedrooms * bedrooms``
        bed_bath_rooms:   ``bedrooms * bathrooms``
        log_sqft_living:  natural log of ``sqft_living``
        lat_plus_long:    ``long + lat``

    Args:
        df: frame containing the columns 'bedrooms', 'bathrooms',
            'sqft_living', 'lat' and 'long'.

    Returns:
        The same ``df`` object, with the four columns above added (or
        overwritten if they already exist).

    Raises:
        KeyError: if one of the required input columns is missing.
    """
    df['bedrooms_squared'] = df['bedrooms'] * df['bedrooms']
    df['bed_bath_rooms'] = df['bedrooms'] * df['bathrooms']
    # np.log is a ufunc: applying it to the Series transforms the whole column
    # at once instead of calling a Python lambda once per row.
    df['log_sqft_living'] = np.log(df['sqft_living'])
    df['lat_plus_long'] = df['long'] + df['lat']

    return df
    
# Augment the full data set with the engineered columns.
sales = add_features(sales)

# Training split: read with the same dtypes/date parsing, then augment.
train_data = add_features(
    pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict, parse_dates=['date'])
)

# Test split: same treatment as the training split.
test_data = add_features(
    pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict, parse_dates=['date'])
)

In [7]:
# Preview the first five rows, including the engineered columns.
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,2014-10-13,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,9.0,3.0,7.07327,-74.7458
1,6414100192,2014-12-09,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,9.0,6.75,7.851661,-74.598
2,5631500400,2015-02-25,180000.0,2.0,1.0,770.0,10000,1,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,4.0,2.0,6.646391,-74.4951
3,2487200875,2014-12-09,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,16.0,12.0,7.5807,-74.8722
4,1954400510,2015-02-18,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,9.0,6.0,7.426549,-74.4282


In [27]:
# Mean of the engineered lat_plus_long column over test_data.
test_data.loc[:, 'lat_plus_long'].mean()

-74.65333355403185

In [9]:
from sklearn.linear_model import LinearRegression

In [13]:
# Model 1: linear regression of price on five columns of train_data.
# fit() returns the estimator itself, so construction and fitting are chained.
model_1 = LinearRegression().fit(
    X=train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']],
    y=train_data['price'],
)
# Fitted coefficients, one per feature in the order listed above.
model_1.coef_

array([  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
         6.58619264e+05,  -3.09374351e+05])

In [33]:
# Residual sum of squares of model_1's predictions against test_data prices.
np.square(
    test_data['price'].sub(
        model_1.predict(test_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']])
    )
).sum()

225500469795490.25

In [32]:
# Model 2: the model-1 feature set plus the bed_bath_rooms interaction column.
# fit() returns the estimator itself, so construction and fitting are chained.
model_2 = LinearRegression().fit(
    X=train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']],
    y=train_data['price'],
)
# Fitted coefficients, one per feature in the order listed above.
model_2.coef_

array([  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
         6.54844630e+05,  -2.94298969e+05,   2.55796520e+04])

In [34]:
# Residual sum of squares of model_2's predictions against test_data prices.
np.square(
    test_data['price'].sub(
        model_2.predict(
            test_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']]
        )
    )
).sum()

223377462976466.56

In [16]:
# Model 3: the model-2 feature set plus bedrooms_squared, log_sqft_living and
# lat_plus_long.
# NOTE(review): lat_plus_long is computed as long + lat, so it is an exact linear
# combination of two other inputs; the individual coefficients for those three
# columns are therefore not uniquely determined.
model_3 = LinearRegression().fit(
    X=train_data[[
        'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms',
        'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
    ]],
    y=train_data['price'],
)
# Fitted coefficients, one per feature in the order listed above.
model_3.coef_

array([  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
         5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
        -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05])

In [35]:
# Residual sum of squares of model_3's predictions against test_data prices.
np.square(
    test_data['price'].sub(
        model_3.predict(
            test_data[[
                'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms',
                'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
            ]]
        )
    )
).sum()

259236319207179.47