In [1]:
import graphlab
from __future__ import division

In [2]:
sales = graphlab.SFrame('data/home_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\emirl\AppData\Local\Temp\graphlab_server_1477046951.log.0


This non-commercial license of GraphLab Create for academic use is assigned to emir.lej@gmail.com and will expire on September 20, 2017.


In [3]:
sales.head(n=5)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900,3,1.0,1180,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000,3,2.25,2570,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000,2,1.0,770,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000,4,3.0,1960,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000,3,2.0,1680,8080,1,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0


In [4]:
sales.dtype

<bound method SFrame.dtype of Columns:
	id	str
	date	datetime
	price	int
	bedrooms	str
	bathrooms	str
	sqft_living	int
	sqft_lot	int
	floors	str
	waterfront	int
	view	int
	condition	int
	grade	int
	sqft_above	int
	sqft_basement	int
	yr_built	int
	yr_renovated	int
	zipcode	str
	lat	float
	long	float
	sqft_living15	float
	sqft_lot15	float

Rows: 21613

Data:
+------------+---------------------------+---------+----------+-----------+-------------+
|     id     |            date           |  price  | bedrooms | bathrooms | sqft_living |
+------------+---------------------------+---------+----------+-----------+-------------+
| 7129300520 | 2014-10-13 00:00:00+00:00 |  221900 |    3     |     1     |     1180    |
| 6414100192 | 2014-12-09 00:00:00+00:00 |  538000 |    3     |    2.25   |     2570    |
| 5631500400 | 2015-02-25 00:00:00+00:00 |  180000 |    2     |     1     |     770     |
| 2487200875 | 2014-12-09 00:00:00+00:00 |  604000 |    4     |     3     |     1960    |
| 195440051

# Question 1

In [5]:
import graphlab.aggregate as agg

# Find the zipcode with largest mean prices
sales.groupby('zipcode', operations={'avg_price': agg.MEAN('price')}).sort('avg_price', ascending=False).head(5)

zipcode,avg_price
98039,2160606.6
98004,1355927.09779
98040,1194230.00355
98112,1095499.36803
98102,901258.238095


# Question 2

In [6]:
total_rows = sales.num_rows()
filter_rows = sales[(sales['sqft_living'] >= 2000) & (sales['sqft_living'] <=4000)].num_rows()

# Houses in the area 2000 <= sqft_living <= 4000
filter_rows / total_rows

0.4266413732475825

# Question 3

Linear regression

## Features

In [7]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

In [8]:
train_data, test_data = sales.random_split(fraction=0.8, seed=0)

In [9]:
# Build the models
my_linreg = graphlab.linear_regression.create(train_data, 
                                              target="price",
                                              verbose=False,
                                              features=my_features,
                                              validation_set=None)

adv_linreg = graphlab.linear_regression.create(train_data, 
                                               target="price",
                                               verbose=False,
                                               features=advanced_features,
                                               validation_set=None)

In [10]:
# Evaluate the results
print my_linreg.evaluate(test_data)
print adv_linreg.evaluate(test_data)

# Graphlab my_features rmse
gl_rmse = my_linreg.evaluate(test_data)['rmse']

{'max_error': 3486584.509381705, 'rmse': 179542.4333126903}
{'max_error': 3556849.413858208, 'rmse': 156831.1168021901}


In [11]:
my_linreg.evaluate(test_data)['rmse'] - adv_linreg.evaluate(test_data)['rmse']

22711.316510500183

# Comparing with scikit learn

Will only compare the my_features model

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
#from sklearn.cross_validation import train_test_split

In [13]:
# Change from SFrame to pandas dataframe
train_data = train_data.to_dataframe()
test_data = test_data.to_dataframe()

In [14]:
# Set up the training and testing sets
X_train = train_data[my_features]
X_test = test_data[my_features]
y_train = train_data['price']
y_test = test_data['price']

In [15]:
# Check the shape
print X_train.shape
print y_train.shape

(17384, 6)
(17384L,)


In [16]:
linreg = LinearRegression(normalize=True, )

In [17]:
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [18]:
# Get the metrics function
from sklearn.metrics import mean_squared_error as mse

def rmse(y, y_pred):
    return(np.sqrt(mse(y, y_pred)))

In [19]:
y_pred = linreg.predict(X_test)

In [20]:
# Difference between graphlab and sklearn LinearRegression
print rmse(y_test, y_pred) # Sklearn
print gl_rmse

251491.159318
179542.433313
