In [221]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from __future__ import division

%matplotlib inline

In [222]:
# Read file
df = pd.read_csv("./data/home_data.csv")

# Write only a couple of lines to a csv file. 
# Easier to look at the data in e.g. Excel
df[1:6].to_csv("./data/home_data_small.csv", sep=";")

In [223]:
# Size of the dataframe
df.shape

(21613, 21)

In [224]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [225]:
# List all features and look at the data types
df.dtypes
# Print an array of the values using
# df.columns.values

id                 int64
date              object
price              int64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [226]:
def change_coltype(df, colList, newType=None):
    """Change dtype for specific columns in the dataframe"""
    if newType:
        for col in colList:
            df[col] = df[col].astype(newType)
    else:
        raise AssertionError("No newType entered")
    return df
        
# New column types
category_cols = ["waterfront", "view", "condition", "grade", "zipcode"]
float_cols = ["price"]

df = change_coltype(df, category_cols, newType="category")
df = change_coltype(df, float_cols, newType="float")

In [227]:
df.dtypes

id                  int64
date               object
price             float64
bedrooms            int64
bathrooms         float64
sqft_living         int64
sqft_lot            int64
floors            float64
waterfront       category
view             category
condition        category
grade            category
sqft_above          int64
sqft_basement       int64
yr_built            int64
yr_renovated        int64
zipcode          category
lat               float64
long              float64
sqft_living15       int64
sqft_lot15          int64
dtype: object

### Question 1: 

Selection and summary statistics: We found the zip code with the highest average house price. What is the average house price of that zip code?

In [228]:
# Get the house with specific zipcode and print the 
df.groupby("zipcode")['price'].mean().sort_values(ascending=False).head()

zipcode
98039    2.160607e+06
98004    1.355927e+06
98040    1.194230e+06
98112    1.095499e+06
98102    9.012582e+05
Name: price, dtype: float64

### Question 2:
Filtering data: What fraction of the houses have living space between 2000 sq.ft. and 4000 sq.ft.?

In [229]:
nrows_all = df.shape[0]
nrows_subset = df[(df.sqft_living >= 2000) & (df.sqft_living <= 4000)].shape[0]

nrows_subset / nrows_all

0.4266413732475825

# Building a regression model

- Compute the RMSE (root mean squared error) on the test_data for the model using just my_features, and for the one using advanced_features.

In [267]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split

## Feature possibilities

Will compare the RMSE on the test data using the two feature possibilites under:

In [268]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

In [269]:
# Setting up the two feature matrices
my_X = df[my_features]; adv_X = df[advanced_features]

# Labels are the same for both
y = df.price # Labels

In [270]:
print my_X.shape
print y.shape

(21613, 6)
(21613L,)


### My features learning

In [271]:
def setup_train_test(X, y, train_size=0.8, seed=0):
    return(train_test_split(X, y, train_size=train_size, random_state=seed))

In [272]:
# Split to train and test set
my_X_train, my_X_test, y_train, y_test = setup_train_test(my_X, y)

In [273]:
my_X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,zipcode
5268,3,1.0,1570,5510,1.0,98115
16909,3,2.5,1780,11000,1.0,98006
16123,3,1.5,1090,9862,1.0,98074
12181,4,2.5,2210,7079,2.0,98031
12617,3,2.5,1800,4763,2.0,98119


In [274]:
# Set up classifier
my_linreg = Ridge(alpha=0.01, normalize=True)

In [275]:
# Fit the parameters
my_linreg.fit(my_X_train, y_train)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [276]:
print my_linreg.coef_
print my_linreg.intercept_

[ -5.35891612e+04   1.64180263e+04   3.11770241e+02  -2.66470823e-01
  -2.01464473e+03   6.08124987e+02]
-59596985.7027


### Predict my_features

In [277]:
my_y_pred = my_linreg.predict(my_X)

### Advanced features learning

In [278]:
# Check the dimensions
print adv_X.shape
print y.shape

(21613, 18)
(21613L,)


In [279]:
# Create train and test sets
adv_X_train, adv_X_test, y_train, y_test = setup_train_test(adv_X, y)

In [280]:
# Set up linear reg.
adv_linreg = LinearRegression(normalize=True)

In [281]:
# Fit the model
adv_linreg.fit(adv_X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [282]:
adv_y_pred = adv_linreg.predict(adv_X)

# Compare the models

In [283]:
def rss(y, y_pred):
    """
    Calculate the residual sum of squares of two 1-D arrays.
    """
    
    #print (y-y_pred)**2
    
    return np.sum((y-y_pred)**2)

def rmse(y, y_pred):
    """
    Calculate the root mean squared error of two 1-D arrays.
    """
    n = len(y)    
    return np.sqrt(rss(y, y_pred) / n)


# Testing the functions
test_y, test_y_pred = np.array([3.4, 5]), np.array([3.3, 5.6])

assert round(rss(test_y, test_y_pred), 4) == 0.370
assert round(rmse(test_y, test_y_pred), 4) == 0.4301

In [284]:
print rmse(y, my_y_pred)
print rmse(y, adv_y_pred)

255689.264165
201209.444071


In [219]:
my_rmse = rmse(y_test, my_y_pred)
adv_rmse = rmse(y_test, adv_y_pred)

adv_rmse - my_rmse

ValueError: operands could not be broadcast together with shapes (4323,) (21613,) 