In [3]:
import os

import pandas as pd

# Load the training subset.
# The data directory defaults to the original author's local folder; set the
# ZILLOW_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_dir = os.environ.get('ZILLOW_DATA_DIR', '/Users/David/Desktop/Zillow')
training_sample = pd.read_table(os.path.join(data_dir, 'training_sample.tsv'))
training_sample.head(3)

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,random_value
0,13839640,-0.0131,2016-09-23,,,,1.0,3.0,,,...,,32214.0,241762.0,2015.0,209548.0,3084.48,,,60590750000000.0,-4.187693
1,14648482,-0.006,2016-09-14,,,,2.0,2.0,,,...,,54034.0,90539.0,2015.0,36505.0,889.76,,,60590630000000.0,-4.04245
2,13096102,-0.0294,2016-09-15,1.0,,,3.0,3.0,,4.0,...,,120968.0,267866.0,2015.0,146898.0,3605.78,,,60374080000000.0,-4.030566


In [4]:
# List the column labels of the training sample
training_sample.columns

Index(['parcelid', 'logerror', 'transactiondate', 'airconditioningtypeid',
       'architecturalstyletypeid', 'basementsqft', 'bathroomcnt', 'bedroomcnt',
       'buildingclasstypeid', 'buildingqualitytypeid', 'calculatedbathnbr',
       'decktypeid', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
       'finishedsquarefeet6', 'fips', 'fireplacecnt', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcn

In [10]:
# Null mask for the first three rows (True marks a missing value)
training_sample.head(3).isnull()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,random_value
0,False,False,False,True,True,True,False,False,True,True,...,True,False,False,False,False,False,True,True,False,False
1,False,False,False,True,True,True,False,False,True,True,...,True,False,False,False,False,False,True,True,False,False
2,False,False,False,False,True,True,False,False,True,False,...,True,False,False,False,False,False,True,True,False,False


In [11]:
# Number of missing values in each column
training_sample.isnull().sum(axis=0)

parcelid                            0
logerror                            0
transactiondate                     0
airconditioningtypeid           11523
architecturalstyletypeid        16951
basementsqft                    16993
bathroomcnt                         3
bedroomcnt                          3
buildingclasstypeid             16996
buildingqualitytypeid            6126
calculatedbathnbr                 191
decktypeid                      16869
finishedfloor1squarefeet        15627
calculatedfinishedsquarefeet       96
finishedsquarefeet12              834
finishedsquarefeet13            16993
finishedsquarefeet15            16338
finishedsquarefeet50            15627
finishedsquarefeet6             16931
fips                                3
fireplacecnt                    15114
fullbathcnt                       191
garagecarcnt                    11390
garagetotalsqft                 11390
hashottuborspa                  16592
heatingorsystemtypeid            6322
latitude    

In [13]:
# Two columns look like candidates for the square-footage measure; preview them side by side
sqft_candidates = ['calculatedfinishedsquarefeet', 'finishedsquarefeet12']
training_sample[sqft_candidates].head()

Unnamed: 0,calculatedfinishedsquarefeet,finishedsquarefeet12
0,1056.0,1056.0
1,1188.0,1188.0
2,1300.0,1300.0
3,1231.0,1231.0
4,1352.0,1352.0


In [27]:
# Count the rows where the two square-footage candidates are not equal.
# NaN never compares equal, so every row with a missing value in either column is counted too.
# On this sample the count matches the number of nulls in finishedsquarefeet12, which suggests
# the two columns agree wherever finishedsquarefeet12 is populated.
# calculatedfinishedsquarefeet has fewer nulls/NaNs, so it is used as the square footage measure.
sqft_calculated = training_sample['calculatedfinishedsquarefeet']
sqft_12 = training_sample['finishedsquarefeet12']
(sqft_calculated != sqft_12).sum()

834

In [43]:
# First-pass linear regression uses four features.
# yearbuilt is treated as continuous, which might be problematic; proceeding for now.
# Compute each feature's median so it can be used to populate missing data.
(bedroomcnt_median,
 calculatedfinishedsquarefeet_median,
 lotsizesquarefeet_median,
 yearbuilt_median) = (
    training_sample[feature].median()
    for feature in ('bedroomcnt', 'calculatedfinishedsquarefeet',
                    'lotsizesquarefeet', 'yearbuilt')
)


In [44]:
# Pull the rows whose bedroomcnt is missing; they serve as spot-check cases after imputation
missing_bedroomcnt = training_sample['bedroomcnt'].isnull()
training_sample[missing_bedroomcnt]

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,random_value
972,11905748,0.084014,2017-01-18,,,,,,,,...,,,,,,,,,,-2.536321
4765,12133448,-1.062436,2017-01-05,,,,,,,,...,,,,,,,,,,-1.905818
12277,12039176,0.175751,2017-01-24,,,,,,,,...,,,,,,,,,,-1.45537


In [49]:
# Replace NaN values with the per-feature medians computed earlier in the notebook.
# Filling through the DataFrame (rather than calling fillna(inplace=True) on a
# column selection) avoids chained assignment, which newer pandas versions warn
# about and which does not modify the frame under Copy-on-Write.
median_fill_values = {
    'bedroomcnt': bedroomcnt_median,
    'calculatedfinishedsquarefeet': calculatedfinishedsquarefeet_median,
    'lotsizesquarefeet': lotsizesquarefeet_median,
    'yearbuilt': yearbuilt_median,
}
training_sample = training_sample.fillna(value=median_fill_values)

# Fail fast if any of the imputed features still contains a NaN
assert not training_sample[list(median_fill_values)].isnull().values.any()

# Check the NaN values have been replaced
training_sample[training_sample.parcelid == 11905748]

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,random_value
972,11905748,0.084014,2017-01-18,,,,,3.0,,,...,,,,,,,,,,-2.536321


In [50]:
# Build the independent (X) and dependent (y) variables for the regression
from sklearn.preprocessing import scale

# X holds the four selected features after scaling; y is the logerror column
feature_columns = ['bedroomcnt', 'calculatedfinishedsquarefeet', 'lotsizesquarefeet', 'yearbuilt']
X = scale(training_sample[feature_columns])
y = training_sample['logerror']


In [51]:
# Fit a linear regression model on the scaled features
from sklearn.linear_model import LinearRegression
regression = LinearRegression().fit(X, y)

# Report R^2, scored on the same data the model was fitted to.
# A value around 0.001 means these features explain almost none of the variance in logerror.
r_squared = regression.score(X, y)
print(r_squared)

0.00121149339181
