# Reading and cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.head(5)

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [4]:
df2 = pd.read_csv("properties_2017.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df2.head(5)

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [6]:
# we're going to drop a lot of this data,
#    Since our X is going to be entered by the user.
df = pd.DataFrame({"Zip": df['regionidzip'],"YearBuilt": df['yearbuilt'],
                   "LotSize": df['lotsizesquarefeet'],
                   "Bedrooms": df['bedroomcnt'],
                   "Bathrooms": df['bathroomcnt'],
                   "Value": df['taxvaluedollarcnt']})

In [7]:
df2 = pd.DataFrame({"Zip": df2['regionidzip'],"YearBuilt": df2['yearbuilt'],
                   "LotSize": df2['lotsizesquarefeet'],
                   "Bedrooms": df2['bedroomcnt'],
                   "Bathrooms": df2['bathroomcnt'],
                   "Value": df2['taxvaluedollarcnt']})

In [8]:
# Check our shape too see how many observations we have:
df.shape

(2985217, 6)

In [9]:
df2.shape

(2985217, 6)

In [10]:
for col in df:
  print(col)

Zip
YearBuilt
LotSize
Bedrooms
Bathrooms
Value


In [11]:
df = df.append(df2)

In [12]:
df.shape

(5970434, 6)

In [13]:
df.head(5)

Unnamed: 0,Zip,YearBuilt,LotSize,Bedrooms,Bathrooms,Value
0,96337.0,,85768.0,0.0,0.0,9.0
1,96337.0,,4083.0,0.0,0.0,27516.0
2,96095.0,,63085.0,0.0,0.0,1413387.0
3,96424.0,1948.0,7521.0,0.0,0.0,1156834.0
4,96450.0,1947.0,8512.0,0.0,0.0,433491.0


In [14]:
dfc = df.dropna()

In [15]:
dfc.shape

(5385914, 6)

In [16]:
df = dfc.copy()

# Statistics

In [17]:
import scipy.stats as scs

In [18]:
yermask = df['YearBuilt'] < 2000
lowyear = df[yermask]

In [19]:
scs.ttest_ind(lowyear['Value'],df['Value'])

Ttest_indResult(statistic=-82.98715691147723, pvalue=0.0)

In [20]:
yermask = df['YearBuilt'] > 2005
highyear = df[yermask]

In [21]:
scs.ttest_ind(highyear['Value'],df['Value'])

Ttest_indResult(statistic=301.4451067825913, pvalue=0.0)

Strong statistical significance for the year's effect on the value; The year is a good feature for predicting the price.

In [22]:
# Get an idea of what sort of lotsize we can expect
print(sum(df['LotSize'])/len(df['LotSize']))

22505.373238841912


In [23]:
# Above the mean
#    And below the mean:
below = df['LotSize'] < 22844
above = df['LotSize'] < 22844
low = df[below]
high = df[above]

In [24]:
scs.ttest_ind(low['Value'],df['Value'])

Ttest_indResult(statistic=-48.54074088760842, pvalue=0.0)

In [25]:
scs.ttest_ind(high['Value'],df['Value'])

Ttest_indResult(statistic=-48.54074088760842, pvalue=0.0)

**Suprisingly,** Area is not as significant of a feature, definitely not what I was expecting.

# Preprocessing

In [26]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df)

In [27]:
target = 'Value'
trainX = train.drop(columns='Value')
trainy = train[target]
testy = test[target]
testX = test.drop(columns='Value')


# Trying Models

In [28]:
from sklearn.pipeline import make_pipeline
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [29]:
pipl = make_pipeline(
    StandardScaler(),
    LogisticRegression() 
)

In [None]:
pipl.fit(trainX,trainy)



In [None]:
rr = pipl.predict(testX)

# Accuracy

# Pickle