In [1]:
import pandas as pd
import numpy as np
import csv
from matplotlib import pyplot as plt
import seaborn as sns


In [3]:
# Load the county-level CSV (per its file name: median value per square foot,
# all homes), keeping only the county name and the May/June 2017 columns.
df = pd.read_csv(
    "County_MedianValuePerSqFt_AllHomes.csv",
    usecols=["RegionName", "2017-05", "2017-06"],
)

In [4]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,RegionName,2017-05,2017-06
0,Los Angeles County,389,390
1,Cook County,156,157
2,Harris County,91,91
3,Maricopa County,139,140
4,San Diego County,343,345


In [5]:
# Count missing values in each column.
df.isnull().sum()

RegionName    0
2017-05       0
2017-06       0
dtype: int64

In [6]:
# Show row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1805 entries, 0 to 1804
Data columns (total 3 columns):
RegionName    1805 non-null object
2017-05       1805 non-null int64
2017-06       1805 non-null int64
dtypes: int64(2), object(1)
memory usage: 42.4+ KB


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,2017-05,2017-06
count,1805.0,1805.0
mean,107.272022,107.751247
std,79.173178,79.584081
min,21.0,21.0
25%,65.0,66.0
50%,89.0,89.0
75%,125.0,126.0
max,1463.0,1466.0


In [10]:
# Average both months per county name.
# NOTE(review): RegionName is not unique -- grouping collapses the 1805 input
# rows into 1202 groups -- so counties that share a name (presumably in
# different states) are averaged together here.
# TODO: load a state identifier and add it to the grouping key.
df_by_county = df.groupby("RegionName").mean()
df_by_county.head()

Unnamed: 0_level_0,2017-05,2017-06
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1
Ada County,145.0,146.0
Adair County,61.0,61.0
Adams County,130.6,131.2
Aiken County,74.0,75.0
Alachua County,105.0,106.0


In [19]:
# Month-over-month change: June 2017 value minus May 2017 value, per group.
df_by_county["delta_05_to_06"] = (
    df_by_county["2017-06"] - df_by_county["2017-05"]
)
df_by_county.head()

Unnamed: 0_level_0,2017-05,2017-06,delta_05_to_06
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ada County,145.0,146.0,1.0
Adair County,61.0,61.0,0.0
Adams County,130.6,131.2,0.6
Aiken County,74.0,75.0,1.0
Alachua County,105.0,106.0,1.0


In [20]:
# Confirm the grouped frame's size and that the new delta column has no nulls.
df_by_county.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1202 entries, Ada County to Yuma County
Data columns (total 3 columns):
2017-05           1202 non-null float64
2017-06           1202 non-null float64
delta_05_to_06    1202 non-null float64
dtypes: float64(3)
memory usage: 37.6+ KB


In [21]:
# Largest value in each column of the per-county table.
df_by_county.max()

2017-05           1463.0
2017-06           1466.0
delta_05_to_06      13.0
dtype: float64

In [22]:
# Index label (the RegionName group) where each column reaches its maximum.
df_by_county.idxmax()

2017-05           New York County
2017-06           New York County
delta_05_to_06    Gunnison County
dtype: object

In [23]:
# Smallest value in each column of the per-county table.
df_by_county.min()

2017-05           23.0
2017-06           24.0
delta_05_to_06    -2.0
dtype: float64

In [24]:
# Index label (the RegionName group) where each column reaches its minimum.
df_by_county.idxmin()

2017-05                Baylor County
2017-06                Baylor County
delta_05_to_06    Clear Creek County
dtype: object

#### Project Walkthrough

-First Iteration: 

    Have a baseline by the end of Thursday
    
        -Drop NULLs, scale, model (can skip scale if you want)
        -sqft of home, number of bedrooms, and number of bathrooms
        
-Second Iteration:
    
    Have an MVP by the end of Friday with same features from baseline
    
-SQL Best Practice:

    -Limit the data you pull in to 100 rows when you request from database
    
    -Filter the data in the query itself to limit what you bring in
    
    -You should end up with around 20,000 records ONLY
    
    -Subqueries will run first, but you probably won't have to do one.
    
        For this project, the query is pretty straightforward.
        
    -Predictions and Properties: make sure you're using the tables for the right year. The last transaction date is in the predictions table, which is what you use to filter for the "hot months."
    
-Work in your notebook because it is the report you will deliver.

    Document everything you do and functions you use, so that someone can reproduce your work.

x = features that best predict the value of the house 
y = assessed value (value of the house)

-Distribution chart of tax rate by county, a histogram by count of properties at each tax rate

-Report and Presentation summarizing your findings about the drivers of the assessed value of a home. Presentation 5 minutes or less.

-A github repo

-You do not need to do feature selection for this project

### SQL Query

In [None]:
-- List the transaction dates that fall in May or June 2017.
-- Each month needs its own LIKE comparison: a bare string placed after AND is
-- evaluated as a standalone boolean operand and is never matched against
-- transactiondate, so the June pattern was being ignored.
SELECT transactiondate
FROM predictions_2017
WHERE transactiondate LIKE '2017-05%'
   OR transactiondate LIKE '2017-06%'
ORDER BY transactiondate;

In [None]:
-- Pull the modelling columns for single-family properties with a transaction
-- in May or June 2017.
-- NOTE(review): USING(id) assumes `id` identifies the same property in both
-- predictions_2017 and properties_2017 -- confirm against the schema.
-- NOTE(review): the project notes list home square footage as a baseline
-- feature, but no square-footage column is selected here -- add it once the
-- column name is confirmed.
SELECT bedroomcnt,
       bathroomcnt,
       taxamount,
       taxvaluedollarcnt,  -- comma was missing: the next name was parsed as an alias
       propertylandusedesc,
       propertylandusetypeid,
       fips
FROM predictions_2017
JOIN properties_2017 USING(id)
JOIN propertylandusetype USING(propertylandusetypeid)
-- AND binds tighter than OR, so the land-use alternatives are parenthesised to
-- keep the date filter applied to both of them.
WHERE (transactiondate LIKE '2017-05%' OR transactiondate LIKE '2017-06%')
  AND (
        propertylandusetypeid = '261'
        OR (propertylandusetypeid = '279'
            AND propertylandusedesc = 'Single Family Residential')
      )
ORDER BY fips;