In [1]:
import matplotlib.pyplot as plt
import requests
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [18]:
# Fetch parcel data with a plain requests.get call (no session or `with` block is used here).
#%%
# ST. Louis County url 
#url = 'https://opendata.arcgis.com/datasets/4382d4a88f15450fbd6fb28ac9c64996_7.geojson'
# ST. Louis County url with only first 100 land parcels shown
#url = 'https://arcgiswapub.stlouiscountymn.gov/arcgis/rest/services/GeneralUse/OpenData/MapServer/7/query?where=OBJECTID%20%3E%3D%200%20AND%20OBJECTID%20%3C%3D%20100&outFields=*&outSR=4326&f=json'
# only useful columns with 100 samples
#url = 'https://arcgiswapub.stlouiscountymn.gov/arcgis/rest/services/GeneralUse/OpenData/MapServer/7/query?where=OBJECTID%20%3E%3D%200%20AND%20OBJECTID%20%3C%3D%20100&outFields=ACREAGE,TOWNSHIP,TAXABLE_BUILDING,NET_TAX,PHYSZIP,Shape,Shape.STArea(),TAX_YR,TOT_SPEC_ASMT,TAXABLE_LAND_VALUE,DEEDED_ACRES,SCH_DIST,LAND_EST,BalDue,ASMT_YR,Shape.STLength(),PHYSADDR,TaxableMarketValue,BUILDING,EstTotalValue,PHYSCITY&outSR=4326&f=json'
# Active query: parcels with 0 <= OBJECTID <= 100 (the `where` clause is URL-encoded),
# limited to the outFields listed in the URL, returned as JSON (f=json) with outSR=4326.
url = 'https://arcgiswapub.stlouiscountymn.gov/arcgis/rest/services/GeneralUse/OpenData/MapServer/7/query?where=OBJECTID%20%3E%3D%200%20AND%20OBJECTID%20%3C%3D%20100&outFields=ACREAGE,TOWNSHIP,TAXABLE_BUILDING,NET_TAX,PHYSZIP,Shape,Shape.STArea(),TAX_YR,TOT_SPEC_ASMT,TAXABLE_LAND_VALUE,DEEDED_ACRES,SCH_DIST,LAND_EST,BalDue,ASMT_YR,Shape.STLength(),PHYSADDR,TaxableMarketValue,BUILDING,EstTotalValue,PHYSCITY&outSR=4326&f=json'
# First 1000                                                                                                                                                          # Here's the difference
#url = 'https://arcgiswapub.stlouiscountymn.gov/arcgis/rest/services/GeneralUse/OpenData/MapServer/7/query?where=OBJECTID%20%3E%3D%200%20AND%20OBJECTID%20%3C%3D%201000&outFields=ACREAGE,TOWNSHIP,TAXABLE_BUILDING,NET_TAX,PHYSZIP,Shape,Shape.STArea(),TAX_YR,TOT_SPEC_ASMT,TAXABLE_LAND_VALUE,DEEDED_ACRES,SCH_DIST,LAND_EST,BalDue,ASMT_YR,Shape.STLength(),PHYSADDR,TaxableMarketValue,BUILDING,EstTotalValue,PHYSCITY&outSR=4326&f=json'
# Import all data
#url = 'https://arcgiswapub.stlouiscountymn.gov/arcgis/rest/services/GeneralUse/OpenData/MapServer/7/query?where=1%3D1&outFields=ACREAGE,TAXABLE_BUILDING,PHYSZIP,Shape,Shape.STArea(),TAX_YR,TAXABLE_LAND_VALUE,SCH_DIST,LAND_EST,Shape.STLength(),PHYSADDR,TaxableMarketValue,BUILDING,EstTotalValue,PHYSCITY&outSR=4326&f=json'
# Package the request, send it, and keep the response in r.
# requests has no default timeout, so without one this cell can hang forever
# if the server stalls; the tuple is (connect, read) seconds.
r = requests.get(url, timeout=(10, 120))
# Stop here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# with a confusing JSON-decoding or KeyError message.
r.raise_for_status()
#%%
# Decode the JSON body of the response into Python objects
json_data = r.json()

In [None]:
# Names of the fields, in the order the service reports them under 'fields'
field_names = [field['name'] for field in json_data['fields']]

# One dict of attribute values per feature. Handing pandas the whole list of
# records builds the frame in a single pass; growing each column with
# `list + [val]` copied the list on every row.
records = [observation['attributes'] for observation in json_data['features']]

# Convert to a DataFrame, keeping the column order reported by the service
df = pd.DataFrame(records, columns=field_names)

# Drop all rows with NA values because they are not useful right now, then fix
# variable types to make regression modeling easier. Casting in the same chain
# produces a new frame, which avoids the SettingWithCopyWarning raised when
# assigning columns on the result of dropna().
df_clean = df.dropna().astype({'PHYSZIP': 'int64', 'TAX_YR': 'int64'})

In [None]:
# Preview the first rows; leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of plain text.
df_clean.head()

In [None]:
#%%
# Simple linear regression of estimated total value on building value.
# sm.OLS does not add an intercept by itself, so a constant column is added
# explicitly; without it the fitted line is forced through the origin and does
# not match the regression line seaborn draws below.
building_exog = sm.add_constant(df_clean['BUILDING'])
model = sm.OLS(endog=df_clean['EstTotalValue'], exog=building_exog).fit()
predictions = model.predict(building_exog)

# Observed points with the fitted line on top
plt.scatter(df_clean['BUILDING'], df_clean['EstTotalValue'])
plt.plot(df_clean['BUILDING'], predictions)

# Same relationship via seaborn (opens its own figure). x and y are passed by
# keyword because current seaborn releases no longer accept them positionally.
sns.lmplot(x='BUILDING', y='EstTotalValue', data=df_clean)
#%%
# Plot the data to see how it looks. x and y are passed by keyword because
# current seaborn releases no longer accept them positionally.
# Land estimate against acreage, one color per school district
sns.lmplot(x='ACREAGE', y='LAND_EST', data=df, hue='SCH_DIST')
# Total estimated value against the land estimate
sns.lmplot(x='LAND_EST', y='EstTotalValue', data=df)
# Total estimated value against building value, one row of plots per ZIP code
sns.lmplot(x='BUILDING', y='EstTotalValue', data=df, row='PHYSZIP')

#%%
# Leave these five columns out before plotting
df_num = df_clean.drop(
    columns=['PHYSZIP', 'TAX_YR', 'PHYSADDR', 'PHYSCITY', 'SCH_DIST']
)
# Plot every remaining variable against every other to get an idea of how they behave
sns.pairplot(df_num)
# From this it was clear that NET_TAX, DEEDED_ACRES, ASMT, BAL_DUE and ASMT_YR were just 0's, so I removed them all from the data set.
# School district didn't show any variance and was removed too, but this might just be because of the smaller dataset.
#%% 

In [None]:
# REGRESSION MODELING

# No categorical predictors are used in this model, and the data has no
# `famhist` column, so no category conversion is done here. To include a
# discrete variable, wrap it in C(...) inside the formula, e.g. C(SCH_DIST).

# Estimated total value explained by acreage, building value and their
# interaction (`*` expands to both main effects plus ACREAGE:BUILDING).
est = smf.ols(formula='EstTotalValue ~ ACREAGE * BUILDING', data=df).fit()

# Show the coefficient table and fit statistics
est.summary()