# Maps for Data Visualizations

## Using folium to map house price data by zipcode and latitude/longitude

In [3]:
# import libraries
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import folium
from folium.plugins import HeatMap
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [4]:
# read the house sales records from the local CSV file
df = pd.read_csv('kc_house_data.csv')

In [5]:
# show the loaded frame (the notebook renders a truncated view of it)
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,8,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,8,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,7,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,,0.0,...,8,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287


In [6]:
# zipcodes are categorical labels rather than quantities, so store them as strings
df['zipcode'] = df['zipcode'].astype(str)
df['zipcode']

0        98178
1        98125
2        98028
3        98136
4        98074
         ...  
21592    98103
21593    98146
21594    98144
21595    98027
21596    98144
Name: zipcode, Length: 21597, dtype: object

## Heatmap of locations

In [7]:
# define a function to draw a basemap easily
def generateBaseMap(default_location=[47.5112, -122.257], default_zoom_start=9.4):
    """Return a new folium map centred on ``default_location``.

    default_location: [latitude, longitude] used as the map centre.
    default_zoom_start: initial zoom level handed to ``folium.Map``.
    The map is created with ``control_scale=True``.
    """
    return folium.Map(
        location=default_location,
        zoom_start=default_zoom_start,
        control_scale=True,
    )


# row of the house which has the highest price
maxpr = df.loc[df['price'].idxmax()]

# work on a copy so the helper column below does not end up in df
df_copy = df.copy()
# every sale gets a weight of 1; the weights are summed per location further down
df_copy['count'] = 1

basemap = generateBaseMap()
# add the CartoDB Positron tiles as an additional tile layer
folium.TileLayer('cartodbpositron').add_to(basemap)

# feature group that holds the marker for the most expensive house
s = folium.FeatureGroup(name='icon').add_to(basemap)
top_price_label = 'Highest Price: $' + format(maxpr['price'], '.0f')
folium.Marker(
    [maxpr['lat'], maxpr['long']],
    popup=top_price_label,
    icon=folium.Icon(color='green'),
).add_to(s)

# heatmap rows are [lat, long, summed count of sales at that exact lat/long pair]
sales_per_location = df_copy[['lat', 'long', 'count']].groupby(['lat', 'long']).sum().reset_index()
HeatMap(
    data=sales_per_location.values.tolist(),
    radius=8,
    max_zoom=13,
    name='Heat Map',
).add_to(basemap)

folium.LayerControl(collapsed=False).add_to(basemap)
# basemap

<folium.map.LayerControl at 0x1f611c88c50>

In [None]:
# uncomment to display, file size is too large for github
#basemap

## Map for all price ranges by zipcode

In [11]:
# base map for the all-prices choropleth; the initial zoom is set through
# `zoom_start`, which is the keyword folium.Map uses for it
kc_map = folium.Map(location=[47.5112, -122.257], zoom_start=9.4)
# kc_map

In [12]:
# Color each zipcode polygon by price using the Choropleth class, which replaces
# the deprecated Map.choropleth method.
# NOTE(review): `df` holds many rows per zipcode; confirm how folium picks the
# single value it colors each area by -- a frame aggregated to one row per
# zipcode may be what is intended here.
# The trailing `;` keeps the cell output empty, as it was before.
folium.Choropleth(
    geo_data="Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson",
    data=df,  # dataset
    columns=['zipcode', 'price'],  # first column is matched against the geojson key, second drives the color
    key_on='feature.properties.ZIPCODE',  # geojson property compared with our (string) zipcode column
    fill_color='BuPu', fill_opacity=0.7, line_opacity=0.3,
    legend_name='Price',
).add_to(kc_map);

In [None]:
# uncomment to display, file size is too large for github
#kc_map

The map and color scale do not accurately reflect the data, so we need to remove outliers.

In [13]:
# drop every sale priced above $1.3 million
above_cap = df['price'] > 1300000
df_low_price = df.drop(index=df.index[above_cap])
df_low_price

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,8,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,8,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,7,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,,0.0,...,8,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287


## Map for prices by zipcode with high-end homes removed

In [14]:
# base map for the choropleth without high-end homes; the initial zoom is set
# through `zoom_start`, which is the keyword folium.Map uses for it
kc_map_low = folium.Map(location=[47.6112, -122.257], zoom_start=9.4)
# kc_map_low

In [15]:
# Color each zipcode polygon by price using the Choropleth class, which replaces
# the deprecated Map.choropleth method.
# NOTE(review): `df_low_price` still holds many rows per zipcode; confirm how
# folium picks the single value it colors each area by.
# The trailing `;` keeps the cell output empty, as it was before.
folium.Choropleth(
    geo_data="Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson",
    data=df_low_price,  # sales at or below the $1.3 million cap
    columns=['zipcode', 'price'],  # first column is matched against the geojson key, second drives the color
    key_on='feature.properties.ZIPCODE',  # geojson property compared with our (string) zipcode column
    fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.3,
    legend_name='Price',
).add_to(kc_map_low);

In [None]:
# uncomment to display, file size is too large for github
#kc_map_low

Next, we also remove the lowest-priced houses (those under $150,000).

In [16]:
# drop every sale priced below $150,000 from the already-capped frame
below_floor = df_low_price['price'] < 150000
df_mid = df_low_price.drop(index=df_low_price.index[below_floor])
df_mid

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,8,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,8,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,7,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,,0.0,...,8,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287


## Map for low and high end homes removed

In [1]:
# base map for the choropleth without low- and high-end homes; the initial zoom
# is set through `zoom_start`, which is the keyword folium.Map uses for it
kc_map_mid = folium.Map(location=[47.6112, -122.257], zoom_start=9.4)
# kc_map_mid

NameError: name 'folium' is not defined

In [17]:
# Color each zipcode polygon by price using the Choropleth class, which replaces
# the deprecated Map.choropleth method.
# NOTE(review): `df_mid` still holds many rows per zipcode; confirm how folium
# picks the single value it colors each area by.
folium.Choropleth(
    geo_data="Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson",
    data=df_mid,
    columns=['zipcode', 'price'],
    key_on='feature.properties.ZIPCODE',
    fill_color='GnBu', fill_opacity=0.7, line_opacity=0.3,
    legend_name='Price',
).add_to(kc_map_mid)
kc_map_mid

NameError: name 'kc_map_mid' is not defined

# Building Dataframe for Averages for Zipcodes

In [19]:
# average every numeric column per zipcode; non-numeric columns (e.g. the date
# strings) cannot be averaged, so they are excluded explicitly instead of
# relying on pandas to drop them silently
zipcode_means = df_mid.groupby('zipcode').mean(numeric_only=True).reset_index()

# number of homes in each zipcode, counted directly so that df_mid is never
# modified and no other column has to be summed
count_houses_zipcode = df_mid.groupby('zipcode').size().reset_index(name='count')

# one row per zipcode: the averages plus the home count
zipcode_data = pd.merge(zipcode_means, count_houses_zipcode, how='left', on=['zipcode'])
zipcode_data.head()

Unnamed: 0,zipcode,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,count
0,98001,4664922000.0,284916.0,3.422096,2.03187,1925.17847,15010.141643,1.44051,0.0,0.096866,3.33711,7.320113,1741.070822,1981.351275,21.57971,47.309577,-122.270861,1843.206799,11223.133144,353
1,98002,4810465000.0,240542.941489,3.382979,1.885638,1663.515957,7551.542553,1.353723,0.0,0.010638,3.75,6.75,1552.154255,1969.06383,63.685897,47.309158,-122.213138,1500.143617,7619.840426,188
2,98003,4677967000.0,295263.158273,3.359712,2.054856,1935.888489,10601.402878,1.31295,0.0,0.215827,3.374101,7.546763,1668.647482,1977.043165,16.781513,47.315621,-122.310032,1880.053957,9752.298561,278
3,98004,4008513000.0,888927.960894,3.513966,2.019553,2101.564246,11024.117318,1.290503,0.0,0.061798,3.586592,7.782123,1736.648045,1963.351955,148.621622,47.612227,-122.200916,2367.374302,10978.636872,179
4,98005,5050382000.0,770053.84472,3.84472,2.375776,2583.496894,18792.118012,1.254658,0.0,0.099379,3.720497,8.360248,2069.273292,1968.956522,45.416667,47.610488,-122.16754,2515.720497,17732.484472,161


In [20]:
# check the per-zipcode summary: column dtypes and non-null counts
zipcode_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 20 columns):
zipcode          70 non-null object
id               70 non-null float64
price            70 non-null float64
bedrooms         70 non-null float64
bathrooms        70 non-null float64
sqft_living      70 non-null float64
sqft_lot         70 non-null float64
floors           70 non-null float64
waterfront       70 non-null float64
view             70 non-null float64
condition        70 non-null float64
grade            70 non-null float64
sqft_above       70 non-null float64
yr_built         70 non-null float64
yr_renovated     70 non-null float64
lat              70 non-null float64
long             70 non-null float64
sqft_living15    70 non-null float64
sqft_lot15       70 non-null float64
count            70 non-null int64
dtypes: float64(18), int64(1), object(1)
memory usage: 11.5+ KB


## Functions to Display Maps to Save Memory

In [21]:
# function to display houses sold per zipcode
def show_zipcode_map(col, legend_name='Number of Houses Sold per Zipcode', fill_color='YlOrRd'):
    """Return a folium map with zipcode areas colored by ``zipcode_data[col]``.

    col: name of the ``zipcode_data`` column that drives the color.
    legend_name: caption of the color legend; the default matches the
        intended use with the 'count' column.
    fill_color: name of the color scheme passed to folium.

    Reads the notebook-level frames ``df_mid`` (for the map centre) and
    ``zipcode_data`` (for the values).
    """
    # geojson file containing the zipcode polygons
    geo_path = 'Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson'
    # centre the base map on the mean latitude/longitude of df_mid
    zipcode_map = folium.Map(location=[df_mid['lat'].mean(), df_mid['long'].mean()], zoom_start=9.4)
    # Choropleth class replaces the deprecated Map.choropleth method
    folium.Choropleth(
        geo_data=geo_path,
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
        legend_name=legend_name,
    ).add_to(zipcode_map)
    return zipcode_map

In [2]:
# draw the choropleth of the number of homes per zipcode
show_zipcode_map('count')

NameError: name 'folium' is not defined

In [22]:
# function for average sale price for zipcodes
def show_zipcode_price_map(col, legend_name='Average Sale Price', fill_color='GnBu'):
    """Return a folium map with zipcode areas colored by ``zipcode_data[col]``.

    col: name of the ``zipcode_data`` column that drives the color.
    legend_name: caption of the color legend; the default matches the
        intended use with the 'price' column.
    fill_color: name of the color scheme passed to folium.

    Reads the notebook-level frames ``df_mid`` (for the map centre) and
    ``zipcode_data`` (for the values).
    """
    # geojson file containing the zipcode polygons
    geo_path = 'Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson'
    # centre the base map on the mean latitude/longitude of df_mid
    zipcode_map = folium.Map(location=[df_mid['lat'].mean(), df_mid['long'].mean()], zoom_start=9.4)
    # Choropleth class replaces the deprecated Map.choropleth method
    folium.Choropleth(
        geo_data=geo_path,
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
        legend_name=legend_name,
    ).add_to(zipcode_map)
    return zipcode_map

In [4]:
# draw the choropleth of the average sale price per zipcode
show_zipcode_price_map('price')

NameError: name 'show_zipcode_price_map' is not defined

In [23]:
# function for home size average by zipcode
def show_zipcode_sqft_living_map(col, legend_name='Average Home Living Area Square Ft', fill_color='GnBu'):
    """Return a folium map with zipcode areas colored by ``zipcode_data[col]``.

    col: name of the ``zipcode_data`` column that drives the color.
    legend_name: caption of the color legend; the default matches the
        intended use with the 'sqft_living' column.
    fill_color: name of the color scheme passed to folium.

    Reads the notebook-level frames ``df_mid`` (for the map centre) and
    ``zipcode_data`` (for the values).
    """
    # geojson file containing the zipcode polygons
    geo_path = 'Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson'
    # centre the base map on the mean latitude/longitude of df_mid
    zipcode_map = folium.Map(location=[df_mid['lat'].mean(), df_mid['long'].mean()], zoom_start=9.4)
    # Choropleth class replaces the deprecated Map.choropleth method
    folium.Choropleth(
        geo_data=geo_path,
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
        legend_name=legend_name,
    ).add_to(zipcode_map)
    return zipcode_map

In [None]:
# uncomment to display, file size is too large for github
#show_zipcode_sqft_living_map('sqft_living')

In [27]:
# function for average bedrooms in home by zipcode
def show_zipcode_bed_map(col, legend_name='Average # of Bedrooms per Home', fill_color='PuRd'):
    """Return a folium map with zipcode areas colored by ``zipcode_data[col]``.

    col: name of the ``zipcode_data`` column that drives the color.
    legend_name: caption of the color legend; the default matches the
        intended use with the 'bedrooms' column.
    fill_color: name of the color scheme passed to folium.

    Reads the notebook-level frames ``df_mid`` (for the map centre) and
    ``zipcode_data`` (for the values).
    """
    # geojson file containing the zipcode polygons
    geo_path = 'Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson'
    # centre the base map on the mean latitude/longitude of df_mid
    zipcode_map = folium.Map(location=[df_mid['lat'].mean(), df_mid['long'].mean()], zoom_start=9.4)
    # Choropleth class replaces the deprecated Map.choropleth method
    folium.Choropleth(
        geo_data=geo_path,
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
        legend_name=legend_name,
    ).add_to(zipcode_map)
    return zipcode_map

In [None]:
# uncomment to display, file size is too large for github
#show_zipcode_bed_map('bedrooms')

In [29]:
# function for average bathrooms in home by zipcode
def show_zipcode_bath_map(col, legend_name='Average # of Bathrooms per Home', fill_color='PuRd'):
    """Return a folium map with zipcode areas colored by ``zipcode_data[col]``.

    col: name of the ``zipcode_data`` column that drives the color.
    legend_name: caption of the color legend; the default matches the
        intended use with the 'bathrooms' column.
    fill_color: name of the color scheme passed to folium.

    Reads the notebook-level frames ``df_mid`` (for the map centre) and
    ``zipcode_data`` (for the values).
    """
    # geojson file containing the zipcode polygons
    geo_path = 'Zipcodes_for_King_County_and_Surrounding_Area_Shorelines__zipcode_shore_area.geojson'
    # centre the base map on the mean latitude/longitude of df_mid
    zipcode_map = folium.Map(location=[df_mid['lat'].mean(), df_mid['long'].mean()], zoom_start=9.4)
    # Choropleth class replaces the deprecated Map.choropleth method
    folium.Choropleth(
        geo_data=geo_path,
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
        legend_name=legend_name,
    ).add_to(zipcode_map)
    return zipcode_map

In [None]:
# uncomment to display, file size is too large for github
#show_zipcode_bath_map('bathrooms')

Maps can be a good tool to visualize our data and draw conclusions for geographic parameters that models may not be able to fully explain.