In [1]:
import numpy as np # NumPy is the fundamental package for scientific computing

import pandas as pd # Pandas is an easy-to-use data structures and data analysis tools
pd.set_option('display.max_columns', None) # To display all columns

import matplotlib.pyplot as plt # Matplotlib is a python 2D plotting library
%matplotlib inline 
# A magic command that tells matplotlib to render figures as static images in the Notebook.

import seaborn as sns # Seaborn is a visualization library based on matplotlib (attractive statistical graphics).
sns.set_style('whitegrid') # One of the five seaborn themes
import warnings
warnings.filterwarnings('ignore') # To ignore some of seaborn warning msg

from scipy import stats, linalg

import folium # for map visualization
from folium import plugins

### Load data

In [9]:
# Load the full listings table. The map cells below work from listings_map.csv
# instead; this file is read again in the correlation section.
data_all = pd.read_csv("listings.csv")

In [16]:
# Load the location/price table that drives the map visualisations below.
data_map = pd.read_csv("listings_map.csv")

In [17]:
# Keep only listings that have a zipcode.
# The column contains strings such as "10003-8623", so a numeric comparison
# (`zipcode >= 0`) raises TypeError on Python 3; test for missing values
# explicitly instead.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the raw table.
data_map = data_map[data_map['zipcode'].notna()].copy()

In [18]:
# Convert the address-like text columns to ordered categoricals and the zipcode
# to plain strings.
# `astype('category', ordered=True)` is no longer accepted by pandas; the
# ordering flag has to be carried by a CategoricalDtype instance instead.
ordered_category = pd.api.types.CategoricalDtype(ordered=True)
category_columns = ['street', 'neighbourhood', 'neighbourhood_cleansed', 'city', 'state']

column_dtypes = {column: ordered_category for column in category_columns}
column_dtypes['zipcode'] = str  # zipcodes are identifiers, not numbers

# A single dict-based astype returns a new frame, so re-running the cell is safe.
data_map = data_map.astype(column_dtypes)

In [19]:
# Check the conversions above: address fields should be `category`, zipcode `object` (str).
data_map.dtypes

id                                 int64
street                          category
neighbourhood                   category
neighbourhood_cleansed          category
neighbourhood_group_cleansed      object
city                            category
state                           category
zipcode                           object
latitude                         float64
longitude                        float64
price                              int64
dtype: object

In [20]:
# Preview the first two listings.
data_map.head(2)

Unnamed: 0,id,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,latitude,longitude,price
0,685006,"Ocean Avenue, Brooklyn, NY 11225, United States",Lefferts Garden,Prospect-Lefferts Gardens,Brooklyn,Brooklyn,NY,11225,40.661408,-73.96175,75
1,9461238,"Roebling Street, Brooklyn, NY 11211, United St...",Williamsburg,Williamsburg,Brooklyn,Brooklyn,NY,11211,40.71632,-73.957255,117


### Clustered number of listings

In [21]:
# Build a map centred on the mean coordinate of all listings and add one
# clustered marker per listing, with the price shown in the popup.
houses_map = folium.Map(
    location=[data_map['latitude'].mean(), data_map['longitude'].mean()],
    zoom_start=10,
)

# MarkerCluster is provided by folium.plugins (imported at the top of the
# notebook); it is not available as `folium.MarkerCluster` in current releases.
marker_cluster = plugins.MarkerCluster().add_to(houses_map)

# NOTE(review): MAX_RECORDS is defined but never applied -- every listing gets
# a marker. Kept in case a later cell relies on it; confirm whether a cap on
# the number of markers was intended.
MAX_RECORDS = 100

# Iterate over the three needed columns directly instead of iterrows(), which
# builds a full Series for every row.
for latitude, longitude, price in zip(data_map['latitude'],
                                      data_map['longitude'],
                                      data_map['price']):
    folium.Marker([latitude, longitude],
                  popup="Price {0}$".format(price)).add_to(marker_cluster)

In [22]:
# Sanity check: show the type of the map object built above.
type(houses_map)

folium.folium.Map

In [23]:
# Write the clustered-marker map to an HTML file in the working directory.
houses_map.save('houses.html')

### Map the average price by zipcode


#### Zipcode format

In [24]:
# Per-zipcode summary: mean of every numeric column plus the number of listings.
listings_by_zipcode = data_map.groupby('zipcode')

# numeric_only=True restricts the mean to id/latitude/longitude/price; averaging
# the category/object columns is undefined and raises in current pandas.
zipcode_data = listings_by_zipcode.mean(numeric_only=True)
zipcode_data = zipcode_data.reset_index()

# size() counts rows per group directly, so data_map no longer needs a
# temporary 'count' column that is added and then dropped again.
count_houses_zipcode = listings_by_zipcode.size().rename('count').reset_index()

zipcode_data = pd.merge(zipcode_data, count_houses_zipcode, how='left', on=['zipcode'])
zipcode_data.head(4)

Unnamed: 0,zipcode,id,latitude,longitude,price,count
0,10001,9434253.0,40.74909,-73.994349,222.150901,444
1,10002,8032648.0,40.717797,-73.988928,171.464988,1271
2,10003,8450358.0,40.730454,-73.987713,216.505396,1112
3,10003-8623,833927.0,40.724109,-73.990818,215.0,1


In [25]:
# Colour-scale break points for the choropleth: the minimum of the per-zipcode
# mean price followed by its 20th, 40th, 60th and 80th percentiles.
min_val = zipcode_data['price'].min()
q1, q2, q3, q4 = (
    zipcode_data['price'].quantile(level) for level in (.2, .4, .6, .8)
)

In [26]:
# Rename the key column to 'ZCTA5CE10', the feature property name the
# choropleth cell below joins on (key_on='feature.properties.ZCTA5CE10').
zipcode_data=zipcode_data.rename(columns = {'zipcode':'ZCTA5CE10'})

In [27]:
# Preview the renamed per-zipcode table.
zipcode_data.head(10)

Unnamed: 0,ZCTA5CE10,id,latitude,longitude,price,count
0,10001,9434253.0,40.74909,-73.994349,222.150901,444
1,10002,8032648.0,40.717797,-73.988928,171.464988,1271
2,10003,8450358.0,40.730454,-73.987713,216.505396,1112
3,10003-8623,833927.0,40.724109,-73.990818,215.0,1
4,10004,9234468.0,40.705678,-74.013235,196.078125,64
5,10005,11199640.0,40.705473,-74.00845,190.207692,130
6,10006,10248560.0,40.70856,-74.013841,210.528571,70
7,10007,8877462.0,40.714792,-74.008,368.44,25
8,10009,8271224.0,40.725959,-73.981486,166.396274,1181
9,10010,8787314.0,40.739312,-73.983671,224.125356,351


#### Plot

In [29]:
# Choropleth of the mean listing price per zipcode, written to NY_price.html.
state_geo = r'NY_zip.json'

# Explicit colour-scale edges: minimum, the four quantile cut points computed
# above, and the maximum. The top edge is required because current folium
# rejects bins that do not cover every data value.
price_bins = [min_val, q1, q2, q3, q4, zipcode_data['price'].max()]

# Named `price_map` rather than `map` so the builtin map() is not shadowed for
# the rest of the notebook.
price_map = folium.Map(
    location=[data_map['latitude'].mean(), data_map['longitude'].mean()],
    zoom_start=9,
)

# folium.Choropleth replaces the removed Map.choropleth method; `geo_path` is
# now `geo_data` and `threshold_scale` is now `bins`.
folium.Choropleth(
    geo_data=state_geo,
    data=zipcode_data,
    columns=['ZCTA5CE10', 'price'],
    key_on='feature.properties.ZCTA5CE10',
    bins=price_bins,
    fill_color='OrRd', fill_opacity=0.7, line_opacity=0.2,
    legend_name='price of Airbnb',
).add_to(price_map)

price_map.save('NY_price.html')

### Pearson Correlation Heat Map

In [30]:
# Re-read the full listings file (same path as in the "Load data" section).
data_all = pd.read_csv("listings.csv")

In [None]:
# Columns used for the correlation heat map.
cols = ['price',
        'accommodates',
        'bedrooms',
        'beds',
        'neighbourhood_cleansed',
        'room_type',
        'cancellation_policy',
        'instant_bookable',
        'reviews_per_month',
        'number_of_reviews',
        'availability_30',
        'review_scores_rating',
        # Candidate columns, currently excluded:
#         'property_type',
#         'bed_type',
#         'cleaning_fee',
        ]

# Read only the selected columns into a dataframe.
# The original call passed an undefined name (`listing`), which raises
# NameError; use the same CSV path as the other cells in this notebook.
df = pd.read_csv("listings.csv", usecols=cols)
