In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from math import sqrt

In [3]:
# Load the listings and tidy them in one chain: discard the columns regarded
# as insignificant, then treat a missing reviews_per_month as zero.
airbnb = (
    pd.read_csv("AB_NYC_2019.csv")
    .drop(columns=['name', 'id', 'host_name', 'last_review'])
    .fillna({'reviews_per_month': 0})
)
airbnb.head(3)

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,0.0,1,365


In [28]:
# Price values at the 0.33 and 0.66 quantiles of the listings
quantiles = airbnb['price'].quantile(q=[0.33, 0.66])
quantiles

0.33     80.0
0.66    150.0
Name: price, dtype: float64

In [31]:


def categorize_price(price, low=80.0, high=150.0):
    """Label a price as below, inside, or above the band [low, high].

    The default cut points (80 and 150) match the 0.33 and 0.66 price
    quantiles computed earlier in this notebook, and with those defaults the
    returned labels are exactly 'Below 80', 'Above 80 and below 150' and
    'Above 150'.

    Args:
        price: Numeric price to classify.
        low: Lower cut point; prices strictly below it get the 'Below' label.
        high: Upper cut point; prices strictly above it get the 'Above' label.
            Both cut points themselves belong to the middle band.

    Returns:
        The label string, or None when the price is missing (NaN/None), so a
        missing value is not silently counted in the top band.
    """
    # Every comparison against NaN is False, which would otherwise fall
    # through to the last branch and mislabel a missing price as 'Above ...'.
    if pd.isna(price):
        return None
    if price < low:
        return f'Below {low:g}'
    if price <= high:
        return f'Above {low:g} and below {high:g}'
    return f'Above {high:g}'

# Derive the price band label of every listing and store it as a new column
airbnb['price_category'] = airbnb['price'].map(categorize_price)

In [4]:
# Group the listings by neighbourhood_group; the grouped object is reused by later cells
airbnb_group = airbnb.groupby(by='neighbourhood_group')
# Mean price within each group
airbnb_group['price'].aggregate(['mean'])

Unnamed: 0_level_0,mean
neighbourhood_group,Unnamed: 1_level_1
Bronx,87.496792
Brooklyn,124.383207
Manhattan,196.875814
Queens,99.517649
Staten Island,114.812332


In [7]:
# Listing count and mean price per neighbourhood_group.
new_df = airbnb_group.agg({'price': ['count', 'mean']})
new_df.columns = ['Count', 'Mean']
# 'neighbourhood_group' intentionally stays as the index: reset_index() returns
# a new frame, so calling it without using the result changes nothing.

# Square roots taken column-wise (vectorised) rather than element by element.
new_df['Sqrt_Count'] = np.sqrt(new_df['Count'])
new_df['Sqrt_Mean'] = np.sqrt(new_df['Mean'])

# Express each square-rooted column relative to its median across the groups.
count_median = new_df['Sqrt_Count'].median()
mean_median = new_df['Sqrt_Mean'].median()
new_df['Sqrt_Count_Div_Median'] = new_df['Sqrt_Count'] / count_median
new_df['Sqrt_Mean_Div_Median'] = new_df['Sqrt_Mean'] / mean_median

# NOTE(review): index=False together with this column selection means the CSV
# carries no neighbourhood_group labels; rows are identifiable only by their
# order. Confirm the consumer of this file expects that.
new_df[['Count', 'Mean', 'Sqrt_Count_Div_Median', 'Sqrt_Mean_Div_Median']].to_csv('district_scales_data.csv', index=False)
new_df

Unnamed: 0_level_0,Count,Mean,Sqrt_Count,Sqrt_Mean,Sqrt_Count_Div_Median,Sqrt_Mean_Div_Median
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bronx,1091,87.496792,33.030289,9.353972,0.438808,0.872975
Brooklyn,20104,124.383207,141.788575,11.152722,1.883662,1.040846
Manhattan,21661,196.875814,147.176764,14.031244,1.955244,1.309489
Queens,5666,99.517649,75.272837,9.975853,1.0,0.931013
Staten Island,373,114.812332,19.313208,10.715052,0.256576,1.0


In [6]:
# Listing count and mean price per (neighbourhood_group, room_type) pair.
# `airbnb_group2` is kept as a second name for the same aggregated frame.
new_df2 = airbnb_group2 = airbnb.groupby(['neighbourhood_group', 'room_type']).agg({'price': ['count', 'mean']})
new_df2.columns = ['Count', 'Mean']
# The two grouping keys intentionally stay as the index: reset_index() returns
# a new frame, so calling it without using the result changes nothing.

# Square roots taken column-wise (vectorised) rather than element by element.
new_df2['Sqrt_Count'] = np.sqrt(new_df2['Count'])
new_df2['Sqrt_Mean'] = np.sqrt(new_df2['Mean'])
new_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Mean,Sqrt_Count,Sqrt_Mean
neighbourhood_group,room_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,Entire home/apt,379,127.506596,19.467922,11.291882
Bronx,Private room,652,66.788344,25.534291,8.172414
Bronx,Shared room,60,59.8,7.745967,7.733046
Brooklyn,Entire home/apt,9559,178.327545,97.770139,13.353934
Brooklyn,Private room,10132,76.500099,100.657836,8.746433
Brooklyn,Shared room,413,50.527845,20.322401,7.108294
Manhattan,Entire home/apt,13199,249.239109,114.886901,15.787308
Manhattan,Private room,7982,116.776622,89.342039,10.806323
Manhattan,Shared room,480,88.977083,21.908902,9.432766
Queens,Entire home/apt,2096,147.050573,45.782093,12.126441


In [8]:
# Every distinct neighbourhood and neighbourhood_group name (set order is arbitrary).
nodes = list(set(airbnb['neighbourhood']) | set(airbnb['neighbourhood_group']))

# One link per listing row, from its neighbourhood to its neighbourhood_group.
# Both columns are walked once in parallel instead of doing two .iloc scalar
# lookups per row; repeated pairs are kept, exactly one entry per row.
links = [
    {'source': neighbourhood, 'target': group}
    for neighbourhood, group in zip(airbnb['neighbourhood'], airbnb['neighbourhood_group'])
]




In [23]:
def get_sankey(src, tar, df=None):
    """Count rows for each (src, tar) value pair and return them as link records.

    Args:
        src: Name of the column whose values become each record's 'source'.
        tar: Name of the column whose values become each record's 'target'.
        df: DataFrame to summarise. Defaults to the notebook-level ``airbnb``
            frame, so existing two-argument calls behave exactly as before.

    Returns:
        A list of dicts with the keys 'source', 'target' and 'value', where
        'value' is the number of rows in that (src, tar) group.
    """
    # Fall back to the notebook's global frame only when no frame is supplied.
    if df is None:
        df = airbnb
    pair_counts = df.groupby([src, tar]).size().reset_index(name='value')
    link_table = pair_counts.rename(columns={src: 'source', tar: 'target'})
    return link_table.to_dict(orient='records')

In [38]:
# First layer of links: neighbourhood_group -> room_type
sankey_list1 = get_sankey(src='neighbourhood_group', tar='room_type')
# Second layer of links: room_type -> price_category
sankey_list2 = get_sankey(src='room_type', tar='price_category')

# Both layers combined into one list, first layer first
sankey_list = [*sankey_list1, *sankey_list2]
sankey_list

[{'source': 'Bronx', 'target': 'Entire home/apt', 'value': 379},
 {'source': 'Bronx', 'target': 'Private room', 'value': 652},
 {'source': 'Bronx', 'target': 'Shared room', 'value': 60},
 {'source': 'Brooklyn', 'target': 'Entire home/apt', 'value': 9559},
 {'source': 'Brooklyn', 'target': 'Private room', 'value': 10132},
 {'source': 'Brooklyn', 'target': 'Shared room', 'value': 413},
 {'source': 'Manhattan', 'target': 'Entire home/apt', 'value': 13199},
 {'source': 'Manhattan', 'target': 'Private room', 'value': 7982},
 {'source': 'Manhattan', 'target': 'Shared room', 'value': 480},
 {'source': 'Queens', 'target': 'Entire home/apt', 'value': 2096},
 {'source': 'Queens', 'target': 'Private room', 'value': 3372},
 {'source': 'Queens', 'target': 'Shared room', 'value': 198},
 {'source': 'Staten Island', 'target': 'Entire home/apt', 'value': 176},
 {'source': 'Staten Island', 'target': 'Private room', 'value': 188},
 {'source': 'Staten Island', 'target': 'Shared room', 'value': 9},
 {'sour

In [39]:
import json
from pathlib import Path

# Destination for the combined link records held in `sankey_list`
# (a list of dicts, not a single dictionary).
json_file = 'data/counts.json'

# open() does not create missing folders, so make sure the target directory
# exists before writing.
Path(json_file).parent.mkdir(parents=True, exist_ok=True)

# Serialise the list of link records as JSON
with open(json_file, 'w') as file:
    json.dump(sankey_list, file)