### AirBnB in New York City

In [1]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension('tabulator')

import hvplot.pandas

In [2]:
# Load the listings CSV (path is relative to the notebook's working directory)
# and preview the first rows.
airbnb = pd.read_csv("AB_NYC_2019.csv")
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Column dtypes and non-null counts for the raw frame
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [4]:
# Count the missing values in every column of the raw frame
airbnb.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

### Data Preprocessing🧹

In [5]:
# Clean the listings table as a single, non-mutating chain and re-bind the
# result to `airbnb`. This replaces `airbnb[col].fillna(..., inplace=True)`,
# which pandas treats as chained assignment: it is deprecated and does not
# update the parent frame when copy-on-write is enabled.
# NOTE: the rename below removes `reviews_per_month`, so re-running this cell
# requires re-running the CSV load cell first.
airbnb = (
    airbnb
    # Dropping any rows with missing values in the name or host_name column
    .dropna(subset=['name', 'host_name'])
    # Converting the last_review column to datetime format
    .assign(last_review=lambda df: pd.to_datetime(df['last_review']))
    .assign(
        # Impute missing values in the last_review column with the median value
        last_review=lambda df: df['last_review'].fillna(df['last_review'].median()),
        # Fill missing reviews_per_month values with the mean of the column
        reviews_per_month=lambda df: df['reviews_per_month'].fillna(df['reviews_per_month'].mean()),
    )
    # Renaming reviews_per_month to a more intuitive name: avg_reviews_per_month
    .rename(columns={'reviews_per_month': 'avg_reviews_per_month'})
    # Checking for and removing any duplicate rows
    .drop_duplicates()
    # Removing outliers: keep 10 <= price <= 1000 and 1 <= minimum_nights <= 30
    # (Series.between is inclusive on both ends by default)
    .loc[lambda df: df['price'].between(10, 1000) & df['minimum_nights'].between(1, 30)]
)

# The id and host_id columns are kept in the cleaned frame.

In [6]:
#checking null values
def check_for_nulls(df):
    """Report whether ``df`` contains nulls and return the per-column counts.

    Prints a one-line status message and always returns the null counts, so
    the caller can see which columns are affected when nulls are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Number of null values in each column of ``df``.
    """
    null_counts = df.isnull().sum()
    if null_counts.any():
        print("There are null values in the DataFrame.")
    else:
        print("All clean ⭐!")
    # Returned on both branches: the counts matter most when nulls exist.
    return null_counts

In [7]:
# Check the preprocessed frame for remaining null values
check_for_nulls(airbnb)

All clean ⭐!


id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
avg_reviews_per_month             0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [None]:
# Filter widgets: a minimum-price slider plus neighbourhood-group and
# room-type selectors, each selector offering an 'All' entry first.
min_price_slider = pn.widgets.FloatSlider(
    name="Minimum price (USD)",
    start=0,
    end=1000,
    step=10,
    value=0,
)
city_dropdown = pn.widgets.Select(
    name='City',
    options=['All', *airbnb['neighbourhood_group'].unique()],
)
room_type_dropdown = pn.widgets.Select(
    name="Room Type",
    options=['All', *airbnb['room_type'].unique()],
)

### Exploratory data analysis

In [16]:
# Display the minimum-price slider created in the widget-definition cell
# above; re-creating it here would leave two separate slider objects.
# NOTE(review): no plot in the visible cells reads this slider's value.
min_price_slider

In [17]:
# Display the neighbourhood-group selector created in the widget-definition
# cell above instead of building a second, identical widget.
city_dropdown

In [18]:
# Display the room-type selector created in the widget-definition cell above
# instead of building a second, identical widget.
room_type_dropdown

### 1. Price distribution by neighborhood group:

In [14]:
# Summary statistics of price within each neighbourhood group
price_by_ng = (
    airbnb
    .groupby('neighbourhood_group')['price']
    .describe()
    .loc[:, ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
)
price_by_ng

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bronx,1074.0,85.530726,78.219262,10.0,45.0,65.0,99.0,1000.0
Brooklyn,19781.0,117.894495,94.358514,10.0,60.0,90.0,150.0,1000.0
Manhattan,21064.0,179.187714,134.31954,10.0,95.0,149.0,220.0,1000.0
Queens,5602.0,95.220278,74.669288,10.0,50.0,75.0,110.0,1000.0
Staten Island,367.0,98.950954,96.568489,13.0,50.0,75.0,109.5,1000.0


In [34]:
# Bar chart of the mean price per neighbourhood group. It reuses the
# `price_by_ng` summary table built in the previous cell rather than
# recomputing the identical groupby, and the title now names the statistic
# that is actually plotted (the mean, not the full distribution).
price_by_ng_plot = price_by_ng.hvplot.bar(x='neighbourhood_group', y=['mean'], rot=90, width=800, height=400, title='Mean Price by Neighborhood Group')

pn.Row(price_by_ng_plot)

### 2. Geographic distribution and neighborhood pricing

In [20]:
# NOTE(review): disabled cell. It would compute the mean availability_365 per
# (neighbourhood_group, room_type) and draw it as a heatmap. Either restore it
# or delete the cell; as written it executes nothing.
#availability = airbnb.groupby(['neighbourhood_group', 'room_type'])['availability_365'].mean().reset_index(name='avg_availability')
#availability_plot = availability.hvplot.heatmap(x='room_type', y='neighbourhood_group', C='avg_availability', cmap='RdBu_r', width=800, height=400, title='Availability by Neighborhood Group and Room Type')

#pn.Row(availability_plot)

In [30]:
# Define the function to create the scatter plot
def create_geo_scatter(city, room_type):
    """Scatter listing coordinates, optionally filtered by group and room type.

    Parameters
    ----------
    city : str
        A ``neighbourhood_group`` value to keep, or ``'All'`` for no filter.
    room_type : str
        A ``room_type`` value to keep, or ``'All'`` for no filter.

    Returns
    -------
    The hvplot scatter of longitude vs. latitude, coloured by
    ``neighbourhood_group``.
    """
    # Boolean indexing already returns a new frame and nothing here mutates
    # the data, so no defensive copy of `airbnb` is needed.
    data = airbnb
    if city != 'All':
        data = data[data['neighbourhood_group'] == city]
    if room_type != 'All':
        data = data[data['room_type'] == room_type]

    # The original passed an unterminated `cmap = '` string (a SyntaxError);
    # the argument is omitted so hvplot's default colours are used.
    return data.hvplot.scatter(x='longitude', y='latitude',
                               by='neighbourhood_group',
                               title='Geographic Distribution')

# Bind the two dropdowns to the plotting function so the scatter is rebuilt
# from the current selections
geo_scatter = pn.interact(
    create_geo_scatter,
    city=city_dropdown,
    room_type=room_type_dropdown,
)

# Render the interactive layout inline
geo_scatter

In [23]:
# Box-plot builder used by the interactive neighbourhood pricing view
def create_neighborhood_box(city, room_type):
    """Box plot of price per neighbourhood for the selected subset.

    ``city`` filters on ``neighbourhood_group`` and ``room_type`` on
    ``room_type``; passing ``'All'`` for either skips that filter.
    Returns the hvplot box plot.
    """
    selected = airbnb if city == 'All' else airbnb[airbnb['neighbourhood_group'] == city]
    if room_type != 'All':
        selected = selected[selected['room_type'] == room_type]

    # Plot from a private copy of the selected rows
    plot_options = dict(rot=90, height=500, width=900, title='Neighborhood Pricing')
    return selected.copy().hvplot.box(y='price', by='neighbourhood', **plot_options)

# Bind the two dropdowns to the box-plot builder
neighborhood_box = pn.interact(
    create_neighborhood_box,
    city=city_dropdown,
    room_type=room_type_dropdown,
)

# Serve the layout with Panel's .show() (the recorded output shows a local
# server being launched) rather than rendering it inline
neighborhood_box.show()



Launching server at http://localhost:51401


<panel.io.server.Server at 0x26c42282ad0>

