### Airbnb in New York City

In [3]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension('tabulator')

import hvplot.pandas

In [6]:
# Read the NYC 2019 listings CSV and preview its first five rows.
DATA_PATH = "AB_NYC_2019.csv"
airbnb = pd.read_csv(DATA_PATH)
airbnb.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
# Print each column's dtype and non-null count to spot columns with missing data.
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [8]:
# Count the missing values in each column.
airbnb.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

### Data Preprocessing 🧹

In [9]:
def clean_listings(listings, price_bounds=(10, 1000), min_nights_bounds=(1, 30)):
    """Return a cleaned copy of the raw listings frame; the input is not modified.

    Steps, in order:
      1. Drop rows with a missing ``name`` or ``host_name``.
      2. Parse ``last_review`` as datetime (missing values become ``NaT``).
      3. Fill missing ``reviews_per_month`` with the column mean, computed
         after step 1.
      4. Rename ``reviews_per_month`` to ``avg_reviews_per_month``.
      5. Drop fully duplicated rows.
      6. Keep rows whose ``price`` and ``minimum_nights`` fall inside the given
         inclusive bounds.
      7. Drop the ``id`` and ``host_id`` columns.

    Parameters
    ----------
    listings : pandas.DataFrame
        Raw listings frame. Must still contain ``reviews_per_month``, ``id``
        and ``host_id``, so pass the frame as loaded, not an already cleaned one.
    price_bounds : tuple, default (10, 1000)
        Inclusive ``(low, high)`` range of ``price`` values to keep.
    min_nights_bounds : tuple, default (1, 30)
        Inclusive ``(low, high)`` range of ``minimum_nights`` values to keep.

    Returns
    -------
    pandas.DataFrame
        The cleaned listings.
    """
    price_low, price_high = price_bounds
    nights_low, nights_high = min_nights_bounds

    return (
        listings
        .dropna(subset=['name', 'host_name'])
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: that chained in-place form is
        # deprecated and leaves the frame unchanged under Copy-on-Write.
        .assign(
            last_review=lambda df: pd.to_datetime(df['last_review']),
            reviews_per_month=lambda df: df['reviews_per_month'].fillna(
                df['reviews_per_month'].mean()
            ),
        )
        .rename(columns={'reviews_per_month': 'avg_reviews_per_month'})
        .drop_duplicates()
        # Series.between is inclusive on both ends by default (same as >= / <=).
        .loc[lambda df: df['price'].between(price_low, price_high)]
        .loc[lambda df: df['minimum_nights'].between(nights_low, nights_high)]
        .drop(columns=['id', 'host_id'])
    )


# Rebind `airbnb` to the cleaned frame so the following cells keep working.
airbnb = clean_listings(airbnb)




In [10]:
# Re-count the missing values per column now that preprocessing has run.
airbnb.isna().sum()

name                                 0
host_name                            0
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       9616
avg_reviews_per_month                0
calculated_host_listings_count       0
availability_365                     0
dtype: int64