## `Import Libraries`

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings as w

## `Load Data Set`

In [2]:
df = pd.read_csv('Airbnb_Open_Data.csv', low_memory=False)

## `Explore Datasets`

#### `Check Shape of Dataset`

In [3]:
df.shape

(102599, 26)

#### Total Number of Missing Values

In [4]:
df.isna().sum().sum()

190769

#### Missing Values Column vise

In [5]:
df.isna().sum()

id                                     0
NAME                                 250
host id                                0
host_identity_verified               289
host name                            406
neighbourhood group                   29
neighbourhood                         16
lat                                    8
long                                   8
country                              532
country code                         131
instant_bookable                     105
cancellation_policy                   76
room type                              0
Construction year                    214
price                                247
service fee                          273
minimum nights                       409
number of reviews                    183
last review                        15893
reviews per month                  15879
review rate number                   326
calculated host listings count       319
availability 365                     448
house_rules     

## `Handle Missing Values`

#### Drop Columns With Alot Number Of Missing Values
- Licence ---> More Than 99% Values are Missing
- House_rules ----> More than 50% Values are Missing

In [6]:
df.drop(columns=['license','house_rules'], axis=1, inplace=True)

#### Drop Columns Which Don't Provide Any Valuable Information For Analysis
- id
- NAME
- host_id
- host name
- **These Columns are not valuable For Our Analysis**

In [7]:
df.drop(columns=['id', 'NAME', 'host id', 'host name'], axis=1, inplace=True)

#### Varify Again Totat Number Of Missing Values

In [8]:
df.isna().sum().sum()

35385

#### For Handling With Other Columns, Check Their Data Type

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   host_identity_verified          102310 non-null  object 
 1   neighbourhood group             102570 non-null  object 
 2   neighbourhood                   102583 non-null  object 
 3   lat                             102591 non-null  float64
 4   long                            102591 non-null  float64
 5   country                         102067 non-null  object 
 6   country code                    102468 non-null  object 
 7   instant_bookable                102494 non-null  object 
 8   cancellation_policy             102523 non-null  object 
 9   room type                       102599 non-null  object 
 10  Construction year               102385 non-null  float64
 11  price                           102352 non-null  object 
 12  service fee     

#### Last Review Is Object, And It have More Than 15000+ Missing Values
- Check It's Rows

In [10]:
df['last review']

0         10/19/2021
1          5/21/2022
2                NaN
3           7/5/2019
4         11/19/2018
             ...    
102594           NaN
102595      7/6/2015
102596           NaN
102597    10/11/2015
102598           NaN
Name: last review, Length: 102599, dtype: object

#### As It is a Date Columns, So we cannot fill these values on The Base Of Prediction
- Remove These Values

In [11]:
df.dropna(subset=['last review'], inplace=True)

#### Verify Total Number Of Missing Values Again

In [12]:
df.isna().sum().sum()

2860

#### Check Missing Columns Again

In [13]:
df.isna().sum()

host_identity_verified            247
neighbourhood group                25
neighbourhood                      16
lat                                 7
long                                7
country                           489
country code                      116
instant_bookable                   93
cancellation_policy                67
room type                           0
Construction year                 185
price                             221
service fee                       241
minimum nights                    345
number of reviews                  59
last review                         0
reviews per month                  13
review rate number                288
calculated host listings count    261
availability 365                  180
dtype: int64

#### Check Unique Values Of Country

In [14]:
df.country.unique()

array(['United States', nan], dtype=object)

#### Check Unique Values Of Country Code

In [15]:
df['country code'].unique()

array(['US', nan], dtype=object)

#### Both Country and Country Code Provide Same Information So Drop One Column

In [16]:
df.drop(columns=['country'], axis=1, inplace=True)

#### Fill country code missing values with `US`

In [17]:
df['country code'] = df['country code'].fillna(value='US')

#### Drop Missing Values of Minimum Nights Column and Correct Then

In [18]:
df.dropna(subset=['minimum nights'], inplace=True)
df['minimum nights'] = df['minimum nights'].astype(int)

In [19]:
df['minimum nights'].unique()

array([  10,   30,    3,   45,    2,    1,    5,    4,   90,   14,    7,
        180,    9,   31,    6,   15,   29,    8,   47,   81,  144,  371,
        149,  273,  131,  275,  323,   59,   99,  268,  -10,  189,  340,
        128,  350,  166,  110,   57,  142,  366,   28,  200,   50,  399,
         17,   11,  452,   25,   13,   -5,   21,   27,   18,  365,   -1,
         20,  -12,   40,   44,  398,   -2,   65,   -3,   55,   60,  120,
        122,   19,  240,   88,  115,  150,  370,   16,   80,  181,   26,
        265,  300,  360,   12,   70,   39,   24,   32,  270,   22,   75,
        250,   23, 1250,  364,   74,  198,  100,  500,   35,   91,   53,
        160,  999,  186,   68,   93,   87,  183,  175,  133,   42,   33,
         37,  105,  222,   58,  210,  954, 5645, 2645, -365, -200,   85,
         48,  155,  307,  129,  145,   64,  182,  333,  119, -125,  125,
        458,  825])

In [20]:
df['minimum nights'] = abs(df['minimum nights'])

In [21]:
df['minimum nights'].unique()

array([  10,   30,    3,   45,    2,    1,    5,    4,   90,   14,    7,
        180,    9,   31,    6,   15,   29,    8,   47,   81,  144,  371,
        149,  273,  131,  275,  323,   59,   99,  268,  189,  340,  128,
        350,  166,  110,   57,  142,  366,   28,  200,   50,  399,   17,
         11,  452,   25,   13,   21,   27,   18,  365,   20,   12,   40,
         44,  398,   65,   55,   60,  120,  122,   19,  240,   88,  115,
        150,  370,   16,   80,  181,   26,  265,  300,  360,   70,   39,
         24,   32,  270,   22,   75,  250,   23, 1250,  364,   74,  198,
        100,  500,   35,   91,   53,  160,  999,  186,   68,   93,   87,
        183,  175,  133,   42,   33,   37,  105,  222,   58,  210,  954,
       5645, 2645,   85,   48,  155,  307,  129,  145,   64,  182,  333,
        119,  125,  458,  825])

#### Analyze Neighborhood and neighborhood group column

In [None]:
df['neighbourhood'].unique()

In [23]:
df['neighbourhood group'].unique()

array(['Brooklyn', 'Manhattan', 'brookln', 'manhatan', nan, 'Queens',
       'Staten Island', 'Bronx'], dtype=object)

#### Some Columns are Repeated In neighbourhood group Column, drop them

In [26]:
df = df[(df['neighbourhood group'] != 'brookln') & (df['neighbourhood group'] != 'manhatan')]

##### Explanation
- In Pandas For Filtring we use & instead of and
- We Need To Wrap Each Condition In Paranthesis

#### Drop Neighbourhood Column

In [27]:
df.drop(columns=['neighbourhood'], axis=1, inplace=True)

#### Looks For Total Missing Values and Shape Of Data Again

In [28]:
df.isna().sum().sum()

1822

In [29]:
df.isna().sum()

host_identity_verified            237
neighbourhood group                24
lat                                 7
long                                7
country code                        0
instant_bookable                   85
cancellation_policy                67
room type                           0
Construction year                 175
price                             221
service fee                       240
minimum nights                      0
number of reviews                  58
last review                         0
reviews per month                  11
review rate number                269
calculated host listings count    261
availability 365                  160
dtype: int64

In [30]:
df.shape

(86359, 18)

#### Analyze Host Identity Verified

In [31]:
df['host_identity_verified'].head()

0    unconfirmed
1       verified
3    unconfirmed
4       verified
5       verified
Name: host_identity_verified, dtype: object

In [32]:
df['host_identity_verified'].value_counts()

host_identity_verified
unconfirmed    43097
verified       43025
Name: count, dtype: int64

#### Values are Too Close To Each Other So We Cannot Fill On The Base Of Mode
- Drop Missing Values

In [33]:
df.dropna(subset=['host_identity_verified'], inplace=True)

#### Analyze Number Of Reviews

In [34]:
df['number of reviews'].head()

0      9.0
1     45.0
3    270.0
4      9.0
5     74.0
Name: number of reviews, dtype: float64

In [35]:
max(df['number of reviews'].head())

270.0

In [36]:
min(df['number of reviews'].head())

9.0

In [39]:
df.dropna(subset=['number of reviews'], inplace=True)

In [40]:
df['number of reviews'] = df['number of reviews'].astype(int)

#### Analyze Construction Year Columns

In [42]:
df['Construction year'].head()

0    2020.0
1    2007.0
3    2005.0
4    2009.0
5    2013.0
Name: Construction year, dtype: float64

In [44]:
df['Construction year'].value_counts()

Construction year
2006.0    4423
2008.0    4413
2014.0    4394
2019.0    4381
2015.0    4318
2018.0    4301
2010.0    4300
2003.0    4294
2009.0    4291
2005.0    4290
2017.0    4285
2011.0    4283
2012.0    4283
2020.0    4274
2007.0    4268
2022.0    4251
2016.0    4249
2021.0    4245
2004.0    4229
2013.0    4127
Name: count, dtype: int64

#### We have Two Options
- As 175 values are missing and our all years have values greater than 4200, except 2013, we can fill missing with 2013
- Drop Missing Values

In [45]:
df['Construction year'] = df['Construction year'].fillna(value='2013', axis=0)

In [46]:
df['Construction year'] = df['Construction year'].astype(int)

#### Analyze Availability 365 Column

In [47]:
df['availability 365'].head()

0    286.0
1    228.0
3    322.0
4    289.0
5    374.0
Name: availability 365, dtype: float64

#### Check Shape of Dataset and Total Number Of Missing Values

In [48]:
df.shape

(86064, 18)

In [50]:
df.isna().sum().sum()

1306

#### Drop All Missing Values b/c They Don't affect our Dataset Now

In [51]:
df.dropna(inplace=True)