# Capstone: Airbnb Price Listing Prediction
## Part 1 Data Cleaning

_Authors: Evonne Tham_


## 1. Import Necessary Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raw files live under <path><country>
country = 'japan/'
path = '../datasets/'
source_dir = path + country

# Read the four raw CSV files for the selected country
calendar = pd.read_csv(source_dir + 'calendar.csv')
listing = pd.read_csv(source_dir + 'listings.csv')
neighbourhood = pd.read_csv(source_dir + 'neighbourhoods.csv')
reviews = pd.read_csv(source_dir + 'reviews.csv')

In [None]:
# # show all rows and columns
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

---
## 2. Initial Exploratory Analysis 
Evaluate the current state of the data

##### Defining Functions

In [None]:
def eda(dataframe):

    """
    Print a basic exploratory summary of a dataframe.

    The report covers, in order: shape, column dtypes, ``describe()`` with
    pandas' default column selection, ``describe()`` restricted to non-numeric
    columns, missing-value counts (only columns with at least one null,
    largest first), the number of fully duplicated rows, and whether each
    column contains only unique values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to stdout.
    """

    separator = "***********************************************************************************"

    print("DF shape: ", dataframe.shape)
    print("")
    print("DF types:\n", dataframe.dtypes)
    print(separator)
    print("")
    # describe() raises ValueError on a frame without columns, so guard it.
    if dataframe.shape[1]:
        print("DF describe: \n", dataframe.describe())
    else:
        print("DF describe: \n", "no columns to describe")
    print(separator)
    print("")
    # describe(exclude=np.number) raises ValueError when every column is
    # numeric, so only call it when a non-numeric column exists.
    if dataframe.select_dtypes(exclude=np.number).shape[1]:
        print("DF describe: \n", dataframe.describe(exclude=np.number).T)
    else:
        print("DF describe: \n", "no non-numeric columns")
    print(separator)
    print("")
    # Count nulls once and reuse the result for the filter.
    null_counts = dataframe.isnull().sum()
    print("Missing values: \n",
          null_counts[null_counts > 0].sort_values(ascending=False))
    print(separator)
    print("")
    print("duplicate values: ", dataframe.duplicated().sum())
    print(separator)
    print("")
    #check if column is unique
    for column_name in dataframe.columns:
        print('{} is unique: {}'.format(column_name, dataframe[column_name].is_unique))
    print(separator)

### 2.1. Calendar Dataset

In [None]:
# Print the EDA summary for the calendar data
eda(calendar)

In [None]:
# Preview the first rows of the calendar data
calendar.head()

In [None]:
# Earliest and latest values in the calendar's date column
calendar_dates = calendar['date']
print(calendar_dates.min())
print(calendar_dates.max())

<div class="alert alert-block alert-info">

<b> Calendar Dataset Observations: </b>
- calendar ranges from 2020-05-30 to 2021-05-30
- most likely t and f represents true and false
- columns that might be useful 
    - listing_id (to combine with other datasets)
    - date
    - available
    - price

<b> To-Dos: </b>
- convert date object to datetime
- remove $, from price column
- get dummy for available column
- replace t and f value to 1s and 0s
- fill null values for minimum and maximum nights
</div>

### 2.2. Listing Dataset 

In [None]:
# Print the EDA summary for the listing data
eda(listing)

In [None]:
# Preview the first rows, transposed so that each feature is shown as a row
listing.head().T

<div class="alert alert-block alert-info">

<b> Observations: </b>
- Dataset consist of 15009 listings 
- has over 100 features

<b> To-Dos: </b>
- Drop columns (entirely made up of NaNs): 
    - neighbourhood_group_cleansed,
    - medium_url, 
    - thumbnail_url, 
    - xl_picture_url, 
    - jurisdiction_names
    - experiences_offered
- consider dropping columns (each has only about 100 non-null values):
    - square_feet 
    - monthly_price
    - weekly_price
- consider dropping columns that are text focused
- drop rows with mostly null value
- replace t and f value to 1s and 0s
- one hot coding for categorical variables, for example
    - cancellation policy ('strict_14_with_grace_period', 'moderate', 'flexible', 'strict',
       'super_strict_60', 'super_strict_30')
    - property_type
    - room_type
- clean columns with $ 
- review scores - impute with median
</div>

---
### 2.3. Neighbourhood Dataset 

In [None]:
# Print the EDA summary for the neighbourhood data
eda(neighbourhood)

In [None]:
# Preview the first rows of the neighbourhood data
neighbourhood.head()

<div class="alert alert-block alert-info">

<b> Neighbourhood Dataset Observations: </b>
- fairly simple dataset 
- total 62 neighbourhood
- might not need this dataset for analysis

<b> To-Dos: </b>
- to drop neighbourhood_group column (fully NaNs)
</div>

---
### 2.4. Reviews Dataset 

In [None]:
# Print the EDA summary for the reviews data
eda(reviews)

In [None]:
# Preview the first rows of the reviews data
reviews.head()

In [None]:
# Earliest and latest values in the reviews' date column
review_dates = reviews['date']
print(review_dates.min())
print(review_dates.max())

<div class="alert alert-block alert-info">

<b> Reviews Dataset Observations: </b>
- there are 398181 reviews
- dated from 2011-09-21 to 2020-05-30
- 280 missing comments

<b> To dos: </b>

- convert date object to datetime
- clean comments

</div>

---
## 3. Data Cleaning

##### Defining Functions

In [None]:
def _parse_currency(value):
    """Turn a '$1,234.00'-style string into a number; return anything else unchanged."""
    if not (isinstance(value, str) and value.startswith('$')):
        return value
    try:
        amount = float(value.replace('$', '').replace(',', ''))
    except ValueError:
        # Free text that merely starts with '$' is not a price; leave it alone.
        return value
    # Whole amounts stay integers, matching the previous output for '$x.00'.
    return int(amount) if amount.is_integer() else amount


def mass_cleaning(df):

    """
    Clean dates, t/f flags and currency strings across a whole dataframe.

    Steps:
    1. Replace cell values that are exactly 'f' / 't' with 0 / 1.
    2. If a 'date' column exists, parse it to datetime.
    3. Convert every string cell that starts with '$' and holds a number
       (e.g. '$1,234.00') to a number: int for whole amounts, float otherwise.
       Strings starting with '$' that are not numeric are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataframe. It is not modified; a cleaned copy is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned dataframe.
    """

    # Columns with Ts and Fs. replace() without inplace returns a new frame,
    # so the caller's dataframe is left untouched.
    cleaned = df.replace({'f': 0, 't': 1})

    # Parse date-time string to datetime object
    if 'date' in cleaned.columns:
        cleaned['date'] = pd.to_datetime(cleaned['date'])

    # Clean Currency
    # DataFrame.map replaces the deprecated applymap in newer pandas; fall back
    # to applymap on versions that do not have DataFrame.map yet.
    elementwise = cleaned.map if hasattr(cleaned, 'map') else cleaned.applymap
    return elementwise(_parse_currency)

In [None]:
def impute_median(df, column):

    """
    Replace NaN values in one column with that column's median.

    The filled column is assigned back to ``df`` explicitly. Calling
    ``fillna(inplace=True)`` on ``df[column]`` only modifies the selected
    Series, which is not guaranteed to write through to ``df`` (it does not
    under pandas Copy-on-Write).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to update; modified in place.
    column : str
        Name of the column to impute.

    Returns
    -------
    None
    """

    df[column] = df[column].fillna(df[column].median())

---
### 3.1. Calendar 

In [None]:
# Apply the shared cleaning (date parsing, t/f flags, currency strings) and preview the result
calendar = mass_cleaning(calendar)
calendar.head()

In [None]:
# Impute missing minimum/maximum nights with 0, then convert the columns from
# float to int. The result is assigned back to the dataframe: calling
# fillna(inplace=True) on a selected column is not guaranteed to update
# `calendar` itself.
for nights_col in ['minimum_nights', 'maximum_nights']:
    calendar[nights_col] = calendar[nights_col].fillna(0).astype(int)

#Check for nulls
calendar.isnull().sum()

<div class="alert alert-block alert-warning">
<b>Note to self:</b>
^ Might consider dropping `minimum_nights` and  `maximum_nights`
</div>

In [None]:
# Overlay price and adjusted price on one axis to compare them
fig, ax = plt.subplots(figsize=(15, 5))
calendar['price'].plot.line(ax=ax, alpha=0.5, color='#FF5A5F')
calendar['adjusted_price'].plot.line(ax=ax, alpha=0.5, color='#00A699')
ax.set_ylabel('$ ', fontsize=25, rotation=0)
ax.legend(['Price', 'Adjusted Price']);

<div class="alert alert-block alert-warning">

<b>Note to self:</b>
There doesn't seem to be much difference between the two for the majority of the listings; consider dropping `adjusted_price`?


</div>

##### Dropping Columns

<div class="alert alert-block alert-warning">

<b>Note to self:</b>
To remove lines in line18, if dropping cols

</div>

In [None]:
# cols_to_drop = ['adjusted_price', 'minimum_nights', 'maximum_nights']

# calendar = calendar.drop(cols_to_drop, axis=1)
# calendar.shape

##### Save DataFrame

In [None]:
# Persist the cleaned calendar (the dataframe index is written as the first CSV column)
calendar.to_csv("../datasets/calendar_cleaned.csv")

---
### 3.2. Listing

In [None]:
# Apply the shared cleaning (t/f flags and currency strings) to the listings
listing = mass_cleaning(listing)

##### Isolate and inspect categorical and numerical variable

<div class="alert alert-block alert-info">
As the number of features is fairly large, I will be splitting them up into categorical and numerical.
</div>

In [None]:
# Object and boolean columns are treated as categorical
categorical_df = listing.select_dtypes(include=['object', 'bool'])
categorical_cols = categorical_df.columns.values

In [None]:
# First two rows, transposed so that each categorical feature is shown as a row
categorical_df.head(2).T

In [None]:
# Integer and float columns are treated as numerical
numerical_df = listing.select_dtypes(include=['int', 'float'])
numerical_cols = numerical_df.columns.values

In [None]:
# First two rows, transposed so that each numerical feature is shown as a row
numerical_df.head(2).T

### 3.2.1. Dropping  and Convert Columns

<div class="alert alert-block alert-info">
    
<b>Explanation:</b> From the data shown above (categorical/numerical), I will be dropping text columns for now and columns that might not be useful for predicting price. 
</div>

In [None]:
# Free-text, URL and scrape-metadata columns that are not used for now
cols_to_drop = [
    'listing_url', 'last_scraped', 'name', 'summary', 'space',
    'description', 'neighborhood_overview', 'notes', 'transit', 'access',
    'interaction', 'house_rules', 'picture_url', 'host_url', 'host_name',
    'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url',
    'host_neighbourhood', 'host_verifications', 'calendar_last_scraped',
    'license', 'scrape_id', 'thumbnail_url', 'medium_url', 'xl_picture_url',
]

listing = listing.drop(columns=cols_to_drop)
listing.shape

In [None]:
# Parse the remaining date-string columns into datetime objects
for date_col in ['host_since', 'first_review', 'last_review']:
    listing[date_col] = pd.to_datetime(listing[date_col])

In [None]:
# Columns with more than 10000 null values, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 10000].sort_values(ascending=False)

In [None]:
# Drop columns that consist mostly of NaNs.
# reviews_per_month and number_of_reviews_ltm go as well, since number_of_reviews is kept.
# experiences_offered holds only 'none' values.
sparse_or_redundant_cols = [
    'jurisdiction_names', 'neighbourhood_group_cleansed', 'square_feet',
    'monthly_price', 'weekly_price', 'reviews_per_month',
    'number_of_reviews_ltm', 'experiences_offered',
]
listing.drop(columns=sparse_or_redundant_cols, inplace=True)

In [None]:
# Rows and columns remaining after the drops above
listing.shape

### 3.2.2  Imputation of Missing Values

In [None]:
# Columns with more than 1000 null values, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 1000].sort_values(ascending=False)

<div class="alert alert-block alert-info">
    
<b>Assumption:</b>

Missing data from (cols that start with name): 

- review_scores
- host_response
- last_review 
- first_review
- host_acceptance_rate

This might be an indication of properties that are new listings and have not been contacted or reviewed yet. Hence, they do not have any response time, response rate and/or ratings. The null values for these features will therefore be filled with `unrated` or `unknown` accordingly. 

I will also be splitting them into categories.
- reviews_scores does not have all the rating from 0-10, hence I will categorise them according to 
    - low 
    - acceptable 
    - good
    - very good 
    - excellent

</div>

##### a. Review_Scores_

In [None]:
# Show the distinct values of every review score column, with a divider between columns
score_columns = [
    ('Value', 'review_scores_value'),
    ('Location', 'review_scores_location'),
    ('Comm', 'review_scores_communication'),
    ('Checkin', 'review_scores_checkin'),
    ('Cleaniness', 'review_scores_cleanliness'),
    ('Acc', 'review_scores_accuracy'),
    ('Rating', 'review_scores_rating'),
]
for position, (label, column) in enumerate(score_columns):
    if position:
        print('**************************************************************')
    print('Unique Review Scores {}: '.format(label), listing[column].unique())

In [None]:
# Bin the review sub-scores based on their distribution.
# review_scores_rating is excluded here because it is on a different scale.
bins = [0,3,7,8,9,10]
labels = ['0-3','4-7','8','9','10']

sub_score_cols = [name for name in listing.columns
                  if name.startswith('review_scores_') and not name.endswith('_rating')]

for name in sub_score_cols:
    binned = pd.cut(listing[name], bins=bins, labels=labels, include_lowest=True)
    # Missing scores get their own 'unrated' category
    listing[name] = binned.cat.add_categories('unrated').fillna('unrated')

listing.review_scores_value.value_counts()

In [None]:
# Create Categories for Review Score Rating

# Left-closed bins: [0, 50), [50, 90), [90, 100), [100, 101), so every score
# falls into the bucket its label names. With the previous right-closed edges
# [0, 50, 90, 99, 100], a score of 50 was labelled '0-49%' and 90 was
# labelled '50-89%'.
bins = [0, 50, 90, 100, 101]
labels = ['0-49%', '50-89%', '90-99%', '100%']

listing['review_scores_rating'] = pd.cut(listing['review_scores_rating'],
                                         bins=bins,
                                         labels=labels,
                                         right=False)

# Create 'unrated' Category for 'NaN'
listing['review_scores_rating'] = (listing['review_scores_rating'].cat
                                   .add_categories('unrated')
                                   .fillna('unrated'))

listing.review_scores_rating.value_counts()

In [None]:
# Columns that still contain nulls, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### b. _Reviews

<div class="alert alert-block alert-warning">
    
- Not sure what to do with this.
- fillna with ??? 
- drop last and first review??? 

- BRB

</div>

In [None]:
# Time elapsed between each listing's first and last review
listing['time_since_first_review'] = listing['last_review'] - listing['first_review']
# listing.time_since_first_review.hist(figsize=(11,9), bins=30);

In [None]:
# Frequency of each first-to-last review interval
listing['time_since_first_review'].value_counts()

In [None]:
# Columns that still contain nulls, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### b. Host 
_i. Response time_

In [None]:
# Frequency of each host response time category (nulls are not counted)
listing.host_response_time.value_counts()

In [None]:
# Label missing response times as "unknown". The filled column is assigned
# back: fillna(inplace=True) on an attribute-accessed column is not guaranteed
# to update `listing` itself.
listing['host_response_time'] = listing['host_response_time'].fillna("unknown")
listing.host_response_time.value_counts(normalize=True)

_ii. Response rate_

In [None]:
# Distinct raw values of the host response rate
listing.host_response_rate.unique()

In [None]:
# Strip the '%' sign (literal match, not a regex) and convert string to float
listing['host_response_rate'] = (listing['host_response_rate']
                                 .str.replace('%', '', regex=False)
                                 .astype('float64'))

# Create Categories
# Left-closed bins: [0, 50), [50, 90), [90, 100), [100, 101), so every rate
# falls into the bucket its label names. With the previous right-closed edges
# [0, 50, 90, 99, 100], a rate of 50 was labelled '0-49%' and 90 was
# labelled '50-89%'.
bins = [0, 50, 90, 100, 101]
labels = ['0-49%', '50-89%', '90-99%', '100%']

listing['host_response_rate'] = pd.cut(listing['host_response_rate'],
                                       bins=bins,
                                       labels=labels,
                                       right=False)

# Create Unknown Category for 'NaN'
listing['host_response_rate'] = (listing['host_response_rate'].cat
                                 .add_categories('unknown')
                                 .fillna('unknown'))

listing.host_response_rate.value_counts()

_iii. Acceptance rate_

In [None]:
# Distinct raw values of the host acceptance rate
listing.host_acceptance_rate.unique()

In [None]:
# Strip the '%' sign (literal match, not a regex) and convert string to float
listing['host_acceptance_rate'] = (listing['host_acceptance_rate']
                                   .str.replace('%', '', regex=False)
                                   .astype('float64'))

# Create Categories
# Left-closed bins: [0, 50), [50, 90), [90, 100), [100, 101), so every rate
# falls into the bucket its label names. With the previous right-closed edges
# [0, 50, 90, 99, 100], a rate of 50 was labelled '0-49%' and 90 was
# labelled '50-89%'.
bins = [0, 50, 90, 100, 101]
labels = ['0-49%', '50-89%', '90-99%', '100%']

listing['host_acceptance_rate'] = pd.cut(listing['host_acceptance_rate'],
                                         bins=bins,
                                         labels=labels,
                                         right=False)

# Create Unknown Category for 'NaN'
listing['host_acceptance_rate'] = (listing['host_acceptance_rate'].cat
                                   .add_categories('unknown')
                                   .fillna('unknown'))

listing.host_acceptance_rate.value_counts()

In [None]:
# Columns that still contain nulls, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### c. Address Related

<div class="alert alert-block alert-info">
    
<b>Observation:</b>

Looking at the categorical_df dataset. There are a couple of columns that are address related 
- street
- neighbourhood
- neighbourhood_cleansed --> to keep (since this column is kept, the Neighbourhood dataset will not be required)
- city
- state
- zipcode
- market
- smart_location
- country_code --> to keep
- country

From numerical_df, to save in a new dataframe for later use:
- longitude 
- latitude

</div>

In [None]:
# To Check and Compare Address Related Columns
# Only the neighbourhood_cleansed unique count is active; the other checks are
# kept commented out for reference.

# print(listing.street.value_counts())
# print(listing.street.nunique())
# print(listing.neighbourhood_cleansed.value_counts())
print(listing.neighbourhood_cleansed.nunique())
# print(listing.city.value_counts())
# print(listing.city.nunique())
# print(listing.state.value_counts())
# print(listing.state.nunique())
# print(listing.market.value_counts())
# print(listing.market.nunique())
# print(listing.smart_location.value_counts())
# print(listing.smart_location.nunique())
# print(listing.country_code.value_counts())
# print(listing.country_code.nunique())

# listing[listing.state == 'Shibuya']['neighbourhood']

In [None]:
# Keep the coordinates in their own dataframe and save them for later use
coordinate_cols = ['longitude','latitude']
long_lat_df = listing.loc[:, coordinate_cols]
long_lat_df.to_csv('../datasets/long_lat_df.csv')

In [None]:
# Address columns that overlap with neighbourhood_cleansed / country_code
# (longitude and latitude are intentionally kept)
redundant_address_cols = ['street', 'neighbourhood', 'city', 'state', 'zipcode',
                          'market', 'smart_location', 'country']
listing.drop(columns=redundant_address_cols, inplace=True)

# The cleansed neighbourhood column takes over the plain name
listing.rename(columns={'neighbourhood_cleansed': "neighbourhood"}, inplace=True)

# Check
null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### d. Fees

<div class="alert alert-block alert-info">
    
<b>Assumption:</b> Hosts can opt for no security_deposit or cleaning_fee. Hence, I will impute null values as `0` and change the dtype from float to int.

</div>

In [None]:
# Fill missing fees with 0 and change the dtype to int.
# The result is assigned back to the dataframe: fillna(inplace=True) on an
# attribute-accessed column is not guaranteed to update `listing` itself.
for fee_col in ['security_deposit', 'cleaning_fee']:
    listing[fee_col] = listing[fee_col].fillna(0).astype('int64')

null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### e. Beds,  Bedrooms , Bathrooms and Host_

<div class="alert alert-block alert-info">
    
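<b>Assumption:</b> It is very unusual for a listing to have no beds and/or bathrooms, so rows missing these values will be dropped. On top of that, a fully signed-up host on Airbnb will have a sign-up date, so I will check whether the rows missing host information are the same rows before I drop them. As for bedrooms, there is a possibility that the listing is a studio apartment; however, I cannot entirely assume that, so missing values will be imputed with the median. The columns are then converted to integers to avoid strange fractions.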

</div>

In [None]:
# Beds and Bathrooms: rows missing either value are removed
required_room_cols = ['beds','bathrooms']
listing.dropna(subset=required_room_cols, inplace=True)
# Bedrooms: missing values are filled with the median instead
impute_median(listing, column='bedrooms')

In [None]:
# Cast the room counts from float to integer
for room_col in ['beds', 'bathrooms', 'bedrooms']:
    listing[room_col] = listing[room_col].astype('int64')

In [None]:
# Columns that still contain nulls, largest first
null_counts = listing.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

##### f. Host_

In [None]:
# Host_
# Show every row that is missing at least one host-related field,
# to check whether they are the same rows
host_cols = ['host_identity_verified', 'host_has_profile_pic',
             'host_total_listings_count', 'host_listings_count',
             'host_is_superhost', 'host_since']
listing[listing[host_cols].isnull().any(axis=1)]

In [None]:
# Drop only the rows that are missing host information (the rows inspected in
# the host check). A bare dropna() would also discard every listing with a
# null in any other column - in particular all listings without reviews, whose
# first_review / last_review / time_since_first_review are NaT - which
# contradicts the decision to keep unreviewed listings as 'unrated'.
# NOTE(review): nulls in non-host columns are now kept; confirm that the
# downstream notebooks handle them.
host_cols = ['host_identity_verified', 'host_has_profile_pic',
             'host_total_listings_count', 'host_listings_count',
             'host_is_superhost', 'host_since']
listing.dropna(subset=host_cols, inplace=True)
listing.shape

#### 3.2.3  Individual Columns

##### a. Listing Count

In [None]:
# shape[0] is the number of rows (listings); shape[1] would be the number of columns
print('Total number of listings: ', listing.shape[0])
# Number of rows where the two host listing counts disagree
print('Differences between columns: ',
      (listing.host_listings_count != listing.host_total_listings_count).sum())

<div class="alert alert-block alert-info">

<b>Observations:</b> Host listing count and host total listings count seem to have the same data. There are also other columns that are similar to listing counts (which are split up into multiple columns):

- calculated_host_listings_count, 
- calculated_host_listings_count_entire_homes, 
- calculated_host_listings_count_private_rooms, 
- calculated_host_listings_count_shared_rooms

Hence these columns will be dropped as well.
</div>

In [None]:
# host_listings_count is kept; the duplicate and the calculated breakdowns are dropped
duplicate_count_cols = [
    'host_total_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]
listing.drop(columns=duplicate_count_cols, inplace=True)

In [None]:
# Cast the host listing count to integer
listing['host_listings_count'] = listing['host_listings_count'].astype('int64')

##### b. Price

_Listings priced at zero are not found on the Airbnb site. Hence, I will be dropping those._

<img src="../image/listing_zero_value.png"
	title="Zero Dollar Listing" width="550" height="500" />

In [None]:
# Remove listings whose price is zero
zero_price_index = listing.index[listing.price == 0]
listing.drop(index=zero_price_index, inplace=True)

##### c. Boolean Columns

In [None]:
# Histograms of the boolean (0/1) columns, for verification purposes
boolean_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
    'requires_license',
    'instant_bookable',
    'is_business_travel_ready',
    'is_location_exact',
    'has_availability',
]
listing[boolean_cols].hist(figsize=(20,10), color='#FF5A5F');

<div class="alert alert-block alert-info">

<b>Observation:</b> As shown above, some of the features show uniform results. Hence, I will be dropping them as they will not affect the price. 
- host_has_profile_pic
- is_business_travel_ready
- require_guest_phone_verification
- require_guest_profile_picture
- requires_license

</div>

In [None]:
# Drop the boolean columns that are not kept as features
uniform_boolean_cols = [
    'host_has_profile_pic',
    'is_business_travel_ready',
    'require_guest_phone_verification',
    'require_guest_profile_picture',
    'requires_license',
    'has_availability',
]
listing.drop(columns=uniform_boolean_cols, inplace=True)

In [None]:
# Cast the remaining host flag columns to integer
for flag_col in ['host_is_superhost', 'host_identity_verified']:
    listing[flag_col] = listing[flag_col].astype('int64')

##### d. Minimum and Maximum Nights

<div class="alert alert-block alert-info">

<b>Observation:</b> There are multiple columns that are related to minimum and maximum night stays. To avoid overlapping data, I will be using the basic two, which have not been altered.

</div>

In [None]:
# Only the plain minimum_nights / maximum_nights columns are kept
derived_night_cols = [
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
]
listing.drop(columns=derived_night_cols, inplace=True)

##### e. Categorical Columns

##### i. property_type

<div class="alert alert-block alert-info">

<b>Observation:</b> There are a total of 29 unique property types with a majority being the Apartment. However other property types (e.g. house, condominium, serviced apartment) can be considered to be in the same category as well. 

</div>

In [None]:
# Number of distinct property types, followed by the frequency of each
property_types = listing['property_type']
print(property_types.nunique())
property_types.value_counts()

In [None]:
# # Regroup and rename property type
# listing.property_type.replace({
#     'Apartment': 'house',
#     'House': 'house',
#     'Hostel': 'hostel',
#     'Condominium': 'apartment', 
#     'Aparthotel': 'hotel', 
#     'Boutique hotel': 'hotel', 
#     'Villa': 'house', 
#     'Serviced apartment': 'apartment',
#     'Ryokan (Japan)': 'apartment',
#     'Hut': 'house',
#     'Loft': 'apartment',
#     'Guesthouse': 'house',
#     'Bed and breakfast': '',
#     'Townhouse': 'house',
#     'Guest suite': '',
#     'Tiny house': 'house',
#     'Other': '',
#     'Cabin': '',
#     'Dome house': 'house',
#     'Camper/RV': '',
#     'Bungalow': 'house',
#     'Tent': '',
#     'Nature lodge': 'house',
#     'Resort': 'house',
#     'Earth house': '',
#     'Cottage': 'house',
#     'Castle': 'castle',
#     'Dorm': 'hostel'
#     }, inplace=True)

##### ii. room_type

In [None]:
# Frequency of each room type
listing.room_type.value_counts()

In [None]:
# Rename room type
# The result is assigned back to the dataframe: replace(inplace=True) on an
# attribute-accessed column is not guaranteed to update `listing` itself.
listing['room_type'] = listing['room_type'].replace({
    'Entire home/apt': 'entire_home_apt',
    'Private room': 'private_room',
    'Hotel room': 'hotel_room',
    'Shared room': 'shared_room',
    })

##### iii. bed_type

<div class="alert alert-block alert-info">

<b>Observation:</b> This column seems redundant since the majority of listings have the same bed type. Hence, I will be dropping this column. 

</div>

In [None]:
# Frequency of each bed type
listing.bed_type.value_counts()

In [None]:
# Remove the bed_type column
listing.drop(columns=['bed_type'], inplace=True)

##### iv. amenities

<div class="alert alert-block alert-info">


<b>Observation:</b> Amenities is a list of additional features of the property (e.g. wifi, air conditioning, swimming pool, etc.) which I believe people look at before committing to a booking. However, not all amenities are of the same importance.

There are over 150 unique type of amenities

</div>

In [None]:
# Preview the raw amenities strings
listing.amenities.head()

In [None]:
# Strip the braces and double quotes around the amenities list and lowercase it.
# An explicit character class with regex=True avoids depending on pandas'
# version-specific default for `regex` ('{' and '}' are regex metacharacters).
listing['amenities'] = (listing['amenities']
                        .str.replace(r'[{}"]', '', regex=True)
                        .str.lower())

##### v. Calendar_updated

<div class="alert alert-block alert-warning">

<b>Observation:</b>

- to get back to this
- or else to be dropped

</div>

In [None]:
# Number of distinct calendar_updated values and the five most frequent ones
updated_counts = listing.calendar_updated.value_counts()
print("Total categories:", listing.calendar_updated.nunique())
print('')
print("Top 5 categories:")
updated_counts.head(5)

##### vi. Availability

<div class="alert alert-block alert-warning">

<b>Observation:</b>

- to get back to this
- or else to be dropped

</div>

In [None]:
# Number of distinct values for each availability window
for window in (30, 60, 90, 365):
    print("Avail {}days categories:".format(window),
          listing['availability_{}'.format(window)].nunique())
print('')
print("Top 5 categories of 30days:")
listing.availability_30.value_counts().head(5)

##### vii. Cancellation Policy

In [None]:
# Frequency of each cancellation policy before regrouping
listing.cancellation_policy.value_counts()

In [None]:
# Collapse the policy variants into their base categories.
# The duplicated 'strict_14_with_grace_period' key has been removed, and the
# result is assigned back to the dataframe: replace(inplace=True) on an
# attribute-accessed column is not guaranteed to update `listing` itself.
listing['cancellation_policy'] = listing['cancellation_policy'].replace({
    'strict_14_with_grace_period': 'strict',
    'super_strict_30': 'super_strict',
    'super_strict_60': 'super_strict',
    'luxury_moderate': 'moderate',
    })
listing.cancellation_policy.value_counts()

In [None]:
# Final dimensions of the cleaned listing dataframe
print(listing.shape)

##### Save DataFrame

In [None]:
# listing.to_csv("../datasets/listing_cleaned.csv")

##### Merge Dataframe

In [None]:
# listing_cal = pd.merge(listing, calendar, how='left', on=['id', 'date','price'])

---
### 3.3. Neighbourhood 

In [None]:
# Remove the neighbourhood_group column and display the result
neighbourhood = neighbourhood.drop(columns=['neighbourhood_group'])
neighbourhood

<div class="alert alert-block alert-info">

<b>Observation:</b>
Since the listing dataframe already has columns for neighbourhood and country code, this dataframe will not be required. 

</div>

##### Save DataFrame

In [None]:
# Persist the cleaned neighbourhood data (the dataframe index is written as the first CSV column)
neighbourhood.to_csv("../datasets/neighbourhood_cleaned.csv")

---
### 3.4 Reviews 

In [None]:
# Preview the first rows of the reviews data before cleaning
reviews.head()

In [None]:
# Convert the review date strings into datetime objects
reviews['date'] = pd.to_datetime(reviews['date'])

In [None]:
# Remove the review / reviewer identifier columns
reviewer_id_cols = ['id','reviewer_id','reviewer_name']
reviews.drop(columns=reviewer_id_cols, inplace=True)

<div class="alert alert-block alert-info">

<b>Observation:</b> There are no ratings for the individual comments, and the dataset does not provide the necessary information for this analysis. Hence, I will not be using this dataset.

</div>

##### Save DataFrame

In [None]:
# Persist the cleaned reviews (the dataframe index is written as the first CSV column)
reviews.to_csv("../datasets/reviews_cleaned.csv")

---
##### Save Final Dataframe

In [None]:
# Persist the cleaned listing dataframe as the final dataset, without the index column
listing.to_csv("../datasets/final.csv", index = False)

----> Proceed to the next notebook for the full exploratory data analysis