In [None]:
from IPython.display import Image

Image(url='https://img4.cityrealty.com/neo/i/p/mig/airbnb_guide.jpg')

# <font color = 'blue'> AirBnB Data Analysis with Python Final Project
    

### <font color='blue'>Content: 
We are focusing on California’s data for analysis. Being new to California, it would be interesting to learn about places which people prefer for AirBnb in California. Our main focus is on affordability, customer preference for AirBnb properties based on customer reviews, and how close these AirBnb properties are to airports, beaches and other tourist locations.

### <font color='blue'>Dataset:
https://www.kaggle.com/gkdoad/airbnb

### <font color='blue'>Team:
<ol>
<li> Resham Uttamchandani - ruttamchandani@scu.edu</li>
<li> Mukesh Ganesh - mganesh2@scu.edu</li>
<li> Pratiksha Raval - praval@scu.edu</li>
<li> Dnyanai Surkutwar - dsurkutwar@scu.edu</li>
</ol>

### <font color='blue'>Following are a few questions that we aim to answer through our analysis:
<ol>
<li>Affordability analysis: How do prices of listings vary by location?</li>
<li>Airbnb property preference based on customer reviews.</li>
<li>What are the different types of properties in and around LA? Do they vary by neighborhood?</li>
<li>What neighborhoods are rated highly by guests?</li>
<li>Customer preference based Airbnb recommendations.</li>  
</ol>


##### <font color='blue'> Importing libraries 

In [None]:
import json

import branca
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium.plugins import MarkerCluster

pd.set_option('display.max_columns',50)
pd.options.display.max_rows

##### <font color='blue'> Setting the map location co-ordinates for the map graphs later

In [None]:
# Setting a base map
lat = 34.1000
long = -118.4662

##### <font color='blue'> Read the listings csv file and setting the index column as id 

In [None]:
initFile = pd.read_csv('listings.csv',index_col= 'id',low_memory=False)
initFile.head(1)

##### <font color='blue'> Description of Data
<ol>
<font color='blue'><li>The dataset comprises of three main tables:</li></font>
<ul>
<font color='blue'><li>listings -</li></font> Detailed listings data showing 106 attributes for each of the listings. Some of the attributes used in the analysis are price, longitude, latitude, listing_type (categorical), neighbourhood_cleansed (categorical) and ratings, among others.
<font color='blue'><li>reviews -</li></font> Detailed reviews given by the guests with 6 attributes. Key attributes include review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, cancellation_policy.
</ul>
<br>
</ol>

<font color='blue'>                          
2. A quick glance at the data shows that there are:</font>
<ul>
<li>Most of the listings in California are based in and near LA, predominately in LA county. 
<li>Over 35,959 reviews have been written by guests since 26th May 2009.</li>
<li>The price for a listing ranges from 10 per night to 25,000 dollars(!) per night. Listing with $25,000 price tag are in Hollywood hills west.</li>
</ul>

In [None]:
initFile.first_review.count()

In [None]:
initFile.first_review.sort_values(ascending=True).head(1)

In [None]:
initFile.head()

##### <font color='blue'> Resetting the index and assigning a new column as the dataset id 

In [None]:
initFile.reset_index(inplace=True)

In [None]:
initFile['colid'] = initFile.index 

In [None]:
initFile.drop('id',axis=1,inplace=True)

In [None]:
initFile.rename(columns={'colid':'id'},inplace=True)
initFile.id +=1

In [None]:
initFile.set_index('id',inplace=True)
initFile.head(1)

##### <font color='blue'> Cleaning the dataset to make sure we have only CA related values

In [None]:
df_cleansed = initFile.copy()

In [None]:
### Normalise the state column so that every California listing carries the code 'CA':
### trim stray whitespace, upper-case every value, then fold the spelled-out name into 'CA'.
### The whole column is assigned at once; the former `df_cleansed.state.loc[mask] = ...`
### was a chained assignment, which pandas warns about and which has no effect under
### Copy-on-Write.
df_cleansed['state'] = (
    df_cleansed['state']
    .str.strip()
    .str.upper()
    .replace('CALIFORNIA', 'CA')
)

In [None]:
## As we can see there are some unwanted & non-CA entries in the state column
df_cleansed.state.value_counts()

##### <font color = 'blue'> We need to remove the non-CA values from the dataset

In [None]:
## ids of every listing whose state is anything other than 'CA' (missing states included)
non_ca_ids = df_cleansed.index[df_cleansed['state'].ne('CA').to_numpy()]
df_cleansed.drop(index=non_ca_ids, inplace=True)

##### <font color='blue'> Convert the city column to lower case

In [None]:
df_cleansed['city'] = df_cleansed['city'].str.lower()
## NOTE: the former `df_cleansed.city.dropna(inplace=True)` was removed from this cell.
## It acted on a temporary Series only, so it never removed a row from df_cleansed.
## Display the distinct city spellings (a cell shows only its last expression).
df_cleansed['city'].unique()

##### <font color='blue'> Cleaning the city column and dropping NaN values in city column 

In [None]:
## Strip every run of non-ASCII characters from the city names. The pattern is a regular
## expression, so regex=True has to be explicit: from pandas 2.0 on, str.replace treats the
## pattern as a literal string by default and would leave the names unchanged.
df_cleansed['city'] = df_cleansed['city'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
## Drop the listings that have no city. dropna must run on the DataFrame itself;
## `df_cleansed.city.dropna(inplace=True)` only changes a temporary Series.
df_cleansed.dropna(subset=['city'], inplace=True)
df_cleansed['city'].value_counts()

In [None]:
df_cleansed.groupby('city')['neighbourhood_cleansed'].value_counts()

In [None]:
df_cleansed.smart_location.shape

In [None]:
df_cleansed.smart_location = df_cleansed.smart_location.str.lower()

In [None]:
df_cleansed.isnull().sum()

##### <font color='blue'> Cleaning the smart_location column, removing the non-English characters in the smart_location column 

In [None]:
## smart_location values written in Chinese characters; listings carrying one of them are removed
non_english_locations = [
    '蒙特利帕克, ca',
    '因達斯特里, ca',
    '艾尔蒙地, ca',
    '沃尔纳特, ca',
    '哈仙达岗, ca',
    '罗兰高地, ca',
    '罗兰岗, ca',
    '天普市, ca',
    '洛杉矶, ca',
]
has_non_english_location = df_cleansed['smart_location'].isin(non_english_locations)
df_cleansed.drop(index=df_cleansed.index[has_non_english_location.to_numpy()], inplace=True)

##### <font color='blue'> Replacing ', ca' in smart_location column so that it is uniform with the other location related columns 

In [None]:
## Remove the trailing state suffix (", ca" or ", california") so that smart_location holds
## only the place name. The pattern is anchored to the end of the string: the former
## unanchored ', ca' also removed the start of ", california" and of any later word that
## begins with "ca".
df_cleansed['smart_location'] = (
    df_cleansed['smart_location']
    .str.replace(r',\s*(?:ca|california)\s*$', '', regex=True)
    .str.strip()
)
## Removing the surrounding spaces from neighbourhood_cleansed
df_cleansed['neighbourhood_cleansed'] = df_cleansed['neighbourhood_cleansed'].str.strip()

In [None]:
## Convert smart_location and neighbourhood_cleansed columns into lower case
df_cleansed.smart_location = df_cleansed.smart_location.str.lower()
df_cleansed.neighbourhood_cleansed = df_cleansed.neighbourhood_cleansed.str.lower() 
df_cleansed.head(2)

### <font color='Red'><b> Interesting Fact 1</b></font>

#### <font color='green'>Q) Make a dataset copy for Interesting Fact 1 which is focused on:
<font color='green'>1)Affordability analysis: How do prices of listings vary by location</font><br>
<font color='green'>2)Building an autoML solution and testing to see if it is a good solution to invest in for making the priceType tags for the listings.</font>


##### <div class="alert alert-block alert-success">Fact 1 insights:<br><li>For the Low price range,almost all of the prices in Low price range are located in Hollywood.<br>Other top locations with maximum low price range are: Venice, pico-union, Downtown and Long Beach.<br>The average price in the Low range is around 100 dollars.</li><br><li>For the Mid price range,almost all of the prices in the Mid price range are located in Venice.<br>Other top locations with maximum mid price range are: Hollywood, Downtown and Long Beach.<br>The average price in the Mid range is around 150-200 dollars.</li><br><li>As observed above,Almost all of the prices in the High price range are located in Venice and Beverly grove.Venice seems a real versatile location for Airbnb listings.<br>Other top locations with maximum high price range are: Hollywood hills West, Malibu and Avalon.<br>The average price in the High range is around $550-800.</li></div>

### <font color='black'> 1) Affordability analysis: How do prices of listings vary by location

In [None]:
## Columns of the cleansed listings that the affordability analysis works with
fact1_columns = [
    'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'amenities', 'neighbourhood_cleansed',
    'state', 'zipcode', 'latitude', 'longitude',
]
## Independent copy of those columns, with neighbourhood_cleansed exposed as "location"
df_fact1 = df_cleansed[fact1_columns].copy()
df_fact1 = df_fact1.rename(columns={'neighbourhood_cleansed': 'location'})

In [None]:
df_fact1.info()

In [None]:
df_fact1.isna().sum()
## We see that we have NaN values in the security_deposit, cleaning_fee, zipcode columns 
## So, lets clean these columns 

In [None]:
## Lets see what the amenities column holds..
df_fact1.groupby('amenities')['price'].value_counts().nlargest(3)

##### <font color='blue'> Cleaning the Amenities column and converting it to lower case

In [None]:
## Remove the surrounding braces and the double quotes, then lower-case the amenities text
df_fact1['amenities'] = (
    df_fact1['amenities']
    .str.strip('{}')
    .str.replace('"', '', regex=False)
    .str.lower()
)

##### <font color='blue'> Trying to understand which are the more frequently used values in the amenities column 

In [None]:
## The 12 most frequent amenities. Each listing's text is split into its own amenities
## before counting: joining all listings into one big string (as before) glued the last
## amenity of one listing onto the first amenity of the next, and raised on listings whose
## amenities are missing.
(
    df_fact1['amenities']
    .dropna()
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda amenity: amenity.ne('')]  # listings with an empty amenities text contribute nothing
    .value_counts()
    .nlargest(12)
)

##### <font color='blue'> Creating new columns around the most frequently used values in the Amenities column 

In [None]:
## Taking the most sought after amenities by customers and creating new columns for them
amenity_columns = [
    'wifi', 'ac', 'pet', 'park', 'htub', 'dishwasher', 'bbq', 'pool',
    'hr', 'coffee', 'heating', 'iron', 'washer', 'dryer', 'parking',
]
## Every new column starts out as an empty string for all listings
for amenity_column in amenity_columns:
    df_fact1[amenity_column] = ''

##### <font color='blue'> If an amenity is present in a listing's Amenities text, store the amenity's name in its column; otherwise leave the column empty

In [None]:
## For every amenity of interest, store the amenity's tag in its column when the listing's
## (lower-cased, comma-separated) amenities text mentions it, and an empty string otherwise.
## Entry layout: column -> (text or pattern to search for, tag stored on a match, is it a regex?)
amenity_searches = {
    'wifi': ('wifi', 'wifi', False),
    'ac': ('air conditioning', 'air conditioning', False),
    'pet': ('pet', 'pet', False),
    'park': ('park', 'park', False),
    'htub': ('hot tub', 'hot tub', False),
    'dishwasher': ('dishwasher', 'dishwasher', False),
    'bbq': ('bbq', 'BBQ', False),
    'pool': ('pool', 'pool', False),
    'hr': ('24-hour check-in', '24-hour check-in', False),
    'coffee': ('coffee', 'coffee', False),
    'heating': ('heating', 'heating', False),
    'iron': ('iron', 'iron', False),
    ## These two must match a whole comma-separated entry: a plain substring search for
    ## "washer" also hits "dishwasher", and "dryer" also hits "hair dryer".
    'washer': (r'(?:^|,)\s*washer\s*(?:,|$)', 'washer', True),
    'dryer': (r'(?:^|,)\s*dryer\s*(?:,|$)', 'dryer', True),
    'parking': ('parking', 'parking', False),
}
for amenity_column, (needle, tag, is_regex) in amenity_searches.items():
    ## na=False: a listing without amenities text simply does not mention the amenity
    mentioned = df_fact1['amenities'].str.contains(needle, regex=is_regex, na=False)
    df_fact1[amenity_column] = mentioned.map({True: tag, False: ''})

##### <font color='blue'> Drop the old amenities column now that each amenity has its own column (these columns are converted to 0/1 flags further below)

In [None]:
df_fact1.drop('amenities',axis=1,inplace=True)

##### <font color='blue'> Adding amenities to the Amenities column 

In [None]:
## Concatenate the per-amenity tag columns, separated by single spaces and in this fixed
## order, into one amenities string per listing (empty tags still contribute their separator)
trailing_tag_columns = [
    'ac', 'pet', 'park', 'htub', 'dishwasher', 'bbq', 'pool',
    'hr', 'coffee', 'heating', 'iron', 'washer', 'dryer', 'parking',
]
df_fact1['amenities'] = df_fact1['wifi'].str.cat(df_fact1[trailing_tag_columns], sep=' ')


##### <font color='blue'> Converting the individual amenities columns using one-hot encoding 

In [None]:
## Tag that marks a present amenity in each column; the column becomes 1 where it holds
## exactly that tag and 0 everywhere else
amenity_tags = {
    'wifi': 'wifi',
    'ac': 'air conditioning',
    'pet': 'pet',
    'park': 'park',
    'htub': 'hot tub',
    'dishwasher': 'dishwasher',
    'bbq': 'BBQ',
    'pool': 'pool',
    'hr': '24-hour check-in',
    'coffee': 'coffee',
    'heating': 'heating',
    'iron': 'iron',
    'washer': 'washer',
    'dryer': 'dryer',
    'parking': 'parking',
}
for amenity_column, tag in amenity_tags.items():
    df_fact1[amenity_column] = df_fact1[amenity_column].eq(tag).astype('int64')

In [None]:
df_fact1.isna().sum()

##### <font color='blue'> Working on the security_deposit column and adjusting its NaN values

In [None]:
## Strip the '$' sign and the thousands separator ',' from the security_deposit text
df_fact1['security_deposit'] = (
    df_fact1['security_deposit'].str.strip('$').str.replace(',', '', regex=False)
)

In [None]:
## Converting the security_deposit column to Float from str
df_fact1.security_deposit = df_fact1.security_deposit.astype('float64')

In [None]:
## Understanding the security_deposit column using describe()
df_fact1.security_deposit.describe()

In [None]:
## Replace the missing security deposits with the column mean, rounded to 2 decimals.
## The filled Series is assigned back to the column: `df_fact1.security_deposit.fillna(...,
## inplace=True)` works on an intermediate Series, which pandas deprecates and which has no
## effect on df_fact1 under Copy-on-Write.
## NOTE(review): a missing deposit may simply mean "no deposit" -- confirm the mean is the
## intended fill value.
mean_security_deposit = round(df_fact1['security_deposit'].mean(), 2)
df_fact1['security_deposit'] = df_fact1['security_deposit'].fillna(mean_security_deposit)

##### <font color='blue'> Working on the cleaning_fee column and adjusting its NaN values

In [None]:
## Understanding the cleaning_fee column using describe()
df_fact1.cleaning_fee.describe()

In [None]:
df_fact1.cleaning_fee.isna().sum()

In [None]:
## Strip the '$' sign and the thousands separator ',' from the cleaning_fee text
df_fact1['cleaning_fee'] = (
    df_fact1['cleaning_fee'].str.strip('$').str.replace(',', '', regex=False)
)

In [None]:
## Converting the cleaning_fee column to Float from str
df_fact1.cleaning_fee = df_fact1.cleaning_fee.astype('float64')

In [None]:
## Understanding the cleaning_fee column using describe()
df_fact1.cleaning_fee.describe()

In [None]:
## Replace the missing cleaning fees with the column mean, rounded to 2 decimals.
## The filled Series is assigned back to the column: `df_fact1.cleaning_fee.fillna(...,
## inplace=True)` works on an intermediate Series, which pandas deprecates and which has no
## effect on df_fact1 under Copy-on-Write.
mean_cleaning_fee = round(df_fact1['cleaning_fee'].mean(), 2)
df_fact1['cleaning_fee'] = df_fact1['cleaning_fee'].fillna(mean_cleaning_fee)

In [None]:
## Describing the cleaning_fee column after filling the NaN values in the column
df_fact1.cleaning_fee.describe()

In [None]:
df_fact1.isna().sum()

##### <font color='blue'> Checking the zipcode column and its NaN values

In [None]:
df_fact1.zipcode.isna().sum()

In [None]:
## Remove the listings without a zipcode. dropna has to be called on the DataFrame:
## `df_fact1.zipcode.dropna(inplace=True)` only changes a temporary Series and leaves
## df_fact1 untouched.
df_fact1.dropna(subset=['zipcode'], inplace=True)

In [None]:
df_fact1.shape

##### <font color='blue'> Checking the price column

In [None]:
## Strip the '$' sign and the thousands separator ',' from the price text
df_fact1['price'] = (
    df_fact1['price'].str.strip('$').str.replace(',', '', regex=False)
)

In [None]:
## Converting the price column to Float from str
df_fact1.price = df_fact1.price.astype('float64')

In [None]:
## Understanding the price column using describe()
df_fact1.price.describe()

In [None]:
## A nightly price of 0 is not a usable listing price, so those rows are removed
zero_price_ids = df_fact1.index[df_fact1['price'].eq(0).to_numpy()]
df_fact1.drop(index=zero_price_ids, inplace=True)

In [None]:
df_fact1.price.count()

##### <font color='blue'> Checking the extra_people column and making it numerical 

In [None]:
## Strip the '$' sign and the thousands separator ',' from the extra_people text
df_fact1['extra_people'] = (
    df_fact1['extra_people'].str.strip('$').str.replace(',', '', regex=False)
)

In [None]:
## Converting the extra_people column to Float from str
df_fact1.extra_people = df_fact1.extra_people.astype('float64')

In [None]:
## Understanding the extra_people column using describe()
df_fact1.extra_people.describe()

In [None]:
## Show the dtype of all three columns in one expression: a cell displays only its last
## expression, so three separate `.dtype` lines showed maximum_nights alone.
df_fact1[['guests_included', 'minimum_nights', 'maximum_nights']].dtypes
## Since the numerical columns are already of integer type we do not need to make any change

In [None]:
## Write df_fact1 to AffordabilityDataset.csv (relative path), leaving the id index out of the file
df_fact1.to_csv('AffordabilityDataset.csv',index=False)

##### <font color='blue'> Dividing the price column into 10 quantile bins (deciles), each holding roughly the same number of listings 

In [None]:
## Summarise the price deciles: one row per quantile bin with the number of listings in it,
## instead of displaying the bin of every single listing
pd.qcut(df_fact1['price'], 10).value_counts(sort=False)

##### <font color='blue'> Create a new priceType column which will hold the price range of each listing - Low, Mid or High

In [None]:
df_fact1['priceType'] = ''

##### <font color='blue'> Dividing the price column into categories and assigning them a label in the priceType column  

In [None]:
## Label every listing by its nightly price:
##   (9.999, 110] -> 'Low', (110, 500] -> 'Mid', (500, 25000] -> 'High'
## pd.cut builds right-closed bins, i.e. the same `>` / `<=` limits as before. The column is
## assigned as a whole; the former `df_fact1.priceType.loc[mask] = ...` was a chained
## assignment, which pandas warns about and which has no effect under Copy-on-Write.
price_band = pd.cut(
    df_fact1['price'],
    bins=[9.999, 110.0, 500.0, 25000.0],
    labels=['Low', 'Mid', 'High'],
)
## Prices outside every bin keep the empty-string placeholder
df_fact1['priceType'] = price_band.astype(object).where(price_band.notna(), '')

##### <font color='blue'> We have created labels based on the Airbnb prices, lets have a look at the airbnb in these particular ranges

In [None]:
df_fact1['mapPriceTags'] = ''

In [None]:
## Numeric tag per price range, using the same limits as priceType:
##   (9.999, 110] -> 1 (Low), (110, 500] -> 2 (Mid), (500, 25000] -> 3 (High)
## pd.cut builds right-closed bins, i.e. the same `>` / `<=` limits as before. The column is
## assigned as a whole; the former `df_fact1.mapPriceTags.loc[mask] = ...` was a chained
## assignment, which pandas warns about and which has no effect under Copy-on-Write.
price_tag = pd.cut(
    df_fact1['price'],
    bins=[9.999, 110.0, 500.0, 25000.0],
    labels=[1, 2, 3],
)
## Prices outside every bin keep the empty-string placeholder
df_fact1['mapPriceTags'] = price_tag.astype(object).where(price_tag.notna(), '')

In [None]:
df_fact1.head(1)

##### <font color='blue'> First, we will check for the Low range Airbnb's

In [None]:
## Listings of df_fact1 whose priceType is "Low", kept as their own dataframe
is_low_priced = df_fact1['priceType'].eq('Low')
lowRnge = df_fact1.loc[is_low_priced]
lowRnge.head(2)

##### <font color='blue'>Making the map for representing the Low ranged Airbnb listings using folium library 

In [None]:
## Cluster layer that groups nearby markers. MarkerCluster comes from folium.plugins and is
## imported together with the other libraries at the top of the notebook.
mc = MarkerCluster()

## Base map centred on the mean latitude/longitude of the low range listings
low_centre = [lowRnge['latitude'].mean(), lowRnge['longitude'].mean()]
base_low = folium.Map(location=low_centre, zoom_start=10)

## One marker per low range listing; its popup shows the per-night price
for row in lowRnge.itertuples():
    folium.Marker(location=[row.latitude, row.longitude], popup=row.price).add_to(mc)

base_low.add_child(mc)

## Saving the map in html format in the current directory with the name map_lowprices
base_low.save('map_lowprices.html')

##### <div class="alert alert-block alert-danger"> Please note, you might have to run the jupyter notebook to see the map below or refer to the map_<name>.html for map details 

In [None]:
base_low

##### <font color='blue'>Now, lets see what kind of rates are included in the price ranges - Low,Mid,High for the top  places in these categories

##### <font color='blue'> Checking the Low price range closely to see what kind of prices in the Low range 

In [None]:
## The 10 most frequent (price, location) pairs among the Low listings,
## drawn as stacked bars with one bar per price
low_listings = df_fact1[df_fact1['priceType'] == 'Low']
low_pair_counts = low_listings.groupby(['price'])['location'].value_counts()
low_1 = low_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top low range prices')
plt.show()

In [None]:
## The 10 most frequent (location, price) pairs among the Low listings,
## drawn as stacked bars with one bar per location
low_listings = df_fact1[df_fact1['priceType'] == 'Low']
low_pair_counts = low_listings.groupby(['location'])['price'].value_counts()
low_2 = low_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top low range prices')
plt.show()

##### <div class="alert alert-block alert-info">As observed above,Almost all of the prices in Low price range are located in Hollywood.<br>Other top locations with maximum low price range are: Venice, pico-union, Downtown and Long Beach.<br>The average price in the Low range is around $100   
</div> 

##### <font color='blue'> Checking the Mid price range closely to see what kind of prices in the Mid range 

In [None]:
## Listings of df_fact1 whose priceType is "Mid", kept as their own dataframe
is_mid_priced = df_fact1['priceType'].eq('Mid')
midRnge = df_fact1.loc[is_mid_priced]
midRnge.head(2)

In [None]:
## Fresh cluster layer for the mid range map (relies on the MarkerCluster import made
## earlier in the notebook)
mc = MarkerCluster()

## Base map centred on the mean latitude/longitude of the mid range listings
mid_centre = [midRnge['latitude'].mean(), midRnge['longitude'].mean()]
base_mid = folium.Map(location=mid_centre, zoom_start=10)

## One marker per mid range listing; its popup shows the per-night price
for listing_lat, listing_lon, nightly_price in zip(
    midRnge['latitude'], midRnge['longitude'], midRnge['price']
):
    mc.add_child(folium.Marker(location=[listing_lat, listing_lon], popup=nightly_price))

base_mid.add_child(mc)
## Saving the map in html format in the current directory with the name map_midprices
base_mid.save('map_midprices.html')

##### <div class="alert alert-block alert-danger"> Please note, you might have to run the jupyter notebook to see the map below or refer to the map_<name>.html for map details 

In [None]:
base_mid

##### <font color='blue'> Second, lets have a closer look into the mid range rates...

In [None]:
## The 10 most frequent (price, location) pairs among the Mid listings,
## drawn as stacked bars with one bar per price
mid_listings = df_fact1[df_fact1['priceType'] == 'Mid']
mid_pair_counts = mid_listings.groupby(['price'])['location'].value_counts()
mid_1 = mid_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top mid range prices')
plt.show()

In [None]:
## The 10 most frequent (location, price) pairs among the Mid listings,
## drawn as stacked bars with one bar per location
mid_listings = df_fact1[df_fact1['priceType'] == 'Mid']
mid_pair_counts = mid_listings.groupby(['location'])['price'].value_counts()
mid_2 = mid_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top mid range prices')
plt.show()


##### <div class="alert alert-block alert-info">As observed above,Almost all of the prices in the Mid price range are located in Venice.<br>Other top locations with maximum mid price range are: Hollywood, Downtown and Long Beach.<br>The average price in the Mid range is around $150-200   
</div> 

In [None]:
## Listings of df_fact1 whose priceType is "High", kept as their own dataframe
is_high_priced = df_fact1['priceType'].eq('High')
highRnge = df_fact1.loc[is_high_priced]
highRnge.head(2)

In [None]:
## Fresh cluster layer for the high range map (relies on the MarkerCluster import made
## earlier in the notebook)
mc = MarkerCluster()

## Base map centred on the mean latitude/longitude of the high range listings
high_centre = [highRnge['latitude'].mean(), highRnge['longitude'].mean()]
base_high = folium.Map(location=high_centre, zoom_start=10)

## One marker per high range listing; its popup shows the per-night price
for listing_lat, listing_lon, nightly_price in zip(
    highRnge['latitude'], highRnge['longitude'], highRnge['price']
):
    mc.add_child(folium.Marker(location=[listing_lat, listing_lon], popup=nightly_price))

base_high.add_child(mc)
## Saving the map in html format in the current directory with the name map_highprices
base_high.save('map_highprices.html')

##### <div class="alert alert-block alert-danger"> Please note, you might have to run the jupyter notebook to see the map below or refer to the map_<name>.html for map details 

In [None]:
base_high

##### <font color='blue'>Now, lets have a closer look into the high range rates...

In [None]:
## Location of the listing(s) priced at 25000 per night, looked up among the High listings
high_info = df_fact1.loc[df_fact1['priceType'] == 'High']
high_info.loc[high_info['price'] == 25000, 'location']

In [None]:
## The 10 most frequent (price, location) pairs among the High listings,
## drawn as stacked bars with one bar per price
high_listings = df_fact1[df_fact1['priceType'] == 'High']
high_pair_counts = high_listings.groupby(['price'])['location'].value_counts()
high_1 = high_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top high range prices')
plt.show()

In [None]:
## The 10 most frequent (location, price) pairs among the High listings,
## drawn as stacked bars with one bar per location
## NOTE(review): this cell binds the axes to high_1 again (the low/mid cells use *_1 and *_2);
## high_2 was probably intended -- confirm before renaming.
high_listings = df_fact1[df_fact1['priceType'] == 'High']
high_pair_counts = high_listings.groupby(['location'])['price'].value_counts()
high_1 = high_pair_counts.nlargest(10).unstack().plot(kind='bar', stacked=True, figsize=(10, 7))
plt.title('Top locations with top high range prices')
plt.show()

##### <div class="alert alert-block alert-info">As observed above,Almost all of the prices in the High price range are located in Venice and Beverly grove.Venice seems a real versatile location for Airbnb listings.<br>Other top locations with maximum high price range are: Hollywood hills West, Malibu and Avalon.<br>The average price in the High range is around $550-800   
</div> 

### <font color='red'><b>Interesting Fact 2</b></font>

#### <font color='green'>Q) Make a dataset copy for Interesting Fact 2 which is focused on:
<font color='green'>1) What are the different types of properties in and around LA? Do they vary by neighborhood?<br>2) Airbnb property preference based on customer reviews.<br>3) What neighbourhoods are rated highly by guests?</font>

##### <div class="alert alert-block alert-success"><li>The most popular room type in the Airbnb listings is "Entire House/apt", which means that most customers prefer to book an entire house or apartment when they select an Airbnb. The second most preferred is the "Private room" type, which means that the room is private but the house or apartment is not available to only one customer group. The least popular is the "Shared room" type of Airbnb, in which the customers may share the house or apartment with the host or other people already living at the Airbnb.<br><br></li><li>The top 5 neighbourhoods according to properties and customer reviews are: Venice, Downtown, Hollywood, Santa Monica and Long Beach.<br><br></li><li>We can see that Apartment style listings are highest in number in all the neighbourhoods. As we can see, Hollywood has the highest number of apartment style listings. In Venice, we observe that both Apartment and House styled listings are highest. In Downtown we also observe that Other types of listings like tent, camper/RV etc. are popular.<br><br></li><li>We see that for a perfect score of 10.0 in the categories accuracy, cleanliness, checkin, communication, location and value:<br></li><li>Hollywood has the highest customer ratings for Apartment style listings.</li><br><li>It is interesting to see that in Venice & Long Beach, House type listings have higher customer ratings overall in contrast to Apartment style listings.</li><br><li> Overall for the category of value, Hollywood apartments and Venice & Long Beach's houses are the customers' favorites. Even the Downtown region has a noticeable preference for apartment styled listings.<br><br></li><li> It is also observed that in Downtown customers have a varied property type experience, as the "Other" category in property_type, which includes tent, bus, camper/RV etc., has been scored high.</li> </div>

In [None]:
## Columns used for interesting Fact 2 (property types and review scores)
fact2_columns = [
    'name', 'property_type', 'room_type', 'neighbourhood_cleansed',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'number_of_reviews', 'first_review', 'last_review',
    'minimum_nights', 'price', 'reviews_per_month', 'host_response_time',
    'cancellation_policy',
]
## NOTE(review): the copy is taken from initFile, not from df_cleansed, so the CA-only
## filtering and the lower-casing done on df_cleansed do not apply here -- confirm this is intended.
df_Fact2 = initFile.loc[:, fact2_columns].copy()

In [None]:
## Understanding the number of reviews through their value_counts
df_Fact2['number_of_reviews'].value_counts()

In [None]:
df_Fact2.shape

##### <font color='blue'> Dropping the listings that have no reviews (number_of_reviews equal to 0)

In [None]:
## ids of the listings that have never been reviewed (number_of_reviews equal to 0)
unreviewed_ids = df_Fact2.index[df_Fact2['number_of_reviews'].eq(0).to_numpy()]
df_Fact2.drop(index=unreviewed_ids, inplace=True)

In [None]:
df_Fact2.shape

##### <font color='blue'>Finding out the top 5 neighbourhoods with maximum properties 

In [None]:
df_Fact2.groupby('neighbourhood_cleansed')['property_type'].count().nlargest(5)

### <div class="alert alert-block alert-info"><li>The most popular room type in the Airbnb listings is "Entire House/apt", which means that most customers prefer to book an entire house or apartment when they select an Airbnb. The second most preferred is the "Private room" type, which means that the room is private but the house or apartment is not available to only one customer group. The least popular is the "Shared room" type of Airbnb, in which the customers may share the house or apartment with the host or other people already living at the Airbnb.<br></li></div>

In [None]:
## Count plot of room_type to find out which room type is listed most often.
## The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally, so a
## bare Series as the first argument is no longer interpreted as the x variable.
room_plot = sns.countplot(x='room_type', data=df_Fact2)
room_plot.set(xlabel='Room type of the listing', ylabel='Number of Airbnb')
plt.show()

##### <font color='blue'> Finding out the top 5 neighbhourhoods for Airbnb listings 

In [None]:
## Horizontal bar chart of the 5 neighbourhoods with the most listings
top5_neighbourhood_counts = df_Fact2['neighbourhood_cleansed'].value_counts().nlargest(5)
## Ascending order so that the largest bar ends up at the top of the chart
feq = top5_neighbourhood_counts.sort_values(ascending=True)
neighbourhood_ax = feq.plot(kind='barh', figsize=(10, 8), color='salmon', width=1)
neighbourhood_ax.set_title("Number of listings by neighbourhood", fontsize=20)
neighbourhood_ax.set_xlabel('Number of listings', fontsize=12)
plt.show()

##### <font color='blue'> Now, lets see what type of property are most common 

In [None]:
df_Fact2.property_type.value_counts()

##### <font color='blue'> We have a lot of similar kind of properties therefore clubbing them into one category 

In [None]:
## Fold closely related property types into one representative category.
## A single mapping applied with Series.replace replaces the former run of chained
## `df_Fact2.property_type.loc[mask] = value` assignments, which pandas warns about and
## which have no effect under Copy-on-Write.
property_type_groups = {
    ## apartment-like listings
    'Serviced apartment': 'Apartment',
    'Aparthotel': 'Apartment',
    ## house-like listings
    'Villa': 'House',
    'Cottage': 'House',
    'Bungalow': 'House',
    'Tiny house': 'House',
    'Chalet': 'House',
    'Farm stay': 'House',
    'Cabin': 'House',
    'Dome house': 'House',
    ## guesthouse-like listings
    'Guest suite': 'Guesthouse',
    ## hotel-like listings
    'Boutique hotel': 'Hotel',
    'Bed and breakfast': 'Hotel',
    ## hostel-like listings
    'Dorm': 'Hostel',
    ## camping listings
    'Campsite': 'Camper/RV',
}
df_Fact2['property_type'] = df_Fact2['property_type'].replace(property_type_groups)

## The 5 most frequent property types after grouping
top5types = df_Fact2['property_type'].value_counts().nlargest(5).index

df_Fact2['property_type'].value_counts()

##### <font color='blue'>Taking the top 5 types of Airbnb properties 

In [None]:
top5types

##### <font color='blue'>Setting non-top properties as Other to differentiate better

In [None]:
## Keep the property type where it is one of the top 5; everything else becomes 'Other'
is_top_type = df_Fact2['property_type'].isin(top5types)
df_Fact2['property_type'] = df_Fact2['property_type'].where(is_top_type, 'Other')
df_Fact2['property_type'].value_counts()

##### <font color='blue'> Understanding what kind of properties are famous in the top 5 neighbhourhoods

In [None]:
# Sub-dataframe restricted to the five neighbourhoods of interest
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
sub_5 = df_Fact2.loc[in_top5]

# One count panel per room_type; bars are grouped by neighbourhood and coloured by property_type
viz = sns.catplot(
    data=sub_5,
    x='neighbourhood_cleansed',
    col='room_type',
    hue='property_type',
    kind='count',
    edgecolor=".6",
)
viz.set_xticklabels(rotation=90)


###### <div class="alert alert-block alert-info"> We can see that Apartment style listings are the highest in number in all the neighbourhoods. As we can see, Hollywood has the highest number of Apartment style listings. In Venice, we observe that both Apartment and House style listings are the highest. In Downtown we also observe that Other types of listings, like tent, camper/RV etc., are popular.    

#### <font color='blue'> Grouping the top 5 property types with the number of reviews

In [None]:
## Five largest property-type groups, measured by how many rows in each group have a
## non-null number_of_reviews. count() counts rows; it does not add up the reviews.
df_Fact2.groupby('property_type')['number_of_reviews'].count().nlargest(5)

In [None]:
df_Fact2.dtypes

In [None]:
property_col = df_Fact2.property_type.value_counts().nlargest(5)

##### <font color='blue'> Top 5 property types by the number of reviews

In [None]:
plt.style.use('fivethirtyeight')

## Setting the subplots and color for the piechart and barplot
fig,ax=plt.subplots(1,2,figsize=(15,8))
clr = ("blue", "forestgreen", "gold", "red", "brown")

## Listings per property type, computed once (value_counts() is already sorted descending)
count = df_Fact2['property_type'].value_counts()
top5_counts = count.iloc[:5]

## Plotting a bar plot of the top 5 property_type values, smallest bar at the bottom.
## NOTE(review): the bars are listing counts per property type, not summed reviews;
## the title and axis label below are kept as written - confirm the intended metric.
top5_counts.sort_values().plot(kind='barh',color=clr,ax=ax[0])

## Setting the x axis and title
ax[0].set_title("Top 5 property types by the number of reviews",size=20)
ax[0].set_xlabel('number_of_reviews',size=18)

## Pie chart input: the five largest groups plus one wedge for everything that is left.
## NOTE(review): if property_type was already collapsed to an 'Other' bucket, 'Other'
## can itself be one of the five largest groups, giving two wedges with that label.
groups = list(top5_counts.index)
counts = list(top5_counts)
counts.append(count.sum() - top5_counts.sum())
groups.append('Other')
type_dict = pd.DataFrame({"group":groups,"counts":counts})

## Six wedges need six colours; with five the last wedge reused the first colour
clr1 = ("brown", "red", "gold", "forestgreen", "blue", "grey")
qx = type_dict.plot(kind='pie', y='counts', labels=groups, colors=clr1,
                    autopct='%1.1f%%', pctdistance=0.9, radius=1.2, ax=ax[1])

plt.legend(loc=0, bbox_to_anchor=(1.15,0.4))
plt.subplots_adjust(wspace=0.5, hspace=0)
## Render the figure; plt.ioff() only switched interactive mode off for the session
plt.show()

##### <font color='blue'> Specific type of reviews which people look at when booking an airbnb

In [None]:
df_Fact2.review_scores_accuracy.count()

In [None]:
## Making subplots to plot the countplots: a 2 x 3 grid, one panel per review category
fig,axes= plt.subplots(nrows=2, ncols= 3,figsize=(20,20))

## Customer review count on the basis of accuracy, cleanliness, checkin,
## communication, location and value (filled row by row, left to right)
review_score_columns = [
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

## The scores are passed as x= so the call does not rely on positional-argument handling.
## The unfinished `percentage =` assignment (a SyntaxError) and the estimator= argument,
## which countplot does not take, have been removed.
for score_column, panel in zip(review_score_columns, axes.flat):
    sns.countplot(x=df_Fact2[score_column], ax=panel)


###### <div class="alert alert-block alert-info"> As observed, customers who had a good Airbnb experience rated their stay 8.0 or above. Let's take a closer look at how the top 5 neighbourhoods are rated at these high scores.

##### <font color='blue'> Understanding the customer reviews at the top 5 neighbourhoods against the property type

In [None]:
from  matplotlib.ticker import PercentFormatter

In [None]:
## Listings located in the five neighbourhoods of interest
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
a = df_Fact2.loc[df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)]

## Share of Hollywood listings whose accuracy score is exactly 9.0.
## Both totals are scalar row counts. DataFrame.count() returns one count per column,
## which made P_holly a Series instead of a single ratio.
is_hollywood = a['neighbourhood_cleansed'] == 'Hollywood'
tot_holly_9 = int((is_hollywood & (a['review_scores_accuracy'] == 9.0)).sum())
tot_holly = int(is_hollywood.sum())

## Guard against a frame that contains no Hollywood listings
P_holly = tot_holly_9 / tot_holly if tot_holly else float('nan')

## Show the computed share rather than the whole dataframe
P_holly

In [None]:
## Checking for review_scores_accuracy
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_accuracy'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
accuracy_score = pd.to_numeric(df_Fact2['review_scores_accuracy'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & accuracy_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_accuracy', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)

plt.show()

In [None]:
## Checking for review_scores_cleanliness
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_cleanliness'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
cleanliness_score = pd.to_numeric(df_Fact2['review_scores_cleanliness'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & cleanliness_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_cleanliness', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)

# NOTE(review): PercentFormatter(1000) labels each count as a percentage of 1000,
# not of the number of listings plotted - confirm this denominator is intended.
for ax in viz.axes.flat:
    ax.yaxis.set_major_formatter(PercentFormatter(1000))
plt.show()

In [None]:
## Checking for review_scores_checkin
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_checkin'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
checkin_score = pd.to_numeric(df_Fact2['review_scores_checkin'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & checkin_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_checkin', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)

# NOTE(review): PercentFormatter(1000) labels each count as a percentage of 1000,
# not of the number of listings plotted - confirm this denominator is intended.
for ax in viz.axes.flat:
    ax.yaxis.set_major_formatter(PercentFormatter(1000))
plt.show()

In [None]:
## Checking for review_scores_communication
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_communication'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
communication_score = pd.to_numeric(df_Fact2['review_scores_communication'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & communication_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_communication', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)

# NOTE(review): PercentFormatter(1000) labels each count as a percentage of 1000,
# not of the number of listings plotted - confirm this denominator is intended.
for ax in viz.axes.flat:
    ax.yaxis.set_major_formatter(PercentFormatter(1000))
plt.show()

In [None]:
## Checking for review_scores_location
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_location'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
location_score = pd.to_numeric(df_Fact2['review_scores_location'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & location_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_location', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)
# NOTE(review): PercentFormatter(1000) labels each count as a percentage of 1000,
# not of the number of listings plotted - confirm this denominator is intended.
for ax in viz.axes.flat:
    ax.yaxis.set_major_formatter(PercentFormatter(1000))
plt.show()

In [None]:
## Checking for review_scores_value
## (not the last expression of the cell, so the result is computed but not displayed)
df_Fact2.groupby('neighbourhood_cleansed')['review_scores_value'].value_counts().nlargest(5)

#grabbing top 5 neighbourhoods for sub-dataframe, keeping only scores of 9 or 10.
#Scores are coerced to numbers before matching: string literals such as '9.0' do not
#match float values in isin(), so the filter now works for float or numeric-string data.
top5_neighbourhoods = ['Venice', 'Hollywood', 'Downtown', 'Long Beach', 'Santa Monica']
in_top5 = df_Fact2['neighbourhood_cleansed'].isin(top5_neighbourhoods)
value_score = pd.to_numeric(df_Fact2['review_scores_value'], errors='coerce')
sub_5 = df_Fact2.loc[in_top5 & value_score.isin([9.0, 10.0])]

#using catplot to represent multiple interesting attributes together and a count
viz = sns.catplot(x='neighbourhood_cleansed',
                  col='review_scores_value', data=sub_5,
                  kind='count', hue='property_type', edgecolor=".6")
viz.set_xticklabels(rotation=90)

###### <div class="alert alert-block alert-info">We see that for a perfect score of 10.0 in the categories accuracy, cleanliness, checkin, communication, location and value:<br><li>Hollywood has the highest customer ratings for Apartment style listings.</li><br><li>It is interesting to see that in Venice & Long Beach, House type listings have higher customer ratings overall, in contrast to Apartment style listings.</li><br><li> Overall for the category of value, Hollywood's apartments and Venice & Long Beach's houses are the customers' favorites. Even the Downtown region has a noticeable preference for Apartment style listings.</li><br><li> It is also observed that in Downtown customers have a varied property type experience, as the "Other" category in property_type, which includes tent, bus, camper/RV etc., has been scored highly.</li>  
    

### <font color='red'> Interesting Fact 3

#### <font color='green'>Q) Make a dataset copy for Interesting Fact 3 which is focused on:
<font color='green'> Predicting where and what kind of listing a Host should invest in for a profitable Airbnb business.
</font>

###### <div class="alert alert-block alert-success"> Using a decision tree class model we have come to a conclusion that there is a strong relationship between the f and t v c 
  This model has 64% accuracy.
    The host can refer to this when making investment decisions.
    There is little to no correlation between the price and the following variables.

In [None]:
#importing libraries for the decision tree classification model 
import sklearn as sk
import sklearn.tree as tree
from IPython.display import Image  
import pydotplus

In [None]:
## Taking a copy of the original file 
df_host = initFile.copy()

##### <font color='blue'> Cleaning the df_host dataset as we did for df_fact1 and df_Fact2 

In [None]:
### Normalising the state column: 'California' is rewritten as 'CA' and every value is
### upper-cased. The result is assigned back to the frame; writing through
### df_host.state.loc[...] is chained assignment and is not reliable under Copy-on-Write.
df_host['state'] = df_host['state'].replace('California', 'CA').str.upper()

In [None]:
## Keep only California: collect the index labels of every listing whose state is not
## 'CA' (missing states included) and remove those rows from df_host in place
non_ca_ids = df_host[df_host.state.values != 'CA'].index
df_host.drop(index=non_ca_ids, inplace=True)

In [None]:
## Lower-case the city names
df_host['city'] = df_host['city'].str.lower()

## Remove the listings that have no city. dropna() has to be called on the frame with
## subset=; calling it in place on the extracted column leaves df_host unchanged.
df_host.dropna(subset=['city'], inplace=True)

In [None]:
## Strip every run of non-ASCII characters from the city names. regex=True is explicit
## because pandas >= 2.0 treats the pattern as a literal string by default.
df_host['city'] = df_host['city'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

## Remove the listings that have no city (row-wise; an in-place dropna on the extracted
## column does not change df_host)
df_host.dropna(subset=['city'], inplace=True)

##### <font color='blue'> Cleaning up all the location related columns 

In [None]:
## Trim and lower-case both location columns first, so that the case-sensitive
## comparisons and the ', ca' suffix handling below see normalised values
df_host['neighbourhood_cleansed'] = df_host['neighbourhood_cleansed'].str.strip().str.lower()
df_host['smart_location'] = df_host['smart_location'].str.strip().str.lower()

## Drop the listings whose smart_location is one of these non-Latin place names.
## This must happen while the ', ca' suffix is still present, otherwise none of the
## values below can match.
non_latin_locations = [
    '蒙特利帕克, ca',
    '因達斯特里, ca',
    '艾尔蒙地, ca',
    '沃尔纳特, ca',
    '哈仙达岗, ca',
    '罗兰高地, ca',
    '罗兰岗, ca',
    '天普市, ca',
    '洛杉矶, ca',
]
df_host.drop(df_host[df_host['smart_location'].isin(non_latin_locations)].index,
             inplace=True)

## Cleaning the smart_location column: remove the trailing ', ca' state suffix
## (anchored at the end so text inside a place name is never touched)
df_host['smart_location'] = (
    df_host['smart_location'].str.replace(r', ca$', '', regex=True).str.strip()
)

##### <font color='blue'> Cleaning the listing price related columns 

In [None]:
def _money_to_float(column):
    """Convert a Series of currency strings such as '$1,250.00' to float64.

    Strips '$' from both ends, removes ',' thousands separators and casts the
    result; missing values stay NaN.
    """
    return column.str.strip('$').str.replace(',', '', regex=False).astype('float64')


## security_deposit and cleaning_fee: convert to float, then replace NaN with the column
## mean rounded to 2 decimals. The filled column is assigned back to the frame because
## an in-place fillna on the extracted column is not reliable under Copy-on-Write.
for fee_column in ('security_deposit', 'cleaning_fee'):
    df_host[fee_column] = _money_to_float(df_host[fee_column])
    df_host[fee_column] = df_host[fee_column].fillna(round(df_host[fee_column].mean(), 2))

## price: convert to float
df_host['price'] = _money_to_float(df_host['price'])

## Price '0' makes no sense as the listing needs to charge for per night stay and there
## are only 9 such values
df_host.drop(df_host[df_host.price==0].index,axis=0,inplace=True)

## extra_people: convert to float (NaN values are left as they are)
df_host['extra_people'] = _money_to_float(df_host['extra_people'])


##### <font color='blue'> Cleaning the important columns we are considering off their NaN values

In [None]:
## Remove the listings that have no zipcode. dropna() is called on the frame with
## subset=; calling it in place on the extracted column leaves df_host unchanged.
df_host.dropna(subset=['zipcode'], inplace=True)

##### <font color='blue'> Making sure the columns related to df_Fact2 are also clean and uniform 

In [None]:
## Dropping the listings that have zero reviews
df_host.drop(df_host[df_host.number_of_reviews.values==0].index,axis=0,inplace=True)

## Collapse fine-grained property types into broader groups with one vectorised
## replace() assigned back to the frame. Writing through df.column.loc[...] is chained
## assignment and no longer reaches the frame once pandas Copy-on-Write is active.
property_type_groups = {
    ## other apartment type listings -> Apartment
    'Serviced apartment': 'Apartment',
    'Aparthotel': 'Apartment',
    ## other house type listings -> House
    'Villa': 'House',
    'Cottage': 'House',
    'Bungalow': 'House',
    'Tiny house': 'House',
    'Chalet': 'House',
    'Farm stay': 'House',
    'Cabin': 'House',
    'Dome house': 'House',
    'Townhouse': 'House',
    ## other guesthouse type listings -> Guesthouse
    'Guest suite': 'Guesthouse',
    ## other hotel type listings -> Hotel
    'Boutique hotel': 'Hotel',
    'Bed and breakfast': 'Hotel',
    ## other hostel type listings -> Hostel
    'Dorm': 'Hostel',
    ## other camping type listings -> Camper/RV
    'Campsite': 'Camper/RV',
}
df_host['property_type'] = df_host['property_type'].replace(property_type_groups)

## The five most frequent property types after grouping; everything else becomes 'Other'
top5types = df_host.property_type.value_counts().nlargest(5).index

df_host.loc[~df_host.property_type.isin(top5types),'property_type'] = 'Other'

In [None]:
df_host.property_type.value_counts()

In [None]:
df_host['property_style'] = ''

In [None]:
## Turn the '' placeholder in property_style into 0, so property types that receive no
## explicit code default to 0. df_host is used because df_Fact3 is only created further
## down the notebook; referencing it here raises NameError on a fresh kernel.
df_host['property_style'] = df_host['property_style'].replace('', np.nan).fillna(0)


In [None]:
## Numeric code for each property type; rows of any other type keep their current
## property_style value. df.loc[mask, column] writes to the frame itself, whereas
## df.column.loc[mask] is chained assignment.
property_style_codes = {
    'House': 1,
    'Apartment': 2,
    'Guesthouse': 3,
    'Condominium': 4,
    'Other': 0,
}
for property_type_name, style_code in property_style_codes.items():
    df_host.loc[df_host['property_type'] == property_type_name, 'property_style'] = style_code


In [None]:
## Distribution of the encoded property styles. df_host is inspected because df_Fact3
## does not exist yet at this point of the notebook.
df_host.property_style.value_counts()

In [None]:
df_host['priceRange'] = ''

In [None]:
## Price bands as (exclusive lower bound, inclusive upper bound, code):
## 0 = Low, 1 = Mid, 2 = High. Prices outside every band keep their current priceRange.
price_bands = [
    (9.999, 110.0, 0),      # Low
    (110.0, 500.0, 1),      # Mid
    (500.0, 25000.0, 2),    # High
]
for lower_bound, upper_bound, band_code in price_bands:
    in_band = (df_host['price'] > lower_bound) & (df_host['price'] <= upper_bound)
    # .loc on the frame avoids chained assignment through df_host.priceRange.loc[...]
    df_host.loc[in_band, 'priceRange'] = band_code

##### <font color='blue'> Let's predict where a future Airbnb host should invest in the top 5 neighbourhoods

In [None]:
## Independent copy of the four columns used by the first model
model_columns = ['property_style', 'priceRange',
                 'guests_included', 'neighbourhood_cleansed']
df_Fact = df_host.loc[:, model_columns].copy()

## reset_index() returns a new frame that is only displayed; df_Fact keeps its index
df_Fact.reset_index()

In [None]:
## Rows of df_Fact that lie in the five (lower-cased) neighbourhoods of interest
top5_neighbourhoods = ['venice', 'hollywood', 'downtown', 'long beach', 'santa monica']
in_top5 = df_Fact['neighbourhood_cleansed'].isin(top5_neighbourhoods)
df_Fact3 = df_Fact.loc[in_top5].copy()
df_Fact3.head()

In [None]:
df_Fact3.reset_index(inplace=True)
df_Fact3.head(1)

In [None]:
df_Fact3.drop('id',axis=1,inplace=True)

In [None]:
#df_Fact3['neighbourhood_group'] = ''

In [None]:
'''
df_Fact3.neighbourhood_group.loc[df_Fact3.neighbourhood_cleansed\
                                   =='venice'] = 0

df_Fact3.neighbourhood_group.loc[df_Fact3.neighbourhood_cleansed\
                                   =='hollywood'] = 1

df_Fact3.neighbourhood_group.loc[df_Fact3.neighbourhood_cleansed\
                                   =='downtown'] = 2

df_Fact3.neighbourhood_group.loc[df_Fact3.neighbourhood_cleansed\
                                   =='long beach'] = 3

df_Fact3.neighbourhood_group.loc[df_Fact3.neighbourhood_cleansed\
                                   =='santa monica'] = 4
'''

In [None]:
df_Fact3.info()

In [None]:
df_dt = pd.get_dummies(df_Fact3, columns=['neighbourhood_cleansed'])
df_dt.head(3)

In [None]:
## Feature matrix: every column of df_dt except the dependent variable priceRange
X = df_dt.drop(columns=['priceRange'])

In [None]:
## Creating the dependent variable from the priceRange column. Y is a series.
## priceRange starts out as a column of '' and is later filled with the codes 0/1/2, so
## it is cast to int here: scikit-learn classifiers reject object-dtype numeric labels.
## The cast raises ValueError if any listing is still unbanded ('').
Y = df_dt.priceRange.astype(int)

In [None]:
## Train/test split for the decision tree model
from sklearn.model_selection import train_test_split

## 70% of the rows go to training and 30% to testing; random_state fixes the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
## Defining the decision tree classifier to maximum depth of 3. random_state is fixed
## (matching the train/test split) so the fitted tree is the same on every run.
dt = tree.DecisionTreeClassifier(max_depth=3, random_state=0)

In [None]:
#Fitting the model 
dt.fit(X_train,Y_train)

In [None]:
# Visualise the decision tree dt, trained with the attributes in X and the class labels in Y
dt_feature_names = list(X.columns)  # feature names in the column order used for fitting
# Class names must follow the classifier's own (sorted) class order. Y.unique() lists
# the labels in order of first appearance, which can attach the wrong name to a class.
dt_target_names = [str(label) for label in dt.classes_]
tree.export_graphviz(dt, out_file='tree.dot',
    feature_names=dt_feature_names, class_names=dt_target_names,
    filled=True)
graph = pydotplus.graph_from_dot_file('tree.dot')  # load the exported tree
Image(graph.create_png())

##### <font color='blue'> Understanding the important features for the model 

In [None]:
importances = dt.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Rearrange feature names so they match the sorted feature importances.
# The positions index the columns of X (the matrix the tree was fitted on). df_dt still
# contains the target column priceRange, so its column positions are shifted against X.
names = [X.columns[i] for i in indices]

# Create plot
plt.figure()

# Create plot title
plt.title("Feature Importance")

# Add bars
plt.bar(range(X.shape[1]), importances[indices])

# Add feature names as x-axis labels
plt.xticks(range(X.shape[1]), names, rotation=90)

# Show plot
plt.show()

In [None]:
dt.predict(X_test)

##### <font color='blue'> Accuracy of the prediction model 

In [None]:
(dt.predict(X_test) == Y_test).mean()

In [None]:
y_pred = dt.predict(X_test)

In [None]:
dt.predict_proba(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as met

In [None]:
confusion_matrix(Y_test,y_pred)

In [None]:
(y_pred == Y_test).mean()

In [None]:
met.accuracy_score(Y_test, y_pred)

In [None]:
Y_test.mean()

In [None]:
## Independent copy of the columns used by the second model
second_model_columns = [
    'room_type',
    'price',
    'neighbourhood_cleansed',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]
df_new = df_host.loc[:, second_model_columns].copy()

In [None]:
df_new['priceRange'] = ''

In [None]:
## Price bands as (exclusive lower bound, inclusive upper bound, code):
## 0 = Low, 1 = Mid, 2 = High. Prices outside every band keep their current priceRange.
## Both sides of each test read df_new['price'] (the masks used to mix df_new and
## df_host), and df.loc[mask, column] replaces the chained df_new.priceRange.loc[...].
price_bands = [
    (9.999, 110.0, 0),      # Low
    (110.0, 500.0, 1),      # Mid
    (500.0, 25000.0, 2),    # High
]
for lower_bound, upper_bound, band_code in price_bands:
    in_band = (df_new['price'] > lower_bound) & (df_new['price'] <= upper_bound)
    df_new.loc[in_band, 'priceRange'] = band_code

In [None]:
df_new.drop(columns=['price'],axis=1,inplace=True)

In [None]:
df_new.room_type.isna().sum()

In [None]:
df_new.minimum_nights.isna().sum()

In [None]:
df_new.availability_365.value_counts()

In [None]:
df_new.head()

In [None]:
df_new.info()

In [None]:
## Rows of df_new that lie in the five (lower-cased) neighbourhoods of interest
top5_neighbourhoods = ['venice', 'hollywood', 'downtown', 'long beach', 'santa monica']
in_top5 = df_new['neighbourhood_cleansed'].isin(top5_neighbourhoods)
df_Fact3_2 = df_new.loc[in_top5].copy()
df_Fact3_2.head()

In [None]:
## Pairwise correlation of the numeric columns only. The frame also holds text columns
## (room_type, neighbourhood_cleansed), which make DataFrame.corr() raise on pandas >= 2.
df_Fact3_2.select_dtypes(include='number').corr()

In [None]:
df_dt2 = pd.get_dummies(df_Fact3_2, columns=['neighbourhood_cleansed'])
df_dt2.head(3)

In [None]:
## Feature matrix: every column of df_dt2 except the dependent variable room_type
X2 = df_dt2.drop(columns=['room_type'])

In [None]:
## Creating the dependent variable from the room_type column. Y2 is a series.
Y2 = df_dt2.room_type    

In [None]:
## Train/test split for the second decision tree model
from sklearn.model_selection import train_test_split

## 70% of the rows go to training and 30% to testing; random_state fixes the shuffle
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X2, Y2, test_size=0.3, random_state=0
)

In [None]:
## Defining the decision tree classifier to maximum depth of 3. random_state is fixed
## (matching the train/test split) so the fitted tree is the same on every run.
dt2 = tree.DecisionTreeClassifier(max_depth=3, random_state=0)

In [None]:
#Fitting the model 
dt2.fit(X2_train,Y2_train)

In [None]:
# Visualise the decision tree dt2, trained with the attributes in X2 and the class labels in Y2
dt2_feature_names = list(X2.columns)  # feature names in the column order used for fitting
# Class names must follow the classifier's own (sorted) class order, not the order in
# which the labels first appear in Y2.
dt2_target_names = [str(label) for label in dt2.classes_]
# Pass the names that belong to dt2; the first model's dt_feature_names and
# dt_target_names describe a different feature set and different classes.
tree.export_graphviz(dt2, out_file='tree.dot',
    feature_names=dt2_feature_names, class_names=dt2_target_names,
    filled=True)
graph = pydotplus.graph_from_dot_file('tree.dot')  # load the exported tree
Image(graph.create_png())

In [None]:
importances2 = dt2.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances2)[::-1]

# Rearrange feature names so they match the sorted feature importances.
# The positions index the columns of X2 (the matrix the tree was fitted on). df_dt2
# still contains the target column room_type, so its column positions are shifted.
names = [X2.columns[i] for i in indices]

# Create plot
plt.figure()

# Create plot title
plt.title("Feature Importance")

# Add bars
plt.bar(range(X2.shape[1]), importances2[indices])

# Add feature names as x-axis labels
plt.xticks(range(X2.shape[1]), names, rotation=90)

# Show plot
plt.show()

In [None]:
dt2.predict(X2_test)

In [None]:
(dt2.predict(X2_test) == Y2_test).mean()

In [None]:
y2_pred = dt2.predict(X2_test)

In [None]:
dt2.predict_proba(X2_test)

In [None]:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as met

In [None]:
confusion_matrix(Y2_test,y2_pred)

In [None]:
(y2_pred == Y2_test).mean()