In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import scipy.stats as stats

In [2]:
df = pd.read_csv('https://drive.google.com/uc?export=download&id=1_YP_qbYwcLCfKfkTqVYkbupJiPR6yYdU')

In [3]:
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,1419,https://www.airbnb.com/rooms/1419,20240412141305,2024-04-13,previous scrape,Beautiful home in amazing area!,"This large, family home is located in one of T...",The apartment is located in the Ossington stri...,https://a0.muscache.com/pictures/76206750/d643...,1565,...,5.0,5.0,5.0,,f,1,1,0,0,0.06
1,8077,https://www.airbnb.com/rooms/8077,20240412141305,2024-04-13,previous scrape,Downtown Harbourfront Private Room,Guest room in a luxury condo with access to al...,,https://a0.muscache.com/pictures/11780344/141c...,22795,...,4.9,4.92,4.83,,f,2,1,1,0,0.95
2,26654,https://www.airbnb.com/rooms/26654,20240412141305,2024-04-13,city scrape,"World Class @ CN Tower, convention centre, The...","CN Tower, TIFF Bell Lightbox, Metro Convention...",There's a reason they call it the Entertainmen...,https://a0.muscache.com/pictures/81811785/5dcd...,113345,...,4.76,4.86,4.67,,f,5,5,0,0,0.26
3,27423,https://www.airbnb.com/rooms/27423,20240412141305,2024-04-13,city scrape,Executive Studio Unit- Ideal for One Person,"Brand new, fully furnished studio basement apa...",,https://a0.muscache.com/pictures/176936/b687ed...,118124,...,5.0,4.86,4.86,,f,1,1,0,0,0.17
4,30931,https://www.airbnb.com/rooms/30931,20240412141305,2024-04-13,previous scrape,Downtown Toronto - Waterview Condo,Split level waterfront condo with a breathtaki...,,https://a0.muscache.com/pictures/227971/e8ebd7...,22795,...,,,,,f,2,1,1,0,0.01


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20650 entries, 0 to 20649
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20650 non-null  int64  
 1   listing_url                                   20650 non-null  object 
 2   scrape_id                                     20650 non-null  int64  
 3   last_scraped                                  20650 non-null  object 
 4   source                                        20650 non-null  object 
 5   name                                          20650 non-null  object 
 6   description                                   20181 non-null  object 
 7   neighborhood_overview                         11103 non-null  object 
 8   picture_url                                   20650 non-null  object 
 9   host_id                                       20650 non-null 

In [5]:
df.isnull().sum()/100 

id                                               0.00
listing_url                                      0.00
scrape_id                                        0.00
last_scraped                                     0.00
source                                           0.00
                                                ...  
calculated_host_listings_count                   0.00
calculated_host_listings_count_entire_homes      0.00
calculated_host_listings_count_private_rooms     0.00
calculated_host_listings_count_shared_rooms      0.00
reviews_per_month                               53.78
Length: 75, dtype: float64

In [6]:
#Deleting all columns containing urls
df = df.drop(columns=[col for col in df.columns if 'url' in col])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20650 entries, 0 to 20649
Data columns (total 70 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20650 non-null  int64  
 1   scrape_id                                     20650 non-null  int64  
 2   last_scraped                                  20650 non-null  object 
 3   source                                        20650 non-null  object 
 4   name                                          20650 non-null  object 
 5   description                                   20181 non-null  object 
 6   neighborhood_overview                         11103 non-null  object 
 7   host_id                                       20650 non-null  int64  
 8   host_name                                     20648 non-null  object 
 9   host_since                                    20648 non-null 

In [8]:
#Filter data to focus on Toronto AirBnBs
toronto_neighbourhoods = [
    'Little Portugal', 'Waterfront Communities-The Island', 'South Riverdale', 
    'South Parkdale', 'Wexford/Maryvale', 'Rosedale-Moore Park', 'Bay Street Corridor', 
    'Church-Yonge Corridor', 'Niagara', 'Roncesvalles', 'Cabbagetown-South St.James Town', 
    'Moss Park', 'High Park North', 'Woburn', 'Don Valley Village', 'Junction Area', 
    'High Park-Swansea', 'Oakridge', 'Casa Loma', 'Thistletown-Beaumond Heights', 'Annex', 
    'Dovercourt-Wallace Emerson-Junction', 'Caledonia-Fairbank', 'Palmerston-Little Italy', 
    'Danforth', 'North St.James Town', 'Newtonbrook West', 'Playter Estates-Danforth', 
    'Blake-Jones', 'Greenwood-Coxwell', 'Regent Park', 'Flemingdon Park', 'Mount Pleasant West', 
    'Willowdale East', 'The Beaches', 'Dufferin Grove', 'University', 'Humewood-Cedarvale', 
    'Trinity-Bellwoods', 'Mount Pleasant East', 'East End-Danforth', 'Brookhaven-Amesbury', 
    'Oakwood Village', 'Mimico (includes Humber Bay Shores)', 'North Riverdale', 'Parkwoods-Donalda', 
    'Woodbine Corridor', 'Broadview North', 'Morningside', 'Kensington-Chinatown', 'Yonge-St.Clair', 
    'Old East York', 'Corso Italia-Davenport', 'Birchcliffe-Cliffside', 'Stonegate-Queensway', 
    'Agincourt South-Malvern West', 'Yonge-Eglinton', 'St.Andrew-Windfields', 'Rockcliffe-Smythe', 
    'Clanton Park', 'Willowdale West', 'Islington-City Centre West', 'Lawrence Park North', 
    'Bayview Woods-Steeles', 'Bendale', 'Mount Dennis', 'Bayview Village', 'Cliffcrest', 
    'Englemount-Lawrence', 'New Toronto', 'Agincourt North', 'Woodbine-Lumsden', 'Danforth East York', 
    'Bridle Path-Sunnybrook-York Mills', 'Etobicoke West Mall', 'Wychwood', 'Bedford Park-Nortown', 
    'Forest Hill South', 'Runnymede-Bloor West Village', 'Guildwood', "Tam O'Shanter-Sullivan", 
    "L'Amoreaux", 'Lansing-Westgate', 'Lambton Baby Point', 'Long Branch', 'Briar Hill-Belgravia', 
    'Westminster-Branson', 'Lawrence Park South', 'Hillcrest Village', 'York University Heights', 
    'Weston-Pellam Park', 'Bathurst Manor', 'Kingsway South', 'Ionview', 'Leaside-Bennington', 
    'Weston', 'Pelmo Park-Humberlea', 'Clairlea-Birchmount', 'Eglinton East', 'Yorkdale-Glen Park', 
    'Eringate-Centennial-West Deane', 'West Humber-Clairville', 'Kennedy Park', 'Black Creek', 
    'West Hill', 'Beechborough-Greenbrook', 'Keelesdale-Eglinton West', 'Rexdale-Kipling', 
    'Edenbridge-Humber Valley', 'Rouge', 'Willowridge-Martingrove-Richview', 'Alderwood', 
    "O'Connor-Parkview", 'Victoria Village', 'Pleasant View', 'Banbury-Don Mills', 'Henry Farm', 
    'Markland Wood', 'Dorset Park', 'Princess-Rosethorn', 'Kingsview Village-The Westway', 
    'Scarborough Village', 'Thorncliffe Park', 'Malvern', 'Mount Olive-Silverstone-Jamestown', 
    'Glenfield-Jane Heights', 'Highland Creek', 'Taylor-Massey', 'Elms-Old Rexdale', 
    'Forest Hill North', 'Steeles', 'Newtonbrook East', 'Downsview-Roding-CFB', 'Maple Leaf', 
    'Humbermede', 'Humber Heights-Westmount', 'Centennial Scarborough', 'Milliken', 
    'Humber Summit', 'Rustic'
]

df_filtered = df[df['neighbourhood_cleansed'].isin(toronto_neighbourhoods)]



In [9]:
df_filtered.head()

Unnamed: 0,id,scrape_id,last_scraped,source,name,description,neighborhood_overview,host_id,host_name,host_since,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,1419,20240412141305,2024-04-13,previous scrape,Beautiful home in amazing area!,"This large, family home is located in one of T...",The apartment is located in the Ossington stri...,1565,Alexandra,2008-08-08,...,5.0,5.0,5.0,,f,1,1,0,0,0.06
1,8077,20240412141305,2024-04-13,previous scrape,Downtown Harbourfront Private Room,Guest room in a luxury condo with access to al...,,22795,Kathie & Larry,2009-06-22,...,4.9,4.92,4.83,,f,2,1,1,0,0.95
2,26654,20240412141305,2024-04-13,city scrape,"World Class @ CN Tower, convention centre, The...","CN Tower, TIFF Bell Lightbox, Metro Convention...",There's a reason they call it the Entertainmen...,113345,Adela,2010-04-25,...,4.76,4.86,4.67,,f,5,5,0,0,0.26
3,27423,20240412141305,2024-04-13,city scrape,Executive Studio Unit- Ideal for One Person,"Brand new, fully furnished studio basement apa...",,118124,Brent,2010-05-04,...,5.0,4.86,4.86,,f,1,1,0,0,0.17
4,30931,20240412141305,2024-04-13,previous scrape,Downtown Toronto - Waterview Condo,Split level waterfront condo with a breathtaki...,,22795,Kathie & Larry,2009-06-22,...,,,,,f,2,1,1,0,0.01


In [10]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20650 entries, 0 to 20649
Data columns (total 70 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20650 non-null  int64  
 1   scrape_id                                     20650 non-null  int64  
 2   last_scraped                                  20650 non-null  object 
 3   source                                        20650 non-null  object 
 4   name                                          20650 non-null  object 
 5   description                                   20181 non-null  object 
 6   neighborhood_overview                         11103 non-null  object 
 7   host_id                                       20650 non-null  int64  
 8   host_name                                     20648 non-null  object 
 9   host_since                                    20648 non-null 

In [11]:
df_filtered.isnull().sum()/100 

id                                               0.00
scrape_id                                        0.00
last_scraped                                     0.00
source                                           0.00
name                                             0.00
                                                ...  
calculated_host_listings_count                   0.00
calculated_host_listings_count_entire_homes      0.00
calculated_host_listings_count_private_rooms     0.00
calculated_host_listings_count_shared_rooms      0.00
reviews_per_month                               53.78
Length: 70, dtype: float64

In [12]:
#Drop all columns with missing values
df_filtered = df_filtered.dropna(axis=1, how='all')

In [13]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20650 entries, 0 to 20649
Data columns (total 68 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20650 non-null  int64  
 1   scrape_id                                     20650 non-null  int64  
 2   last_scraped                                  20650 non-null  object 
 3   source                                        20650 non-null  object 
 4   name                                          20650 non-null  object 
 5   description                                   20181 non-null  object 
 6   neighborhood_overview                         11103 non-null  object 
 7   host_id                                       20650 non-null  int64  
 8   host_name                                     20648 non-null  object 
 9   host_since                                    20648 non-null 

In [14]:
df_filtered = df_filtered.drop(columns=['bathrooms','room_type','name','description','neighborhood_overview'])

In [15]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20650 entries, 0 to 20649
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20650 non-null  int64  
 1   scrape_id                                     20650 non-null  int64  
 2   last_scraped                                  20650 non-null  object 
 3   source                                        20650 non-null  object 
 4   host_id                                       20650 non-null  int64  
 5   host_name                                     20648 non-null  object 
 6   host_since                                    20648 non-null  object 
 7   host_location                                 15354 non-null  object 
 8   host_about                                    9651 non-null   object 
 9   host_response_time                            14116 non-null 

In [25]:
df_filtered['property_type'].unique()

array(['Entire home', 'Private room in rental unit', 'Entire condo',
       'Entire rental unit', 'Private room in condo',
       'Private room in home', 'Entire townhouse',
       'Entire serviced apartment', 'Entire loft', 'Entire guest suite',
       'Shared room in rental unit', 'Private room in townhouse',
       'Private room in guest suite', 'Entire guesthouse',
       'Private room in cottage', 'Private room in loft', 'Private room',
       'Private room in serviced apartment', 'Shared room in home',
       'Private room in guesthouse', 'Entire bungalow',
       'Shared room in condo', 'Entire place',
       'Private room in bed and breakfast', 'Private room in bungalow',
       'Shared room in townhouse', 'Private room in barn', 'Entire villa',
       'Tiny home', 'Floor', 'Private room in villa',
       'Shared room in bungalow', 'Shared room in hostel',
       'Private room in castle', 'Entire cottage',
       'Private room in hostel', 'Shared room in loft', 'Entire home/apt

In [26]:
#Define a function to categorize property types
def categorize_property_type(property_type):
    if 'entire' in property_type.lower():
        return 'Entire'
    elif 'private' in property_type.lower():
        return 'Private Room'
    elif 'shared' in property_type.lower():
        return 'Shared Room'
    else:
        return 'Others'

# Apply the function to the 'property_type' column
df_filtered['property_type_category'] = df_filtered['property_type'].apply(categorize_property_type)


In [27]:
df_filtered['property_type_category'].unique()

array(['Entire', 'Private Room', 'Shared Room', 'Others'], dtype=object)

In [28]:
df_filtered.head()

Unnamed: 0,id,scrape_id,last_scraped,source,host_id,host_name,host_since,host_location,host_about,host_response_time,...,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,property_type_category,Entire,Others,Private,Shared
0,1419,20240412141305,2024-04-13,previous scrape,1565,Alexandra,2008-08-08,"Vancouver, Canada","I live in Vancouver, Canada with my husband an...",,...,1,1,0,0,0.06,Entire,1,0,0,0
1,8077,20240412141305,2024-04-13,previous scrape,22795,Kathie & Larry,2009-06-22,"Toronto, Canada",My husband and I have been airbnb host for alm...,,...,2,1,1,0,0.95,Private Room,0,0,1,0
2,26654,20240412141305,2024-04-13,city scrape,113345,Adela,2010-04-25,,Welcome to Toronto! \r\n\r\nAfter our first me...,within a day,...,5,5,0,0,0.26,Entire,1,0,0,0
3,27423,20240412141305,2024-04-13,city scrape,118124,Brent,2010-05-04,"Toronto, Canada",I love to travel and meet new people from arou...,within an hour,...,1,1,0,0,0.17,Entire,1,0,0,0
4,30931,20240412141305,2024-04-13,previous scrape,22795,Kathie & Larry,2009-06-22,"Toronto, Canada",My husband and I have been airbnb host for alm...,,...,2,1,1,0,0.01,Entire,1,0,0,0


In [29]:
#Perform OneHotEncoding on the category
df_property_type = pd.get_dummies(df_filtered['property_type_category'], dtype=int)

In [30]:
df_property_type

Unnamed: 0,Entire,Others,Private Room,Shared Room
0,1,0,0,0
1,0,0,1,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
20645,1,0,0,0
20646,1,0,0,0
20647,1,0,0,0
20648,1,0,0,0


In [31]:
#adding back to the dataframe
df_filtered = pd.concat([df_filtered, df_property_type], axis=1)

In [32]:
df_filtered

Unnamed: 0,id,scrape_id,last_scraped,source,host_id,host_name,host_since,host_location,host_about,host_response_time,...,reviews_per_month,property_type_category,Entire,Others,Private,Shared,Entire.1,Others.1,Private Room,Shared Room
0,1419,20240412141305,2024-04-13,previous scrape,1565,Alexandra,2008-08-08,"Vancouver, Canada","I live in Vancouver, Canada with my husband an...",,...,0.06,Entire,1,0,0,0,1,0,0,0
1,8077,20240412141305,2024-04-13,previous scrape,22795,Kathie & Larry,2009-06-22,"Toronto, Canada",My husband and I have been airbnb host for alm...,,...,0.95,Private Room,0,0,1,0,0,0,1,0
2,26654,20240412141305,2024-04-13,city scrape,113345,Adela,2010-04-25,,Welcome to Toronto! \r\n\r\nAfter our first me...,within a day,...,0.26,Entire,1,0,0,0,1,0,0,0
3,27423,20240412141305,2024-04-13,city scrape,118124,Brent,2010-05-04,"Toronto, Canada",I love to travel and meet new people from arou...,within an hour,...,0.17,Entire,1,0,0,0,1,0,0,0
4,30931,20240412141305,2024-04-13,previous scrape,22795,Kathie & Larry,2009-06-22,"Toronto, Canada",My husband and I have been airbnb host for alm...,,...,0.01,Entire,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20645,1132097786104369570,20240412141305,2024-04-12,city scrape,566290809,Arjun,2024-03-10,,,,...,,Entire,1,0,0,0,1,0,0,0
20646,1132116282721989390,20240412141305,2024-04-13,city scrape,68521096,Vahid,2016-04-23,"Toronto, Canada",Art director living in Toronto with my partner...,,...,,Entire,1,0,0,0,1,0,0,0
20647,1132192910161286086,20240412141305,2024-04-13,city scrape,529130894,Marina,2023-07-31,"Toronto, Canada",,,...,,Entire,1,0,0,0,1,0,0,0
20648,1132219980412668322,20240412141305,2024-04-12,city scrape,128013575,Shahrokh,2017-04-28,"Toronto, Canada",Hi. I'm Sharok and my wife is Melika. We both ...,within an hour,...,,Entire,1,0,0,0,1,0,0,0
