# AirBnB Case
by David Keller



### Load required libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import text2emotion as te
from tqdm import tqdm
%matplotlib inline

tqdm.pandas()

### Load AirBnB Data

In [78]:
# Directory holding the Zurich CSV exports; change it here if the data moves.
DATA_DIR = './data/airbnb_zurich'

# One DataFrame per CSV file in DATA_DIR.
calendar = pd.read_csv(f'{DATA_DIR}/calendar.csv')
listings = pd.read_csv(f'{DATA_DIR}/listings.csv')
reviews = pd.read_csv(f'{DATA_DIR}/reviews.csv')

### Get an idea of the data

In [9]:
# Size of the listings table as (rows, columns).
listings.shape

(1860, 74)

In [10]:
# Size of the calendar table as (rows, columns).
calendar.shape

(678535, 7)

In [57]:
# Size of the reviews table as (rows, columns).
reviews.shape

(50245, 6)

In [8]:
# Peek at the first ten listings to see which columns are available.
listings.head(n=10)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,73282,https://www.airbnb.com/rooms/73282,20211228000812,2021-12-28,"Clean, central, quiet",Arty neighborhood<br /><br /><b>The space</b><...,,https://a0.muscache.com/pictures/481072/abd94c...,377532,https://www.airbnb.com/users/show/377532,...,4.93,4.71,4.61,,f,1,1,0,0,0.42
1,86645,https://www.airbnb.com/rooms/86645,20211228000812,2021-12-28,Stadium Letzigrund - by Airhome,Discover a boutique apartment presented by Air...,Located 300 meters to Zurich Letzigrund Stadio...,https://a0.muscache.com/pictures/miso/Hosting-...,475053,https://www.airbnb.com/users/show/475053,...,4.77,4.6,4.47,,t,18,18,0,0,0.39
2,143821,https://www.airbnb.com/rooms/143821,20211228000812,2021-12-28,marvelous LOFT in SIHLCITY Zürich,<b>The space</b><br />- 2.5 rooms on 2 floors ...,,https://a0.muscache.com/pictures/1012249/a4f34...,697307,https://www.airbnb.com/users/show/697307,...,,,,,f,1,1,0,0,
3,178448,https://www.airbnb.com/rooms/178448,20211228000812,2021-12-28,"a lovely place, top location","Very central location, 5 min walk from Bahnhof...","We live in one of the top locations of Zürich,...",https://a0.muscache.com/pictures/7d41e016-e818...,854016,https://www.airbnb.com/users/show/854016,...,4.89,5.0,4.89,,f,1,0,1,0,0.07
4,204586,https://www.airbnb.com/rooms/204586,20211228000812,2021-12-28,very nice luxury city apartment,<b>The space</b><br />share a room in a very n...,,https://a0.muscache.com/pictures/55486203/9834...,1004816,https://www.airbnb.com/users/show/1004816,...,,,,,f,1,0,1,0,
5,216395,https://www.airbnb.com/rooms/216395,20211228000812,2021-12-28,"city studio, modern meets colonial",<b>The space</b><br />Right at the most centr...,,https://a0.muscache.com/pictures/2300000/49b5f...,1116961,https://www.airbnb.com/users/show/1116961,...,,,,,f,1,1,0,0,
6,222565,https://www.airbnb.com/rooms/222565,20211228000812,2021-12-28,Bedroom overlooking the lake near,<b>The space</b><br />We offer a great room ov...,,https://a0.muscache.com/pictures/2299734/2509e...,1155866,https://www.airbnb.com/users/show/1155866,...,4.83,4.76,4.6,,t,1,0,1,0,1.8
7,227039,https://www.airbnb.com/rooms/227039,20211228000812,2021-12-28,*Luxury Penthouse in the heart of trendy Zurich*,Modern and unique penthouse apartment over thr...,Zurich is hip-circuit with the Swiss peacefuln...,https://a0.muscache.com/pictures/28325669/6d4b...,1184427,https://www.airbnb.com/users/show/1184427,...,4.93,4.93,4.86,,f,1,1,0,0,0.23
8,272841,https://www.airbnb.com/rooms/272841,20211228000812,2021-12-28,"room with balcony, city centre",<b>The space</b><br />room with balcony in the...,,https://a0.muscache.com/pictures/6f23ded0-2ade...,1427927,https://www.airbnb.com/users/show/1427927,...,4.66,4.67,4.58,,f,1,0,1,0,2.4
9,283737,https://www.airbnb.com/rooms/283737,20211228000812,2021-12-28,Best Location in Zurich Oldtown,"The confortable, clean and authentic flat is i...",In my opinion the niederdorf is the most beaut...,https://a0.muscache.com/pictures/b0b6eedd-d96c...,1477771,https://www.airbnb.com/users/show/1477771,...,4.74,4.94,4.43,,f,2,2,0,0,2.22


In [11]:
# Peek at the first five reviews (pandas' default preview size).
reviews.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,73282,1304820,2012-05-19,1787723,Jeff,Simona was an excellent hostess. The apartment...
1,73282,1448861,2012-06-10,2183393,Rick,"The apartment was fantastic - clean, beautiful..."
2,73282,1574534,2012-06-27,2343568,Joseph,I stayed in the apartment for two weeks with m...
3,73282,1745571,2012-07-19,2343192,Yvonne,The apartment was everything and more. spaciou...
4,73282,2026650,2012-08-19,2092762,Joy,Simona's place is great! It's very quiet and ...


In [12]:
# Summary statistics for the numeric columns of reviews.
# NOTE(review): the numeric columns shown are all identifiers (listing_id, id,
# reviewer_id), so mean/std carry no meaning here; count and min/max are the useful part.
reviews.describe()

Unnamed: 0,listing_id,id,reviewer_id
count,50245.0,50245.0,50245.0
mean,19082730.0,6.993185e+16,104247500.0
std,13586300.0,1.651534e+17,104138900.0
min,73282.0,306613.0,3369.0
25%,6293591.0,233329100.0,22889660.0
50%,17906820.0,456338100.0,65556740.0
75%,27501040.0,650377100.0,158268400.0
max,53803500.0,5.26643e+17,436646600.0


### Work on Calendar Dataset

Convert the `price` and `adjusted_price` columns from currency-formatted strings to floats.

In [79]:
# Strip the currency symbol and thousands separators, then parse as float.
# regex=True is required: "[$,]" is a character class, and since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, which
# would leave the values untouched and make astype(float) fail.
calendar["price"] = calendar["price"].str.replace("[$,]", "", regex=True).astype(float)
calendar["adjusted_price"] = calendar["adjusted_price"].str.replace("[$,]", "", regex=True).astype(float)

AirBnB has a mechanism to adjust prices dynamically. How does this mechanism show up in our dataset? Check whether `price` and `adjusted_price` differ at all:

In [87]:
# Row-wise gap between the listed price and the adjusted price, summarised.
calendar["price"].sub(calendar["adjusted_price"]).describe()

count    678535.000000
mean          0.142091
std           2.052814
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          89.000000
dtype: float64

In [80]:
# Summary statistics for the numeric columns of calendar.
# NOTE(review): the output shows extreme maxima (price 9999, maximum_nights around 2.1e9)
# that look like placeholder values rather than real offers — TODO confirm and
# consider filtering them before any price analysis.
calendar.describe()

Unnamed: 0,listing_id,price,adjusted_price,minimum_nights,maximum_nights
count,678535.0,678535.0,678535.0,678535.0,678535.0
mean,31850210.0,173.898396,173.756304,10.859158,3973.787
std,16690790.0,358.464046,358.40517,49.698803,2607015.0
min,73282.0,10.0,10.0,1.0,1.0
25%,16864720.0,75.0,75.0,2.0,365.0
50%,35777510.0,110.0,110.0,3.0,1125.0
75%,46542350.0,175.0,175.0,6.0,1125.0
max,54012370.0,9999.0,9999.0,730.0,2147484000.0


### Work on Reviews Dataset

Since we want to analyze the emotions expressed in the comment text, it makes sense to remove reviews with empty comments here. They cannot contribute to a better understanding of the ratings.

In [64]:
# How many reviews have no comment text at all?
reviews["comments"].isna().sum()

61

In [67]:
# Keep only the reviews that actually carry comment text.
reviews = reviews.dropna(how="any", subset=["comments"])

In [68]:
# Size of reviews after dropping the rows without comment text.
reviews.shape

(50184, 6)

### Join Datasets

In [58]:
# Attach to each review the listing it was written for.
# The previous call joined on a column named "" which does not exist; the
# matching keys are reviews.listing_id and listings.id (e.g. 73282 appears in
# both previews). Both frames also have their own `id` column, so suffixes
# keep the review id and the listing id apart.
reviews_with_listings = reviews.merge(
    listings,
    how="inner",
    left_on="listing_id",
    right_on="id",
    suffixes=("_review", "_listing"),
    validate="many_to_one",  # fail loudly if a listing id is duplicated
)
reviews_with_listings.head()

### Extract Emotions from Comments

In [14]:
# Emotion scores for the very first review comment, as a quick sanity check.
te.get_emotion(reviews["comments"].iat[0])

{'Happy': 0.26, 'Angry': 0.11, 'Surprise': 0.21, 'Sad': 0.11, 'Fear': 0.32}

In [15]:
# Print the emotion scores of the first ten comments, one dict per line.
for position in range(10):
    print(te.get_emotion(reviews["comments"].iat[position]))

{'Happy': 0.26, 'Angry': 0.11, 'Surprise': 0.21, 'Sad': 0.11, 'Fear': 0.32}
{'Happy': 0.62, 'Angry': 0.0, 'Surprise': 0.12, 'Sad': 0.12, 'Fear': 0.12}
{'Happy': 0.33, 'Angry': 0.17, 'Surprise': 0.0, 'Sad': 0.33, 'Fear': 0.17}
{'Happy': 0.29, 'Angry': 0.14, 'Surprise': 0.14, 'Sad': 0.43, 'Fear': 0.0}
{'Happy': 0.36, 'Angry': 0.14, 'Surprise': 0.07, 'Sad': 0.14, 'Fear': 0.29}
{'Happy': 0.33, 'Angry': 0.0, 'Surprise': 0.33, 'Sad': 0.17, 'Fear': 0.17}
{'Happy': 0.4, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.2, 'Fear': 0.4}
{'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.0, 'Fear': 1.0}
{'Happy': 0.5, 'Angry': 0.0, 'Surprise': 0.17, 'Sad': 0.0, 'Fear': 0.33}
{'Happy': 0.25, 'Angry': 0.12, 'Surprise': 0.0, 'Sad': 0.12, 'Fear': 0.5}


In [53]:
# Emotion labels, in the order te.get_emotion reports them.
_EMOTION_LABELS = ["Happy", "Angry", "Surprise", "Sad", "Fear"]


def compute_emotions( comment ) :
    """
    Recognize emotions in a review comment.

    Parameters:
    comment (string): comment containing text to be processed.

    Returns:
    pandas.Series: the scores returned by te.get_emotion, indexed by emotion
    label. If the comment is not a string (e.g. a missing value) or the
    extraction fails, a Series of numpy.nan with the labels Happy, Angry,
    Surprise, Sad and Fear is returned instead, so every call yields the
    same labelled shape and the results stack into one DataFrame.

    """
    # Build a fresh fallback per call so callers can never mutate a shared object.
    missing = pd.Series( np.nan, index=_EMOTION_LABELS )

    # Missing comments arrive as float NaN; skip the extraction for them.
    if not isinstance( comment, str ) :
        return missing

    try :
        return pd.Series( te.get_emotion( comment ) )
    except Exception :
        # Best effort: one unparseable comment must not abort a long run, but
        # KeyboardInterrupt/SystemExit still propagate (unlike a bare except).
        return missing

In [14]:
# Add one float column per emotion, pre-filled with NaN.
# Columns have to be created via assign()/[]: `reviews.Happy = np.nan` only
# sets a Python attribute on the DataFrame object and adds no column.
reviews = reviews.assign(
    Happy=np.nan,
    Angry=np.nan,
    Surprise=np.nan,
    Sad=np.nan,
    Fear=np.nan,
)

In [54]:
# Trial run: score only the first 20 comments, with a progress bar.
reviews["comments"].iloc[:20].progress_apply(compute_emotions)

100%|██████████| 20/20 [00:03<00:00,  5.84it/s]


Unnamed: 0,Happy,Angry,Surprise,Sad,Fear
0,0.26,0.11,0.21,0.11,0.32
1,0.62,0.0,0.12,0.12,0.12
2,0.33,0.17,0.0,0.33,0.17
3,0.29,0.14,0.14,0.43,0.0
4,0.36,0.14,0.07,0.14,0.29
5,0.33,0.0,0.33,0.17,0.17
6,0.4,0.0,0.0,0.2,0.4
7,0.0,0.0,0.0,0.0,1.0
8,0.5,0.0,0.17,0.0,0.33
9,0.25,0.12,0.0,0.12,0.5


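Take care: emotion extraction is very slow — roughly 6 comments per second in the run below, so scoring all of the roughly 50,000 comments would take well over two hours.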

In [55]:
# NOTE(review): because of head(20) this scores only the first 20 comments, and
# the resulting frame is displayed but not stored anywhere.
# TODO: for the full run, drop head(20) and keep the result (ideally cached on disk).
reviews.comments.head(20).progress_apply( compute_emotions )

100%|██████████| 20/20 [00:03<00:00,  5.72it/s]


Unnamed: 0,Happy,Angry,Surprise,Sad,Fear
0,0.26,0.11,0.21,0.11,0.32
1,0.62,0.0,0.12,0.12,0.12
2,0.33,0.17,0.0,0.33,0.17
3,0.29,0.14,0.14,0.43,0.0
4,0.36,0.14,0.07,0.14,0.29
5,0.33,0.0,0.33,0.17,0.17
6,0.4,0.0,0.0,0.2,0.4
7,0.0,0.0,0.0,0.0,1.0
8,0.5,0.0,0.17,0.0,0.33
9,0.25,0.12,0.0,0.12,0.5


### Playground