In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Load the three Berlin Airbnb CSV exports (calendar, listings, reviews).
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = '/Users/Caddi/Data/BerlinAirbnb'

df_calendar = pd.read_csv(DATA_DIR + '/calendar.csv')
df_listings = pd.read_csv(DATA_DIR + '/listings.csv')
df_reviews = pd.read_csv(DATA_DIR + '/reviews.csv')

In [3]:
# Preview the first five listings to get a feel for the available columns.
df_listings.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2695,https://www.airbnb.com/rooms/2695,20190514043746,2019-05-14,Prenzlauer Berg close to Mauerpark,,In the summertime we are spending most of our ...,In the summertime we are spending most of our ...,none,,...,f,f,moderate,f,f,1,0,1,0,0.67
1,3176,https://www.airbnb.com/rooms/3176,20190514043746,2019-05-14,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,1st floor (68m2) apartment on Kollwitzplatz/ P...,This beautiful first floor apartment is situa...,none,The neighbourhood is famous for its variety of...,...,f,f,strict_14_with_grace_period,f,f,1,1,0,0,1.19
2,7071,https://www.airbnb.com/rooms/7071,20190514043746,2019-05-14,BrightRoom with sunny greenview!,Cozy and large room in the beautiful district ...,"The BrightRoom is an approx. 20 sqm (215ft²), ...",Cozy and large room in the beautiful district ...,none,"Great neighborhood with plenty of Cafés, Baker...",...,f,f,moderate,f,f,2,0,2,0,1.93
3,9991,https://www.airbnb.com/rooms/9991,20190514043746,2019-05-14,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,"THE APPARTMENT - 4 bedroom (US, Germany: 5 roo...",4 bedroom with very large windows and outstand...,none,Prenzlauer Berg is an amazing neighbourhood wh...,...,f,f,strict_14_with_grace_period,f,f,1,1,0,0,0.13
4,14325,https://www.airbnb.com/rooms/14325,20190514043746,2019-05-14,Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,The apartment is located on the south-facing s...,The apartment is located on the upper second f...,none,,...,f,f,strict_14_with_grace_period,f,f,4,4,0,0,0.21


As a first step, we explore the datasets.

In [4]:
# Start the exploration with the listings data set.
# Report its dimensions as a (rows, columns) tuple.
listings_shape = df_listings.shape
print("Number of rows and columns: ", listings_shape)
# Count the missing values in every column.
null_counts = df_listings.isnull().sum()
print(null_counts)

Number of rows and columns:  (23536, 106)
id                                                  0
listing_url                                         0
scrape_id                                           0
last_scraped                                        0
name                                               54
summary                                          1069
space                                            8410
description                                       297
experiences_offered                                 0
neighborhood_overview                           10776
notes                                           15779
transit                                          9289
access                                          12215
interaction                                     11986
house_rules                                     11540
thumbnail_url                                   23536
medium_url                                      23536
picture_url                             

In [8]:
# Fraction of missing values per column
# (0.0 = no values missing, 1.0 = every value missing).
df_listings.isnull().mean()



id                                              0.000000
listing_url                                     0.000000
scrape_id                                       0.000000
last_scraped                                    0.000000
name                                            0.002294
summary                                         0.045420
space                                           0.357325
description                                     0.012619
experiences_offered                             0.000000
neighborhood_overview                           0.457852
notes                                           0.670420
transit                                         0.394672
access                                          0.518992
interaction                                     0.509262
house_rules                                     0.490313
thumbnail_url                                   1.000000
medium_url                                      1.000000
picture_url                    

In [27]:
# Remove uninformative columns from the listings data in two passes:
#   1. columns whose fraction of missing values is at least `thresh`
#   2. columns that contain a single distinct value
# `thresh` is a fraction between 0 and 1, not a percentage.
thresh = 0.75

# Pass 1: isna().mean() yields the fraction of missing values per column.
missing_share = df_listings.isna().mean()
droplist = list(missing_share[missing_share >= thresh].index)
df_listings_drop = df_listings.drop(droplist, axis=1)

# Pass 2: nunique() ignores NaN by default, so a column with one distinct
# value plus some missing entries is also treated as constant here.
droplist = [col for col in df_listings_drop.columns
            if df_listings_drop[col].nunique() == 1]
# Drop from df_listings_drop, not df_listings: dropping from the original
# frame would bring back every sparse column removed in pass 1.
df_listings_drop = df_listings_drop.drop(droplist, axis=1)

In [28]:
# Preview the first five rows of the pruned listings frame.
df_listings_drop.head(5)


Unnamed: 0,id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,...,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2695,https://www.airbnb.com/rooms/2695,Prenzlauer Berg close to Mauerpark,,In the summertime we are spending most of our ...,In the summertime we are spending most of our ...,,,Within walking distance you'll find the S-Bahn...,Außer deinem Zimmer kannst du noch die Küche u...,...,,f,moderate,f,f,1,0,1,0,0.67
1,3176,https://www.airbnb.com/rooms/3176,Fabulous Flat in great Location,This beautiful first floor apartment is situa...,1st floor (68m2) apartment on Kollwitzplatz/ P...,This beautiful first floor apartment is situa...,The neighbourhood is famous for its variety of...,We welcome FAMILIES and cater especially for y...,"We are 5 min walk away from the tram M2, whic...",The apartment will be entirely yours. We are c...,...,,f,strict_14_with_grace_period,f,f,1,1,0,0,1.19
2,7071,https://www.airbnb.com/rooms/7071,BrightRoom with sunny greenview!,Cozy and large room in the beautiful district ...,"The BrightRoom is an approx. 20 sqm (215ft²), ...",Cozy and large room in the beautiful district ...,"Great neighborhood with plenty of Cafés, Baker...",I hope you enjoy your stay to the fullest! Ple...,Best access to other parts of the city via pub...,"The guests have access to the bathroom, a smal...",...,,f,moderate,f,f,2,0,2,0,1.93
3,9991,https://www.airbnb.com/rooms/9991,Geourgeous flat - outstanding views,4 bedroom with very large windows and outstand...,"THE APPARTMENT - 4 bedroom (US, Germany: 5 roo...",4 bedroom with very large windows and outstand...,Prenzlauer Berg is an amazing neighbourhood wh...,,Excellent location regarding public transport ...,All amenities shared - nothing off limits,...,,f,strict_14_with_grace_period,f,f,1,1,0,0,0.13
4,14325,https://www.airbnb.com/rooms/14325,Apartment in Prenzlauer Berg,The apartment is located on the upper second f...,The apartment is located on the south-facing s...,The apartment is located on the upper second f...,,,,,...,,f,strict_14_with_grace_period,f,f,4,4,0,0,0.21
