In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the listings table and show its (rows, columns) shape
df_listings = pd.read_csv('./listings.csv')
df_listings.shape

(3818, 92)

In [2]:
# How many distinct hosts are there? (shape of the array of unique host_id values)

np.unique(df_listings['host_id']).shape


(2751,)

In [3]:
#How many superhosts are there?
def is_superhost(superhost):
    '''
    Encode one value of the host_is_superhost column as a 0/1 flag.

    INPUT
        superhost - a single value taken from the host_is_superhost column

    OUTPUT
        1 when the value equals the string 't'
        0 for any other value
    '''
    return 1 if superhost == 't' else 0
    

df_listings["host_is_superhost"].apply(is_superhost).head()  # Sanity check: the first rows should contain only 1 and 0

0    0
1    1
2    0
3    0
4    0
Name: host_is_superhost, dtype: int64

In [4]:
# Add a 0/1 'Superhost?' flag column and take its mean, i.e. the fraction of
# rows in df_listings that are flagged as superhost.
# NOTE(review): df_listings has more rows than unique host_id values, so this
# is a share of rows (presumably listings), not of distinct hosts.
df_listings['Superhost?'] = df_listings["host_is_superhost"].apply(is_superhost)
superhost_perc = df_listings['Superhost?'].mean()
superhost_perc


0.2037716081718177

About 20% of listings belong to superhosts (the mean above is taken over listing rows, not over unique hosts).

In [5]:
# Split the listings on the 0/1 'Superhost?' flag
superhost = df_listings.loc[df_listings['Superhost?'].eq(1)]  # rows flagged as superhost
notsuperhost = df_listings.loc[df_listings['Superhost?'].eq(0)]  # rows not flagged as superhost


# Peek at the first rows of each subset to confirm the flag values
print(superhost['Superhost?'].head())
print(notsuperhost['Superhost?'].head())

1     1
6     1
7     1
9     1
10    1
Name: Superhost?, dtype: int64
0    0
2    0
3    0
4    0
5    0
Name: Superhost?, dtype: int64


In [6]:
# Count how many rows of df_listings have each 'instant_bookable' value
instant_booking = df_listings['instant_bookable'].value_counts()
instant_booking

f    3227
t     591
Name: instant_bookable, dtype: int64

In [7]:
#Check your subset is correct - you should get a plot that was created using pandas styling
#which you can learn more about here: https://pandas.pydata.org/pandas-docs/stable/style.html

def test_values_for_superhost(field_to_test) :
    ed_1_perc = superhost[field_to_test].value_counts().reset_index()
    ed_1_perc.rename(columns={'index': field_to_test, field_to_test: 'count'}, inplace=True)
    ed_1_perc.set_index(field_to_test,inplace=True)
    ed_1_perc = ed_1_perc/ed_1_perc.sum()
    ed_1_perc

    ed_0_perc = notsuperhost[field_to_test].value_counts().reset_index()
    ed_0_perc.rename(columns={'index': field_to_test, field_to_test: 'count'}, inplace=True)
    ed_0_perc.set_index(field_to_test,inplace=True)
    ed_0_perc = ed_0_perc/ed_0_perc.sum()

    comp_df = pd.merge(ed_1_perc, ed_0_perc, left_index=True, right_index=True)
    comp_df.columns = [ 'superhost','nonsuperhost']

    comp_df['Diff_superHost_Vals'] = comp_df['superhost'] - comp_df['nonsuperhost']
    return comp_df  

In [8]:
# Instant bookable: share of each 'instant_bookable' value among superhost vs.
# non-superhost rows; the styled bars are drawn on the difference column.
comp_df = test_values_for_superhost('instant_bookable')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
f,0.786632,0.860197,-0.073565
t,0.213368,0.139803,0.073565


In [9]:
# Room type: share of each 'room_type' value among superhost vs. non-superhost
# rows; the styled bars are drawn on the difference column.
comp_df = test_values_for_superhost('room_type')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
Entire home/apt,0.667095,0.665132,0.001964
Private room,0.318766,0.3,0.018766
Shared room,0.014139,0.034868,-0.02073


In [10]:
# Cancellation policy: share of each 'cancellation_policy' value among superhost
# vs. non-superhost rows; the styled bars are drawn on the difference column.
comp_df = test_values_for_superhost('cancellation_policy')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
strict,0.430591,0.355921,0.07467
flexible,0.182519,0.331579,-0.14906
moderate,0.386889,0.3125,0.074389


In [11]:
# Response time: share of each 'host_response_time' value among superhost vs.
# non-superhost rows; the styled bars are drawn on the difference column.
comp_df = test_values_for_superhost('host_response_time')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
within an hour,0.656376,0.471765,0.184611
within a few hours,0.265772,0.301961,-0.036189
within a day,0.07651,0.211765,-0.135255
a few days or more,0.001342,0.01451,-0.013168


In [12]:
# Count of listings: share of each 'host_total_listings_count' value among
# superhost vs. non-superhost rows; bars are drawn on the difference column.
comp_df = test_values_for_superhost('host_total_listings_count')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
1.0,0.508997,0.586899,-0.077902
2.0,0.209512,0.150428,0.059084
3.0,0.078406,0.065833,0.012573
4.0,0.053985,0.035879,0.018106
5.0,0.044987,0.020737,0.02425
6.0,0.007712,0.013825,-0.006113
7.0,0.008997,0.007571,0.001427
10.0,0.012853,0.005267,0.007587


In [13]:
# Review scores rating, ordered by the superhost share (largest first)
comp_df = test_values_for_superhost('review_scores_rating').sort_values(by="superhost", ascending=False)
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
98.0,0.238349,0.046694,0.191655
100.0,0.17976,0.266942,-0.087182
97.0,0.173103,0.056198,0.116904
99.0,0.141145,0.016529,0.124616
96.0,0.126498,0.07562,0.050878
95.0,0.063915,0.087603,-0.023689
94.0,0.038615,0.069835,-0.03122
93.0,0.011984,0.072314,-0.06033
90.0,0.009321,0.054132,-0.044811
92.0,0.006658,0.041322,-0.034665


In [14]:
# Number of reviews: share of each 'number_of_reviews' value among superhost vs.
# non-superhost rows. No sort is applied here, so rows keep the order returned
# by test_values_for_superhost.
comp_df = test_values_for_superhost('number_of_reviews')
comp_df.style.bar(subset=['Diff_superHost_Vals'])

Unnamed: 0,superhost,nonsuperhost,Diff_superHost_Vals
0.0,0.034704,0.197368,-0.162664
1.0,0.025707,0.095395,-0.069688
2.0,0.023136,0.067763,-0.044627
3.0,0.010283,0.049013,-0.03873
4.0,0.015424,0.048684,-0.03326
5.0,0.012853,0.036513,-0.02366
6.0,0.012853,0.033553,-0.020699
7.0,0.011568,0.031579,-0.020011
10.0,0.020566,0.022368,-0.001803
9.0,0.012853,0.022039,-0.009186


### CONCLUSION: Many factors are associated with an Airbnb host being a superhost. From our analysis, we can conclude that listings with superhosts are generally entire homes/apartments, are more often instant bookable, and have higher review ratings and more reviews. Superhosts also respond faster (a larger share reply within an hour) and tend to have more listings than hosts who are not superhosts. One surprising thing we discovered was that superhosts more often use moderate or strict cancellation policies than non-superhosts.