# Data Sorting #

In [1]:
import pandas as pd
import numpy as np
import math
from statsmodels.discrete.discrete_model import Probit
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

# Load the main dataset. The file is semicolon-delimited and the literal
# strings "\N" and "Null" are read as missing values.
df = pd.read_csv('20160213_data/main_data.csv', na_values=[r'\N', 'Null'], sep=';')

# Price NA adjustment: a price of -1 is turned into NaN.
# (The `np.NaN` alias was removed in NumPy 2.0, so the mask-based form is used;
# Series.mask fills the masked positions with NaN.)
df['price'] = df['price'].mask(df['price'] == -1)

# Encode guest race and guest gender as 0/1 indicator columns.
df['guest_black'] = (df['guest_race'] == "black").astype(int)
df['guest_female'] = (df['guest_gender'] == "female").astype(int)
df['guest_male'] = (df['guest_gender'] == "male").astype(int)

# Cluster identifier for the clustered standard errors used by the authors:
# the guest's first name concatenated with the city, replaced by the group
# number of that string (ngroup) shifted up by 2.
df['name_by_city'] = df['guest_first_name'] + df['city']
df['name_by_city'] = df.groupby('name_by_city').ngroup() + 2

# Indicator columns for host race and host gender composition.
# Every category listed below must occur in the data; otherwise the lookup
# into the dummy frame raises KeyError.
binaryrace_df = pd.get_dummies(df['host_race'])
for race in ['black', 'white', 'hisp', 'asian', 'mult']:
    heading = 'host_race_' + race
    df[heading] = binaryrace_df[race]

binarygender_df = pd.get_dummies(df['host_gender'])
for gender in ['F', 'FF', 'M', 'MM', 'MF']:
    heading = 'host_gender_' + gender
    df[heading] = binarygender_df[gender]

# Same-sex flag: 1 when the host is coded MM *or* FF.
# The two categories must be combined in a single expression; assigning the
# column from MM and then again from FF would keep only the FF hosts.
df['host_gender_same_sex'] = (
    (df['host_gender_MM'] == 1) | (df['host_gender_FF'] == 1)
).astype(int)

# Categorical host age variable on a 0-4 scale:
#   0 = young, 1 = young/middle, 2 = middle, 3 = middle/old, 4 = old.
# Labels that match none of the groups (including missing values) become NaN.
# A numeric default is used on purpose: mixing the integer codes with a string
# default such as 'NA' forces NumPy to look for a common string dtype, which
# either turns the codes into text (so `== 0` never matches) or raises.
host_age_groups = [
    ['young', 'young/UU', 'UU/young', 'young/NA', 'NA/young'],
    ['middle/young', 'young/middle'],
    ['middle', 'middle/UU', 'UU/middle', 'middle/NA', 'NA/middle'],
    ['middle/old', 'old/middle'],
    ['old', 'old/UU', 'UU/old', 'old/NA', 'NA/old'],
]
df['host_age_cat'] = np.select(
    condlist=[df['host_age'].isin(labels).to_numpy() for labels in host_age_groups],
    choicelist=[0, 1, 2, 3, 4],
    default=np.nan,
)

# Boolean indicators for other host and listing characteristics.
# Comparisons against missing values evaluate to False, except the `!=` test
# used for has_cleaning_fee, which is True for a missing cleaning_fee.
df['ten_reviews'] = df['number_of_reviews'] >= 10
df['five_star_property'] = df['apt_rating'] == 5
df['multiple_listings'] = df['number_of_listings'] > 1
df['shared_property'] = df['property_setup'].isin(['Private Room', 'Shared Room'])
df['shared_bathroom'] = df['shared_property'] | (df['bathrooms'] < 1.5)
df['has_cleaning_fee'] = df['cleaning_fee'] != '.'
df['strict_cancellation'] = df['cancellation_policy'] == 'Strict'

# Age-group flags. The category codes are coerced to numbers first so the
# comparisons work whether host_age_cat holds numbers or their text form;
# anything non-numeric (e.g. 'NA') becomes NaN and matches no group.
host_age_code = pd.to_numeric(df['host_age_cat'], errors='coerce')
df['young'] = host_age_code == 0
df['middle'] = host_age_code.isin([1, 2])
df['old'] = host_age_code.isin([3, 4])

# Price position within the sample: top decile and above-median flags.
price_percentile = df['price'].rank(pct=True)
df['pricey'] = price_percentile >= 0.9
df['price_median'] = price_percentile > 0.5

# Natural log of price. Missing prices stay NaN (a `!= NaN` test cannot detect
# them, since NaN compares unequal to everything); non-positive prices, for
# which the log is undefined, are also set to NaN instead of raising.
df['log_price'] = np.log(df['price'].where(df['price'] > 0))

# Share of each population group in the census tract: group count divided by
# the tract's total population.
for group_column, share_column in [
    ('whites', 'white_proportion'),
    ('blacks', 'black_proportion'),
    ('asians', 'asian_proportion'),
    ('hispanics', 'hispanic_proportion'),
]:
    df[share_column] = df[group_column] / df['population']

# Number of properties within each census tract, and its natural log.
# The count is the number of rows sharing a census_tract value (summing a
# coordinate column such as latitude does not give a count). Rows without a
# census_tract get NaN in both columns.
tract_counts = df.groupby('census_tract').size()
df['tract_listings'] = df['census_tract'].map(tract_counts)
df['log_tract_listings'] = np.log(df['tract_listings'])

# Simplifying the categories of host responses.
# Human-readable labels for the raw host_response codes: codes 0-14 follow the
# list position, and -1 marks a host who never answered.
host_response_labels = dict(enumerate([
    "No or unavailable",
    "Yes",
    "Request for more info (Can you verify? How many people?)",
    "No, unless you verify",
    "Yes, if you verify/give more info",
    "Offers a different place",
    "Offers Lower Price If You Book Now",
    "Asks for higher price",
    "Yes if stay is extended",
    "Check back later for definitive answer",
    "I will get back to you",
    "Unsure right now",
    "Only used for events",
    "Confused (our date error)",
    "Message not sent",
]))
host_response_labels[-1] = "No response"
# Raw host_response code -> simplified response category (1-11).
# A plain "Yes" (raw code 1) belongs to simplified category 1; category 4 is
# reserved for "yes if the stay is extended" (raw code 8). Raw codes 12-14
# are mapped to NaN, i.e. they get no simplified category.
relabel_dict = {
    1: 1, 4: 2, 6: 3, 8: 4, 5: 5, 7: 6, 2: 7, -1: 9, 3: 10, 0: 11,
    9: 8, 10: 8, 11: 8,
    12: np.nan, 13: np.nan, 14: np.nan,
}
# Collapse the raw response codes into the simplified categories; codes that
# are not keys of relabel_dict come out as NaN.
raw_host_response = df['host_response']
df['simplified_host_response'] = raw_host_response.map(relabel_dict)

# Human-readable labels for the simplified response categories (1-11),
# ordered from an unconditional yes down to an unconditional no.
simplified_host_response_labels = {
    1: "Yes",
    2: "Yes, but requests more info",
    3: "Yes, with lower price if booked now",
    4: "Yes, if guest extends stay",
    5: "Yes, but in different property",
    6: "Yes, at a higher price",
    7: "Requests more information",
    8: "Not sure or check later",
    9: "No response",
    10: "No, without more information",
    11: "No",
}

# Simplified response category (1-11) -> one of the five bins used for the
# response graph. Every target must be a key of graph_bins_labels:
#   "Yes, if guest extends stay" (4) is a conditional yes (bin 2), and
#   "No, without more information" (10) is a conditional no (bin 4).
graph_relabel_dict = {
    1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 4, 8: 4, 9: 3, 10: 4, 11: 5,
}
graph_bins_labels = {
    1: "Yes",
    2: "Conditional Yes",
    3: "No Response",
    4: "Conditional No",
    5: "No",
}
# Bin the simplified categories for the graph. This must use
# graph_relabel_dict (simplified category -> bin); relabel_dict is keyed by
# the raw host_response codes and would produce the wrong bins here.
df['graph_bins'] = df['simplified_host_response'].map(graph_relabel_dict)

# Simplified variable for whether the host responded with yes or no:
# raw codes 1, 4 and 6 count as a yes (1), code 14 has no value (NaN), and
# every other code counts as not-yes (0).
# NOTE(review): codes 12 and 13 are 0 here but NaN in relabel_dict - confirm
# that this difference is intended.
yes_relabel_dict = {
    -1: 0, 0: 0, 1: 1, 2: 0, 3: 0, 4: 1, 5: 0, 6: 1, 7: 0, 8: 0, 9: 0,
    10: 0, 11: 0, 12: 0, 13: 0, 14: np.nan,
}
# Positive-response indicator derived from the raw response code; codes that
# are not keys of yes_relabel_dict come out as NaN.
raw_response_code = df['host_response']
df['yes'] = raw_response_code.map(yes_relabel_dict)

# Rename columns
#col_labels = {
#    "yes": "Positive Response",
#    "guest_white": "Guest is White",
#    "white_proportion": "Share of White Population in Census Tract",
#    "black_proportion": "Share of Black Population in Census Tract",
#    "asian_proportion": "Share of Asian Population in Census Tract",
#    "hispanic_proportion": "Share of Hispanic Population in Census Tract",
#    "bed_type": "Type of Bed",
#    "number_guests": "Number of Guests",
#    "bedrooms": "Number of Bedrooms",
#    "bathrooms": "Number of Bathrooms",
#    "cleaning_fee": "Cleaning Fee",
#    "price": "Price",
#    "log_price": "Log Price",
#    "price_median": "Price Above Median",
#    "pricey": "Price in Top Decile",
#    "apt_rating": "Apartment's Star Rating",
#    "verified_id": "Verified ID",
#    "super_host": "Super Host",
#    "guest_black": "Guest is African-American",
#    "guest_female": "Female Guest",
#    "guest_race_continuous": "Whiteness of Name",
#    "host_race_black": "Host is African American",
#    "host_race_white": "Host is White",
#    "host_race_hisp": "Host is Hispanic",
#    "host_race_asian": "Host is Asian",
#    "host_gender_F": "Host is Female",
#    "host_gender_M": "Host is Male",
#    "host_gender_MF": "Host is an Opposite-Sex Couple",
#    "host_gender_same_sex": "Host is a Same-Sex Couple",
#    "ten_reviews": "Host has 10+ Reviews",
#    "five_star_property": "Property has 5 Star Rating",
#    "multiple_listings": "Host has Multiple Listings",
#    "shared_property": "Shared Property",
#    "shared_bathroom": "Shared Bathroom",
#    "has_cleaning_fee": "Has a Cleaning Fee",
#    "strict_cancellation": "Strict Cancellation Policy",
#    "young": "Host Looks Young",
#    "old": "Host Looks Old",
#    "middle": "Host Looks Middle-Aged",
#    "price": "Top Decile in Price",
#    "log_price": "ln(Price)",
#    "tract_listings": "Airbnb Listings per Census Tract",
#    "new_number_of_listings": "Number of Listings"
#}
#df.rename(columns=col_labels, inplace=True)

# Drop Tampa and Atlanta. Tampa and Atlanta requests were all shut down by Airbnb.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[~df['city'].isin(["Tampa", "Atlanta"])].copy()

# City dummy variables: True when the row's city equals the given name.
city_dummies = {
    'baltimore': "Baltimore",
    'dallas': "Dallas",
    'los_angeles': "Los-Angeles",
    'sl': "St-Louis",
    'dc': "Washington",
}
for dummy_name, city_name in city_dummies.items():
    df[dummy_name] = df['city'] == city_name


# Attach the host-level data stored in a Stata file. No keys are given, so the
# merge is an inner join on every column name the two frames share.
df_hosts = pd.read_stata('20160213_data/hosts.dta')
df = df.merge(df_hosts)
 
#col_labels  = {
#    "any_black": "Host has at least one review from an African American guest",
#    "prop_black": "Proportion of past guests who are African American",
#    "raw_black": "Number of past guests who are African American"
#    }
#df.rename(columns=col_labels, inplace=True)

#df['filled_september'] = df.up_not_available_september.apply(lambda x: True if x == 1 else False)
#prediction = Probit(
#    missing = 'drop',
#    endog = np.asarray(df.filled_september, dtype=float), 
#    exog  = np.asarray(df[['host_race_black', 'host_race_asian', 'host_race_hisp',
#        'host_gender_M', 'log_price', 'bedrooms', 'shared_bathroom',
#        'shared_property', 'number_of_reviews', 'young',
#        'multiple_listings', 'white_proportion', 'log_tract_listings',
#        'baltimore', 'dallas', 'los_angeles', 'sl']], dtype=float)
#    ).fit()


# Show the first rows of the merged frame as the cell output.
df.head()

Unnamed: 0,host_response,response_date,number_of_messages,automated_coding,latitude,longitude,bed_type,property_type,cancellation_policy,number_guests,...,baltimore,dallas,los_angeles,sl,dc,total_guests,raw_black,prop_black,any_black,past_guest_merge
0,1,08.07.2015 03:40,,1,328.153,-967.702,,House,Moderate,6.0,...,False,True,False,False,False,64.0,0.0,0.0,0.0,matched (3)
1,0,07.07.2015 20:30,3.0,0,328.131,-967.702,Real Bed,House,Strict,4.0,...,False,True,False,False,False,21.0,0.0,0.0,0.0,matched (3)
2,4,07.07.2015 15:38,3.0,0,328.167,-967.678,,House,Moderate,6.0,...,False,True,False,False,False,13.0,1.0,0.076923,1.0,matched (3)
3,0,07.07.2015 02:46,,0,328.145,-967.687,Real Bed,House,Flexible,2.0,...,False,True,False,False,False,,0.0,0.0,0.0,master only (1)
4,4,07.07.2015 20:50,3.0,0,330.045,-968.294,Real Bed,Apartment,Flexible,3.0,...,False,True,False,False,False,19.0,2.0,0.105263,1.0,matched (3)


In [40]:
# Load the finished dataset from its Stata file.
# NOTE(review): this rebinds `df`, discarding the frame built by the cells
# above - confirm that is intended.
df = pd.read_stata('20160213_data/finished_do.dta')

In [41]:

# Linear probability models of a positive response on guest race, with
# standard errors clustered on name_by_city. Specifications 1 and 2 share one
# estimation sample; specification 3 adds listing controls and therefore uses
# its own complete-case sample.
host_columns = ['yes', 'guest_black', 'name_by_city', 'host_gender_M', 'host_race_black']
listing_columns = ['multiple_listings', 'shared_property', 'ten_reviews', 'log_price']
df_model = df[host_columns].dropna()
df_model3 = df[host_columns + listing_columns].dropna()

guest_formula = 'yes ~ guest_black'
host_formula = guest_formula + ' + host_race_black + host_gender_M'
listing_formula = host_formula + ' + multiple_listings + shared_property + ten_reviews + log_price'

fits = []
for formula, sample in [
    (guest_formula, df_model),
    (host_formula, df_model),
    (listing_formula, df_model3),
]:
    model = smf.ols(formula, data=sample)
    fits.append(model.fit(cov_type='cluster', cov_kwds={'groups': sample['name_by_city']}))
result1, result2, result3 = fits

# Side-by-side summary of the three fits; the info_dict entry formats the
# number of observations as an integer.
tble = summary_col(
    [result1, result2, result3],
    stars=True,
    float_format='%.2f',
    regressor_order=['guest_black', 'host_race_black', 'host_gender_M'],
    info_dict={'N': lambda x: "{0:d}".format(int(x.nobs))},
)
tble

0,1,2,3
,yes I,yes II,yes III
guest_black,-0.08***,-0.08***,-0.09***
,(0.02),(0.02),(0.02)
host_race_black,,0.07***,0.09***
,,(0.02),(0.02)
host_gender_M,,-0.05***,-0.05***
,,(0.01),(0.01)
Intercept,0.49***,0.50***,0.76***
,(0.01),(0.01),(0.07)
R-squared,0.01,0.01,0.04


In [42]:
# Interaction term: product of the guest_black and host_race_black columns.
df['guest_host_black'] = df['guest_black'].mul(df['host_race_black'])

## Table 3: Race Gap by Race of the Host, across all hosts, then across male and female hosts ##

In [43]:
# Race gap by race of the host: the same interaction model is fitted on all
# hosts, on male hosts, on female hosts, and on the remaining hosts (neither
# the M nor the F flag set), with errors clustered on name_by_city.
df_model = df[['yes','guest_black','name_by_city', 'guest_host_black', 'host_race_black', 'host_gender_M', 'host_gender_F']].dropna()

interaction_formula = 'yes ~ guest_black + host_race_black + guest_host_black'
is_male_host = df_model['host_gender_M'] == 1
is_female_host = df_model['host_gender_F'] == 1
host_samples = [
    df_model,
    df_model[is_male_host],
    df_model[is_female_host],
    df_model[~is_female_host & ~is_male_host],
]

fits = []
for df_model_gender in host_samples:
    model = smf.ols(interaction_formula, data=df_model_gender)
    fits.append(
        model.fit(cov_type='cluster', cov_kwds={'groups': df_model_gender['name_by_city']})
    )
result1, result2, result3, result4 = fits

# One column per host sample; the info_dict entry formats the number of
# observations as an integer.
tble = summary_col(
    [result1, result2, result3, result4],
    stars=True,
    float_format='%.2f',
    regressor_order=['guest_black', 'host_race_black', 'guest_host_black'],
    model_names=('All hosts', 'Male hosts', 'Female hosts', 'Other hosts'),
    info_dict={'N': lambda x: "{0:d}".format(int(x.nobs))},
)
tble

0,1,2,3,4
,All hosts,Male hosts,Female hosts,Other hosts
guest_black,-0.08***,-0.09***,-0.09***,-0.07**
,(0.02),(0.02),(0.02),(0.03)
host_race_black,0.06**,0.19***,-0.00,0.03
,(0.03),(0.05),(0.04),(0.09)
guest_host_black,0.01,-0.11,0.11*,-0.06
,(0.05),(0.08),(0.06),(0.14)
Intercept,0.48***,0.44***,0.50***,0.50***
,(0.01),(0.02),(0.02),(0.02)
R-squared,0.01,0.01,0.01,0.00


## Table 4. Proportion of Positive Responses by Race and Gender ##

In [51]:
# Aliases so that host and guest columns share the `<side>_<gender>` and
# `<side>_race_<race>` naming pattern used by the loop below.
df['host_male'] = df['host_gender_M']
df['host_female'] = df['host_gender_F']
df['guest_race_black'] = df['guest_black']
df['guest_race_white'] = (df['guest_black'] == 0).astype(int)

# 0/1 indicator for every side x gender x race combination, e.g.
# host_male_white = host_male == 1 and host_race_white == 1.
for gender in ['female', 'male']:
    for race in ['white', 'black']:
        for side in ['guest', 'host']:
            df[side + '_' + gender + '_' + race] = (
                (df[side + '_' + gender] == 1) & (df[side + '_race_' + race] == 1)
            ).astype(int)

# Complement of `yes`: anything that is not exactly 1 (including a missing
# value) counts as a non-positive response.
df['no'] = (df['yes'] != 1).astype(int)
host_combinations = ['host_male_white', 'host_male_black','host_female_white', 'host_female_black']
guest_combinations = ['guest_female_white', 'guest_female_black', 'guest_male_white', 'guest_male_black']

# Share of positive responses for each host group (rows) and guest group
# (columns). The denominator is every row in the cell, so a row with a missing
# `yes` counts as a non-positive response, consistent with the `no` column.
# Rows follow host_combinations; columns list male guests first.
guest_column_order = ['guest_male_white', 'guest_male_black','guest_female_white', 'guest_female_black']
is_positive = df['yes'] == 1
table3 = pd.DataFrame(
    {
        guest: [
            is_positive[(df[host] == 1) & (df[guest] == 1)].mean()
            for host in host_combinations
        ]
        for guest in guest_column_order
    },
    index=host_combinations,
)
pd.options.display.float_format = '{:,.2f}'.format
table3

Unnamed: 0,guest_male_white,guest_male_black,guest_female_white,guest_female_black
host_female_black,0.43,0.38,0.53,0.59
host_female_white,0.46,0.35,0.49,0.44
host_male_black,0.64,0.4,0.59,0.43
host_male_white,0.42,0.35,0.49,0.32


## Table 5. Are Effects Driven by Host Characteristics? ##


In [45]:
# Interaction terms: each host characteristic multiplied by guest_black.
host_interactions = {
    'shared_guest_black': 'shared_property',
    'multiple_black': 'multiple_listings',
    'ten_reviews_black': 'ten_reviews',
    'young_black': 'young',
    'any_black_gb': 'any_black',
}
for interaction_name, characteristic in host_interactions.items():
    df[interaction_name] = df[characteristic] * df['guest_black']

In [52]:
# One regression per host characteristic: yes on guest_black, the
# characteristic, and their interaction. All five fits share a single
# complete-case sample and cluster their errors on name_by_city.
characteristic_terms = [
    ('shared_property', 'shared_guest_black'),
    ('multiple_listings', 'multiple_black'),
    ('ten_reviews', 'ten_reviews_black'),
    ('young', 'young_black'),
    ('any_black', 'any_black_gb'),
]
term_columns = [name for pair in characteristic_terms for name in pair]
df_model = df[['yes', 'guest_black', 'name_by_city'] + term_columns].dropna()

fits = []
for characteristic, interaction in characteristic_terms:
    model = smf.ols(
        'yes ~ guest_black + ' + characteristic + ' + ' + interaction,
        data=df_model,
    )
    fits.append(model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']}))
result1, result2, result3, result4, result5 = fits

# Combined table; info_dict adds the observation count and adjusted R-squared.
tble = summary_col(
    [result1, result2, result3, result4, result5],
    stars=True,
    float_format='%.2f',
    regressor_order=['guest_black'] + term_columns,
    info_dict={
        'N': lambda x: "{0:d}".format(int(x.nobs)),
        'R2_adj': lambda x: "{:.3f}".format(x.rsquared_adj),
    },
)
tble
#tble.as_latex()

0,1,2,3,4,5
,yes I,yes II,yes III,yes IIII,yes IIIII
guest_black,-0.07***,-0.08***,-0.09***,-0.08***,-0.09***
,(0.02),(0.02),(0.02),(0.02),(0.02)
shared_property,0.00,,,,
,(0.01),,,,
shared_guest_black,-0.02,,,,
,(0.03),,,,
multiple_listings,,0.10***,,,
,,(0.02),,,
multiple_black,,-0.00,,,


## Table 6. Are Effects Driven by Location Characteristics? ##

In [47]:
# Interaction terms: guest_black multiplied by each location characteristic.
location_interactions = {
    'guest_black_price_median': 'price_median',
    'guest_black_pop_black': 'black_proportion',
    'guest_black_tract_listings': 'tract_listings',
    'guest_black_pr_filled': 'pr_filled',
}
for interaction_name, location_column in location_interactions.items():
    df[interaction_name] = df['guest_black'] * df[location_column]

In [48]:
# One regression per location characteristic: yes on guest_black, the
# characteristic, and their interaction. Each fit uses its own complete-case
# sample (only the columns that model needs) and clusters on name_by_city.
location_terms = [
    ('price_median', 'guest_black_price_median'),
    ('black_proportion', 'guest_black_pop_black'),
    ('tract_listings', 'guest_black_tract_listings'),
    ('pr_filled', 'guest_black_pr_filled'),
]

fits = []
for characteristic, interaction in location_terms:
    df_model = df[['yes', 'name_by_city', 'guest_black', characteristic, interaction]].dropna()
    model = smf.ols(
        'yes ~ guest_black + ' + characteristic + ' + ' + interaction,
        data=df_model,
    )
    fits.append(model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']}))
result1, result2, result3, result4 = fits

# Combined table; info_dict adds the observation count and adjusted R-squared.
tble = summary_col(
    [result1, result2, result3, result4],
    stars=True,
    float_format='%.2f',
    regressor_order=['guest_black'] + [name for pair in location_terms for name in pair],
    info_dict={
        'N': lambda x: "{0:d}".format(int(x.nobs)),
        'R2_adj': lambda x: "{:.3f}".format(x.rsquared_adj),
    },
)
tble
#tble.as_latex()

0,1,2,3,4
,yes I,yes II,yes III,yes IIII
guest_black,-0.08***,-0.08***,-0.09***,-0.12**
,(0.02),(0.02),(0.02),(0.06)
price_median,-0.06***,,,
,(0.02),,,
guest_black_price_median,0.01,,,
,(0.03),,,
black_proportion,,0.05,,
,,(0.05),,
guest_black_pop_black,,0.02,,
