# Data Sorting #

In [1]:
import pandas as pd
import numpy as np
import math
import time
from statsmodels.discrete.discrete_model import Probit
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

df = pd.io.stata.read_stata('20160213_data/finished_do.dta')

# Reproduction of results # 

For table exportation, methods as_latex() and to_latex() of statsmodels and pandas respectively have been used. For better visuality of the paper, the method calls have been deleted for the final export of the notebook.

## Table 2 - The impact of race on likelihood of acceptance ##

In [2]:
df_model = df[['yes','guest_black','name_by_city', 'host_gender_M', 'host_race_black']].dropna()
df_model3 = df[['yes','guest_black','name_by_city', 'host_gender_M', 'host_race_black', 'multiple_listings', 'shared_property', 'ten_reviews', 'log_price']].dropna()

model = smf.ols('yes ~ guest_black', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + host_race_black + host_gender_M', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + host_race_black + host_gender_M + multiple_listings + shared_property + ten_reviews + log_price', data=df_model3)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model3['name_by_city']})


tble = summary_col([result1, result2, result3], stars=True, float_format='%.2f', regressor_order=['guest_black', 'host_race_black', 'host_gender_M', 'multiple_listings', 'shared_property', 'ten_reviews', 'log_price', 'Intercept'], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R²_adj':lambda x: "{0:.3f}".format(float(x.rsquared_adj))})
tble

0,1,2,3
,yes I,yes II,yes III
guest_black,-0.08***,-0.08***,-0.09***
,(0.02),(0.02),(0.02)
host_race_black,,0.07***,0.09***
,,(0.02),(0.02)
host_gender_M,,-0.05***,-0.05***
,,(0.01),(0.01)
multiple_listings,,,0.06***
,,,(0.01)
shared_property,,,-0.07***


## Table 3: Race Gap by Race of the Host, across all hosts, then across male and female hosts ##

In [3]:
df['guest_host_black'] = df['guest_black'] * df['host_race_black']
df_model = df[['yes','guest_black','name_by_city', 'guest_host_black', 'host_race_black', 'host_gender_M', 'host_gender_F']].dropna()

df_model_gender = df_model
model = smf.ols('yes ~ guest_black + host_race_black + guest_host_black', data=df_model_gender)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_gender['name_by_city']})

df_model_gender = df_model[df_model['host_gender_M'] == 1]
model = smf.ols('yes ~ guest_black + host_race_black + guest_host_black', data=df_model_gender)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_gender['name_by_city']})

df_model_gender = df_model[df_model['host_gender_F'] == 1]
model = smf.ols('yes ~ guest_black + host_race_black + guest_host_black', data=df_model_gender)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_gender['name_by_city']})

df_model_gender = df_model[(df_model['host_gender_F'] != 1) & (df_model['host_gender_M'] != 1)]
model = smf.ols('yes ~ guest_black + host_race_black + guest_host_black', data=df_model_gender)
result4 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_gender['name_by_city']})


tble2 = summary_col([result1, result2, result3, result4], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black', 'host_race_black', 'guest_host_black'], 
                   model_names=('All hosts', 'Male hosts', 'Female hosts', 'Other hosts'),  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R²_adj':lambda x: "{0:.3f}".format(float(x.rsquared_adj)),
                             'Implied coefficient':lambda x: "{0:.2f}".format(float(x.params['guest_black']+x.params['guest_host_black']))})
tble2

0,1,2,3,4
,All hosts,Male hosts,Female hosts,Other hosts
guest_black,-0.08***,-0.09***,-0.09***,-0.07**
,(0.02),(0.02),(0.02),(0.03)
host_race_black,0.06**,0.19***,-0.00,0.03
,(0.03),(0.05),(0.04),(0.09)
guest_host_black,0.01,-0.11,0.11*,-0.06
,(0.05),(0.08),(0.06),(0.14)
Intercept,0.48***,0.44***,0.50***,0.50***
,(0.01),(0.02),(0.02),(0.02)
R-squared,0.01,0.01,0.01,0.00


## Table 4. Proportion of Positive Responses by Race and Gender ##

In [4]:
# Format race/gender interaction columns as naming is inconsistent
df['host_male'] = df['host_gender_M']
df['host_female'] = df['host_gender_F']
df['guest_race_black']= df['guest_black']
df['guest_race_white'] = df.guest_black.apply(lambda x: 1 if x==0 else 0)

for gender in ['female', 'male']:
    for race in ['white', 'black']:
        for side in ['guest', 'host']:
            df[side+'_'+gender+'_'+race] = df.apply(lambda x: 1 if x[side+'_'+gender] == 1 and x[side+'_race_'+race] ==1 else 0, axis=1)

df['no'] = df.yes.apply(lambda x: 0 if x==1 else 1)
host_combinations = ['host_male_white', 'host_male_black','host_female_white', 'host_female_black']
guest_combinations = ['guest_female_white', 'guest_female_black', 'guest_male_white', 'guest_male_black']

# Sum of positive responses
table1 = pd.pivot_table(df, values=guest_combinations, index=host_combinations,
                   columns=['yes'], aggfunc=np.sum)
# Sum of negative responses
table2 = pd.pivot_table(df, values=guest_combinations, index=host_combinations,
                   columns=['no'], aggfunc=np.sum)
# Response rate
table3 = table1 / (table1+table2)

# Formatting
table3.drop(columns=table3.columns[::2], inplace=True)
table3 = table3.iloc[1:]
table3.reset_index(inplace=True)
table3.reindex(host_combinations)
host_combinations.reverse()
table3.index = host_combinations
table3 = table3.iloc[:,4:]
table3.columns = table3.columns.get_level_values(0)
table3 = table3[['guest_male_white', 'guest_male_black','guest_female_white', 'guest_female_black']]
host_combinations.reverse()
table3.reindex(host_combinations)
pd.options.display.float_format = '{:,.2f}'.format

table3

Unnamed: 0,guest_male_white,guest_male_black,guest_female_white,guest_female_black
host_female_black,0.43,0.38,0.53,0.59
host_female_white,0.46,0.35,0.49,0.44
host_male_black,0.64,0.4,0.59,0.43
host_male_white,0.42,0.35,0.49,0.32


## Table 5. Are Effects Driven by Host Characteristics? ##


In [5]:
df['shared_guest_black'] = df['shared_property'] * df['guest_black']
df['multiple_black'] = df['multiple_listings'] * df['guest_black']
df['ten_reviews_black'] = df['ten_reviews'] * df['guest_black']
df['young_black'] = df['young'] * df['guest_black']
df['any_black_gb'] = df['any_black'] * df['guest_black']


# This extra df is created to jointly drop NA values including 'name_by_city'. Otherwise, as 
# 'name_by_city' is only called in the .fit() function, the length of it does not match the
# exogenous variables from the model where NA values where already dropped.
df_model = df[['yes','guest_black','name_by_city', 'shared_property', 'shared_guest_black', 
               'multiple_listings', 'multiple_black', 'ten_reviews', 'ten_reviews_black',
              'young', 'young_black', 'any_black', 'any_black_gb']].dropna()


model = smf.ols('yes ~ guest_black + shared_property + shared_guest_black', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + multiple_listings + multiple_black', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + ten_reviews + ten_reviews_black', data=df_model)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + young + young_black', data=df_model)
result4 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + any_black + any_black_gb', data=df_model)
result5 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

tble5 = summary_col([result1, result2, result3, result4, result5], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black','shared_property', 'shared_guest_black', 
                        'multiple_listings', 'multiple_black', 'ten_reviews', 'ten_reviews_black',
                        'young', 'young_black', 'any_black', 'any_black_gb'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj),
                             'Implied coefficient':lambda x: "{0:.2f}".format(float(x.params[1]+x.params[3]))})
tble5

0,1,2,3,4,5
,yes I,yes II,yes III,yes IIII,yes IIIII
guest_black,-0.07***,-0.08***,-0.09***,-0.08***,-0.09***
,(0.02),(0.02),(0.02),(0.02),(0.02)
shared_property,0.00,,,,
,(0.01),,,,
shared_guest_black,-0.02,,,,
,(0.03),,,,
multiple_listings,,0.10***,,,
,,(0.02),,,
multiple_black,,-0.00,,,


## Table 6. Are Effects Driven by Location Characteristics? ##

In [6]:
df['guest_black_price_median'] = df['guest_black'] * df['price_median']
df['guest_black_pop_black'] = df['guest_black'] * df['black_proportion']
df['guest_black_tract_listings'] = df['guest_black'] * df['tract_listings']
df['guest_black_pr_filled'] = df['guest_black'] * df['pr_filled']

df_model = df[['yes', 'name_by_city', 'guest_black','price_median', 'guest_black_price_median']].dropna()
model = smf.ols('yes ~ guest_black + price_median + guest_black_price_median', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black', 'black_proportion', 'guest_black_pop_black']].dropna()
model = smf.ols('yes ~ guest_black + black_proportion + guest_black_pop_black', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black','tract_listings', 'guest_black_tract_listings']].dropna()
model = smf.ols('yes ~ guest_black + tract_listings + guest_black_tract_listings', data=df_model)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black','pr_filled', 'guest_black_pr_filled']].dropna()
model = smf.ols('yes ~ guest_black + pr_filled + guest_black_pr_filled', data=df_model)
result4 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

tble6 = summary_col([result1, result2, result3, result4], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black','price_median', 'guest_black_price_median', 'black_proportion', 
                        'guest_black_pop_black', 'tract_listings', 'guest_black_tract_listings', 'pr_filled',
                        'guest_black_pr_filled'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj)})
tble6

0,1,2,3,4
,yes I,yes II,yes III,yes IIII
guest_black,-0.08***,-0.08***,-0.09***,-0.12**
,(0.02),(0.02),(0.02),(0.06)
price_median,-0.06***,,,
,(0.02),,,
guest_black_price_median,0.01,,,
,(0.03),,,
black_proportion,,0.05,,
,,(0.05),,
guest_black_pop_black,,0.02,,


End of reproduction of main results.

# Own empirical analysis # 

## Randomization controls ##

### Regress each control on treatment variable ###

In [7]:
used_controls = ['host_gender_M', 'host_race_black', 'multiple_listings', 'shared_property', 'log_price', 
                     'host_race_black', 'host_gender_M', 'host_gender_F', 'shared_property', 'multiple_listings', 
                     'ten_reviews', 'young', 'price_median', 'black_proportion', 'guest_male',
                     'tract_listings', 'pr_filled', 'baltimore', 'dallas', 'los_angeles', 'sl', 'dc']
used_control_models = {}

print('{0:20} {1:>1} {2:>1}'.format('Control Variable', 'P-Value', 'Coeff'))
for control in used_controls:
    model = smf.ols(control + ' ~ guest_black', data=df)
    result = model.fit()
    used_control_models[control] = result
    print('{0:20} {1:>6.3f} {2:6.3f}'.format(control + ':', result.pvalues[1], result.params[1]))

Control Variable     P-Value Coeff
host_gender_M:        0.896  0.001
host_race_black:      0.972 -0.000
multiple_listings:    0.451  0.009
shared_property:      0.929  0.001
log_price:            0.792  0.005
host_race_black:      0.972 -0.000
host_gender_M:        0.896  0.001
host_gender_F:        0.439 -0.009
shared_property:      0.929  0.001
multiple_listings:    0.451  0.009
ten_reviews:          0.041  0.026
young:                0.799  0.003
price_median:         0.772 -0.004
black_proportion:     0.919 -0.001
guest_male:           0.408 -0.010
tract_listings:       0.848  0.045
pr_filled:            0.899 -0.000
baltimore:            0.906 -0.001
dallas:               0.311 -0.006
los_angeles:          0.743  0.004
sl:                   0.382 -0.003
dc:                   0.505  0.007


ten_reviews is correlated at a 5% significance level. However, the coefficient is "only" 0.026 (both variables are binary). Therefore the effect is not too large. This is not good, but does not greatly damage the interpretation and validity of the whole paper.

### Regress treatment variable on all controls ###

In [8]:
formula = 'guest_black ~ '
for control in used_controls:
    
    formula += str(control + ' + ')
# exclude one arbitrarily chosen city (here dc), otherwise dummy variable trap/fallacy
# this is not the case with host female/male as some could not be categorized
formula = formula[:-8] 

model = smf.ols(formula, data=df)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,guest_black,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.559
Date:,"Tue, 18 Aug 2020",Prob (F-statistic):,0.923
Time:,11:58:45,Log-Likelihood:,-4520.5
No. Observations:,6235,AIC:,9077.0
Df Residuals:,6217,BIC:,9198.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5556,0.099,5.617,0.000,0.362,0.750
host_gender_M,-0.0133,0.017,-0.801,0.423,-0.046,0.019
host_race_black,-0.0011,0.026,-0.040,0.968,-0.052,0.050
multiple_listings,0.0037,0.015,0.257,0.797,-0.025,0.032
shared_property,-0.0148,0.022,-0.683,0.495,-0.057,0.028
log_price,0.0014,0.015,0.092,0.927,-0.029,0.032
host_gender_F,-0.0169,0.016,-1.087,0.277,-0.047,0.014
ten_reviews,0.0304,0.015,2.044,0.041,0.001,0.060
young,0.0059,0.013,0.438,0.661,-0.020,0.032

0,1,2,3
Omnibus:,21655.531,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1032.828
Skew:,0.004,Prob(JB):,5.2999999999999995e-225
Kurtosis:,1.006,Cond. No.,269.0


Only ten_reviews is significant, mirroring previous findings. Prob F-Stat very high.

In [9]:
columns = ['variable', 'coef1', 'p-value1', 'coef2', 'p-value2']
randomization_df = pd.DataFrame(columns=columns)
randomization_df['variable'] = list(used_control_models.keys())

for index, row in randomization_df.iterrows():
    var = row['variable']
    randomization_df.loc[index,'coef1'] = used_control_models[var].params[1]
    randomization_df.loc[index,'p-value1'] = used_control_models[var].pvalues[1]
    try:
        randomization_df.loc[index,'coef2'] = result.params[var]
        randomization_df.loc[index,'p-value2'] = result.pvalues[var]
    except:
        randomization_df.loc[index,'coef2'] = np.nan
        randomization_df.loc[index,'p-value2'] = np.nan

pd.set_option('display.float_format', '{:0.3f}'.format)
randomization_df

Unnamed: 0,variable,coef1,p-value1,coef2,p-value2
0,host_gender_M,0.001,0.896,-0.013,0.423
1,host_race_black,-0.0,0.972,-0.001,0.968
2,multiple_listings,0.009,0.451,0.004,0.797
3,shared_property,0.001,0.929,-0.015,0.495
4,log_price,0.005,0.792,0.001,0.927
5,host_gender_F,-0.009,0.439,-0.017,0.277
6,ten_reviews,0.026,0.041,0.03,0.041
7,young,0.003,0.799,0.006,0.661
8,price_median,-0.004,0.772,-0.011,0.584
9,black_proportion,-0.001,0.919,-0.003,0.94


## Fisher Exact Test ##

In [10]:
class FisherExactTest:
    '''Performs Fisher Exact Test Simulation
    '''
    def __init__(self):
        self.rng = np.random.default_rng()
        pass
    
    def calc_teststat(self, treatment):
        '''Calculate sum of outputs by group.

        Given the (altered) treament array whether individual is in the treatment group or not
        and given the original output array, calculate the sum of outputs for each group.

        Arguments:
            treatment(np.array): Array of the treatment variable

        Returns:
            Fisher Test Stat(int): Value of the Fisher Test Statistics given the initial output and
                the (altered) treatment population.
        '''
        treatment_output  = sum(np.multiply(treatment, self.output))
        notreatment_output = sum(np.multiply(np.logical_not(treatment), self.output))
        return treatment_output/self.n_treatment - notreatment_output/self.n_notreatment    

    def initial_arrays(self, treatment, output):
        '''Sets up original treament and output array and calculates teststat.
        
        Arguments:
            treatment(pd.Series): Column of the treatment variable
            
            output(pd.Series): Column of the output variable
            
        Returns:
            initial_teststat(int): Value of the Fisher Test Statistics given the initial output and
                the initial treatment population.
        '''
        self.initial_treatment = np.array(treatment)
        self.n_treatment = treatment.value_counts()[True] # Done here as it doesn't change and fastens iteration
        self.n_notreatment = treatment.value_counts()[False]
        self.output = np.array(output)
        
        self.initial_teststat = self.calc_teststat(self.initial_treatment)
        return self.initial_teststat
    
    def simulation(self, m=1000):
        '''Iterates over simulated permutations and returns how many were larger than initial one. '''
        count_le = 0 # Used for counting whether alternative test statistic is larger than orginal test statistic
        
        start_time = time.perf_counter()
        for i in range(0, m):
            alt_perm = self.rng.permutation(self.initial_treatment) # Creates random alternative permutation
            alt_t = self.calc_teststat(alt_perm) # Fisher test for alternative permutation
            if alt_t >= self.initial_teststat: 
                count_le += 1
        total_time = time.perf_counter() - start_time
        
        print('For m={}, {:.2f} minutes were needed.'.format(m, total_time/60))
        print('Number of simulated test statistics larger or equal to original test statistic:\t', count_le)     
        return count_le


bw = df.guest_black.apply(lambda x: True if x==0 else False if x==1 else np.NaN) # True if guest is white
df_fisher = pd.DataFrame({'white': bw, 'yes':df['yes']})
df_fisher.dropna(inplace=True)

FET = FisherExactTest()
og_test = FET.initial_arrays(df_fisher.white, df_fisher.yes)
FET.simulation(10000)

For m=10000, 0.13 minutes were needed.
Number of simulated test statistics larger or equal to original test statistic:	 0


0

## Additional analysis - Discrimination against male ##

### Replication of tables 3, 5, 6, adding guest_male ###

In [55]:
df['shared_guest_male'] = df['shared_property'] * df['guest_male']
df['multiple_male'] = df['multiple_listings'] * df['guest_male']
df['ten_reviews_male'] = df['ten_reviews'] * df['guest_male']
df['young_male'] = df['young'] * df['guest_male']
df['any_black_gm'] = df['any_black'] * df['guest_male']

df_model = df[['yes','guest_black', 'guest_male', 'name_by_city', 'shared_property', 'shared_guest_male', 
               'multiple_listings', 'multiple_male', 'ten_reviews', 'ten_reviews_male',
              'young', 'young_male', 'any_black', 'any_black_gm']].dropna()


model = smf.ols('yes ~ guest_black + guest_male + shared_property + shared_guest_male', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + multiple_listings + multiple_male', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + ten_reviews + ten_reviews_male', data=df_model)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + young + young_male', data=df_model)
result4 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + any_black + any_black_gm', data=df_model)
result5 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

tble5 = summary_col([result1, result2, result3, result4, result5], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black', 'guest_male', 'shared_property', 'shared_guest_male', 
                        'multiple_listings', 'multiple_male', 'ten_reviews', 'ten_reviews_male',
                        'young', 'young_male', 'any_male', 'any_black', 'any_black_gm'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj),
                             'Implied coefficient':lambda x: "{0:.2f}".format(float(x.params[2]+x.params[4]))})
tble5

0,1,2,3,4,5
,yes I,yes II,yes III,yes IIII,yes IIIII
guest_black,-0.08***,-0.08***,-0.08***,-0.08***,-0.08***
,(0.02),(0.02),(0.02),(0.02),(0.02)
guest_male,-0.02,-0.05***,-0.04**,-0.03*,-0.04**
,(0.02),(0.02),(0.02),(0.02),(0.02)
shared_property,0.02,,,,
,(0.02),,,,
shared_guest_male,-0.06**,,,,
,(0.02),,,,
multiple_listings,,0.09***,,,


In [44]:
model = smf.ols('yes ~ guest_black + guest_male', data=df_model)
result_all = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model_male = df_model[df_model['shared_property'] == 1]
model = smf.ols('yes ~ guest_black + guest_male', data=df_model_male)
result_male = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_male['name_by_city']})

df_model_female = df_model[df_model['shared_property'] == 0]
model = smf.ols('yes ~ guest_black + guest_male', data=df_model_female)
result_female = model.fit(cov_type='cluster', cov_kwds={'groups': df_model_female['name_by_city']})


tble = summary_col([result_all, result_male, result_female], stars=True, float_format='%.2f', regressor_order=['guest_black', 'guest_male', 'Intercept'], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R²_adj':lambda x: "{0:.3f}".format(float(x.rsquared_adj))})
tble

0,1,2,3
,yes I,yes II,yes III
guest_black,-0.08***,-0.09***,-0.07***
,(0.02),(0.02),(0.02)
guest_male,-0.05***,-0.08***,-0.02
,(0.02),(0.02),(0.02)
Intercept,0.51***,0.53***,0.50***
,(0.01),(0.02),(0.01)
R-squared,0.01,0.01,0.00
,0.01,0.01,0.00
N,6235,2974,3261


In [57]:
df['guest_black_male'] = df['guest_black'] * df['guest_male']
df['guest_black_male_shared'] = df['guest_black'] * df['guest_male'] * df['shared_property']

df_model = df[['yes','guest_black', 'guest_male', 'name_by_city', 'shared_property', 'shared_guest_male', 
               'guest_black_male', 'shared_guest_black', 'guest_black_male_shared']].dropna()

model = smf.ols('yes ~ guest_black + guest_male', data=df_model)
result0 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + guest_black_male', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + guest_male + guest_black_male + shared_property + shared_guest_male + shared_guest_black + guest_black_male_shared', data=df_model)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})


tble5 = summary_col([result0, result1, result3], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black', 'guest_male', 'guest_black_male', 'shared_property', 'shared_guest_male', 'shared_guest_black',
                        'guest_black_male_shared'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj),
                             'Implied coefficient':lambda x: "{0:.2f}".format(float(x.params[1]+x.params[1]))})
tble6 = summary_col([result_all, result_male, result_female, result0, result1, result3], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black', 'guest_male', 'guest_black_male', 'shared_property', 'shared_guest_male', 'shared_guest_black',
                        'guest_black_male_shared'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj)})
tble6

0,1,2,3,4,5,6
,yes I,yes II,yes III,yes IIII,yes IIIII,yes IIIIII
guest_black,-0.08***,-0.09***,-0.07***,-0.08***,-0.08***,-0.08***
,(0.02),(0.02),(0.02),(0.02),(0.02),(0.03)
guest_male,-0.05***,-0.08***,-0.02,-0.05***,-0.05**,-0.03
,(0.02),(0.02),(0.02),(0.02),(0.02),(0.02)
guest_black_male,,,,,0.01,0.02
,,,,,(0.03),(0.04)
shared_property,,,,,,0.03
,,,,,,(0.02)
shared_guest_male,,,,,,-0.05*


In [53]:
df_model = df[['yes','guest_black','name_by_city', 'host_gender_M', 'host_race_black', 'shared_property']].dropna()
df_model3 = df[['yes','guest_black','name_by_city', 'host_gender_M', 'host_race_black', 'multiple_listings', 'shared_property', 'ten_reviews', 'log_price']].dropna()
df_model = df_model[df_model['shared_property'] == 0]
df_model3 = df_model3[df_model3['shared_property'] == 0]



model = smf.ols('yes ~ guest_black', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + host_race_black + host_gender_M', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

model = smf.ols('yes ~ guest_black + host_race_black + host_gender_M + multiple_listings + ten_reviews + log_price', data=df_model3)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model3['name_by_city']})


tble = summary_col([result1, result2, result3], stars=True, float_format='%.2f', regressor_order=['guest_black', 'host_race_black', 'host_gender_M', 'multiple_listings', 'shared_property', 'ten_reviews', 'log_price', 'Intercept'], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R²_adj':lambda x: "{0:.3f}".format(float(x.rsquared_adj))})
tble

0,1,2,3
,yes I,yes II,yes III
guest_black,-0.07***,-0.07***,-0.08***
,(0.02),(0.02),(0.02)
host_race_black,,0.13***,0.14***
,,(0.04),(0.04)
host_gender_M,,-0.03*,-0.03*
,,(0.02),(0.02)
multiple_listings,,,0.06***
,,,(0.02)
ten_reviews,,,0.10***


# Probably discard # 

In [21]:
df['guest_male_price_median'] = df['guest_male'] * df['price_median']
df['guest_male_pop_black'] = df['guest_male'] * df['black_proportion']
df['guest_male_tract_listings'] = df['guest_male'] * df['tract_listings']
df['guest_male_pr_filled'] = df['guest_male'] * df['pr_filled']

df_model = df[['yes', 'name_by_city', 'guest_black', 'guest_male','price_median', 'guest_male_price_median']].dropna()
model = smf.ols('yes ~ guest_black + guest_male + price_median + guest_male_price_median', data=df_model)
result1 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black', 'guest_male','black_proportion', 'guest_male_pop_black']].dropna()
model = smf.ols('yes ~ guest_black + guest_male + black_proportion + guest_male_pop_black', data=df_model)
result2 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black', 'guest_male','tract_listings', 'guest_male_tract_listings']].dropna()
model = smf.ols('yes ~ guest_black + guest_male + tract_listings + guest_male_tract_listings', data=df_model)
result3 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

df_model = df[['yes', 'name_by_city', 'guest_black', 'guest_male','pr_filled', 'guest_male_pr_filled']].dropna()
model = smf.ols('yes ~ guest_black + guest_male + pr_filled + guest_male_pr_filled', data=df_model)
result4 = model.fit(cov_type='cluster', cov_kwds={'groups': df_model['name_by_city']})

tble6 = summary_col([result1, result2, result3, result4], stars=True, float_format='%.2f', 
                   regressor_order=['guest_black', 'guest_male','price_median', 'guest_male_price_median', 'black_proportion', 
                        'guest_male_pop_black', 'tract_listings', 'guest_male_tract_listings', 'pr_filled',
                        'guest_male_pr_filled'],  
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2_adj':lambda x: "{:.3f}".format(x.rsquared_adj)})
tble6

0,1,2,3,4
,yes I,yes II,yes III,yes IIII
guest_black,-0.08***,-0.08***,-0.08***,-0.08***
,(0.02),(0.02),(0.02),(0.02)
guest_male,-0.06***,-0.04**,-0.04*,-0.05
,(0.02),(0.02),(0.02),(0.05)
price_median,-0.07***,,,
,(0.02),,,
guest_male_price_median,0.03,,,
,(0.03),,,
black_proportion,,0.09*,,


In [16]:
# Regress each control on treatment variable
used_controls = ['host_gender_M', 'host_race_black', 'multiple_listings', 'shared_property', 'log_price', 
                     'host_race_black', 'host_gender_M', 'host_gender_F', 'shared_property', 'multiple_listings', 
                     'ten_reviews', 'young', 'price_median', 'black_proportion', 'guest_black',
                     'tract_listings', 'pr_filled', 'baltimore', 'dallas', 'los_angeles', 'sl', 'dc']
used_control_models = {}

print('{0:20} {1:>1}'.format('Control Variable', 'P-Value'))
for control in used_controls:
    model = smf.ols(control + ' ~ guest_male', data=df)
    result = model.fit()
    used_control_models[control] = result
    print('{0:20} {1:>6.3f}'.format(control + ':', result.pvalues[1]))

Control Variable     P-Value
host_gender_M:        0.824
host_race_black:      0.843
multiple_listings:    0.504
shared_property:      0.918
log_price:            0.948
host_race_black:      0.843
host_gender_M:        0.824
host_gender_F:        0.538
shared_property:      0.918
multiple_listings:    0.504
ten_reviews:          0.169
young:                0.980
price_median:         0.484
black_proportion:     0.407
guest_black:          0.408
tract_listings:       0.371
pr_filled:            0.884
baltimore:            0.556
dallas:               0.268
los_angeles:          0.538
sl:                   0.360
dc:                   0.475


In [22]:
male_array = df.guest_male.apply(lambda x: True if x==0 else False if x==1 else np.NaN) # True if guest is female
df_fisher = pd.DataFrame({'male': male_array, 'yes':df['yes']})
df_fisher.dropna(inplace=True)

FET = FisherExactTest()
og_test = FET.initial_arrays(df_fisher.male, df_fisher.yes)
FET.simulation(10000)

For m=10000, 0.15 minutes were needed.
Number of simulated test statistics larger or equal to original test statistic:	 0


0