#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 23
**CH23B Immunization against measles and saving children**

using the worldbank-immunization dataset

version 1.0 2021-05-05

In [22]:
import os
import sys
import warnings
from datetime import datetime as dt

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from IPython.core.display import HTML
from plotnine import *
from stargazer.stargazer import Stargazer
import pyfixest as pf

warnings.filterwarnings("ignore")


In [23]:
# Locate the repository root: everything in the working-directory path that
# precedes the "da_case_studies" folder
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input / output locations for this case study
data_in = f"{dirname}da_data_repo/worldbank-immunization/clean/"
data_out = f"{dirname}da_case_studies/ch23-immunization-life/"
output = f"{data_out}output/"

# Folder appended to the import path (the helper module is imported from it below)
func = f"{dirname}da_case_studies/ch00-tech-prep/"
sys.path.append(func)


In [24]:
# Import the prewritten helper functions
from py_helper_functions import *


In [25]:
# Import data
# Read the continent-level series from the case study's input folder (data_in)
# rather than from a machine-specific absolute path, so the notebook runs on
# any checkout of the data repository.
data = pd.read_csv(data_in + "worldbank-immunization-continents.csv")

In [None]:
# Text labels for the two highlighted series; their positions are read from
# the rows labelled 12 and 16 of `data`, with small manual offsets.
south_asia_label = annotate(
    geom="text",
    x=data.loc[12, "year"] - 2,
    y=data.loc[12, "imm_SAS"],
    label="South Asia",
    size=10,
)
sub_saharan_label = annotate(
    geom="text",
    x=data.loc[16, "year"],
    y=data.loc[16, "imm_SSF"] - 2,
    label="sub-Saharan Africa",
    size=10,
)

# Highlighted lines first, then labels, axis titles, scales and theme
p1 = (
    ggplot(data, aes(x="year", y="imm_SAS"))
    + geom_line(color=color[0], size=1)
    + geom_line(aes(x="year", y="imm_SSF"), color=color[1], size=1)
    + south_asia_label
    + sub_saharan_label
    + labs(y="Immunization rate (percent)", x="Date (year)")
    + scale_y_continuous(expand=(0, 0), breaks=seq(50, 100, 10), limits=(50, 100))
    + scale_x_continuous(
        expand=(0, 0), breaks=seq(1998, 2018, 5), limits=(1998, 2018)
    )
    + theme_bw()
)

# Add one grey line for each of the columns at positions 1..5
for region_col in data.columns[1:6]:
    p1 += geom_line(aes(x="year", y=region_col), color="grey", size=0.7)

p1


In [None]:
# Divide every column from position 8 onward by 10 (presumably the surv_*
# series plotted as percentages below -- verify against the data file).
# NOTE: this overwrites `data`, so running the cell twice rescales twice.
rescaled_cols = data.columns[8:]
data[rescaled_cols] = data[rescaled_cols] / 10


In [None]:
# Text labels for the two highlighted series; their positions are read from
# the rows labelled 12 and 16 of `data`, with small manual offsets.
south_asia_surv_label = annotate(
    geom="text",
    x=data.loc[12, "year"] - 4,
    y=data.loc[12, "surv_SAS"],
    label="South Asia",
    size=10,
)
sub_saharan_surv_label = annotate(
    geom="text",
    x=data.loc[16, "year"],
    y=data.loc[16, "surv_SSF"] - 2,
    label="sub-Saharan Africa",
    size=10,
)

# Highlighted lines first, then labels, axis titles, scales and theme
p2 = (
    ggplot(data, aes(x="year", y="surv_SAS"))
    + geom_line(color=color[0], size=1)
    + geom_line(aes(x="year", y="surv_SSF"), color=color[1], size=1)
    + south_asia_surv_label
    + sub_saharan_surv_label
    + labs(y="Child survival rate (percent)", x="Date (year)")
    + scale_y_continuous(expand=(0, 0), breaks=seq(80, 100, 5), limits=(80, 100))
    + scale_x_continuous(
        expand=(0, 0), breaks=range(1998, 2019, 5), limits=(1998, 2018)
    )
    + theme_bw()
)

# Add one grey line for each of the columns at positions 8..12
for region_col in data.columns[8:13]:
    p2 += geom_line(aes(x="year", y=region_col), color="grey", size=0.7)

p2


# Regressions on countries


In [26]:
# Read the country-year panel from the case study's input folder (data_in)
# rather than from a machine-specific absolute path, then show summary stats.
data_panel = pd.read_csv(data_in + "worldbank-immunization-panel.csv")
data_panel.describe()


Unnamed: 0,year,pop,mort,surv,imm,gdppc,lngdppc,hexp
count,3807.0,3807.0,3807.0,3807.0,3807.0,3642.0,3642.0,3165.0
mean,2007.526924,35.067473,42.982847,95.701715,85.298135,16337.051802,9.042446,6.274914
std,5.759522,132.455594,45.517445,4.551744,15.738758,19070.3252,1.234401,2.878283
min,1998.0,0.009332,2.1,75.72,8.0,275.518093,5.618653,1.024978
25%,2003.0,1.756771,9.3,93.425,79.0,3114.288251,8.043756,4.321824
50%,2008.0,7.395599,23.3,97.67,92.0,9386.069679,9.146982,5.766445
75%,2013.0,23.576856,65.75,99.07,96.0,22449.101714,10.019005,7.916312
max,2017.0,1386.395,242.8,99.79,99.0,124024.568165,11.728235,27.417822


In [27]:
# Drop every row in which the immunization rate or GDP per capita is missing
required_cols = ["imm", "gdppc"]
data_panel = data_panel.dropna(axis="index", how="any", subset=required_cols)


In [28]:
# Flag countries that form a balanced panel: exactly one row for each of the
# 20 years from 1998 to 2017.
# - Aggregations are given as strings ("min", "max", ...): passing the Python
#   builtins to groupby.agg is deprecated in recent pandas.
# - Both the row count and the number of distinct years must equal 20, so a
#   duplicated country-year row cannot hide a missing year.
year_coverage = data_panel.groupby("c").agg(
    min_year=("year", "min"),
    max_year=("year", "max"),
    n_years=("year", "count"),
    n_unique_years=("year", "nunique"),
)
balanced_countries = year_coverage.query(
    "(min_year == 1998)&(max_year == 2017)&(n_years == 20)&(n_unique_years == 20)"
).index

data_panel["balanced"] = data_panel["c"].isin(balanced_countries)


In [29]:
# Keep only the balanced-panel countries. Take an explicit copy: new columns
# are assigned to this frame afterwards, and assigning to a filtered slice of
# data_panel is chained assignment (SettingWithCopyWarning).
data_balanced = data_panel.query("balanced == True").copy()


In [30]:
# Sort by country and year BEFORE differencing: "diff" subtracts the previous
# row within each country, so the rows must already be in year order.
data_balanced = data_balanced.sort_values(by=["c", "year"])

# Log population is an element-wise transform; no grouping is needed
data_balanced["lnpop"] = np.log(data_balanced["pop"])

# Group after lnpop exists, so every grouped column is present in the frame
countries_grouped = data_balanced.groupby("c")

# Within-country first differences (the first year of each country is NaN)
data_balanced["d_surv"] = countries_grouped["surv"].transform("diff")
data_balanced["d_imm"] = countries_grouped["imm"].transform("diff")
# Second difference of imm: regroup so the freshly added d_imm column is used
data_balanced["d2_imm"] = data_balanced.groupby("c")["d_imm"].transform("diff")
data_balanced["d_lngdppc"] = countries_grouped["lngdppc"].transform("diff")
data_balanced["d_lnpop"] = countries_grouped["lnpop"].transform("diff")

# Country-level mean population, repeated on every row of the country
data_balanced["avgpop"] = countries_grouped["pop"].transform("mean")


## Fixed Effect

In [31]:
import pyfixest as pf

In [32]:
# Add a categorical copy of the year (used as the year fixed effect) and move
# country and year into the index. After this, `year` is an index level and
# `Year` remains a regular column.
year_as_category = data_balanced["year"].astype("category")
data_balanced = data_balanced.assign(Year=year_as_category).set_index(["c", "year"])


In [33]:
# Fixed-effects regression of survival on immunization with year (Year) and
# country (c) fixed effects, weighted by avgpop, SEs clustered by country.
fe_lm_data = data_balanced.reset_index()  # bring c and year back as columns
fe_lm = pf.feols(
    fml="surv ~ imm | Year + c",
    data=fe_lm_data,
    weights="avgpop",
    vcov={"CRV1": "c"},
)

pf.etable(fe_lm)

coef,coef.1
imm,0.077*** (0.010)
fe,fe
Year,x
c,x
modelstats,modelstats
Observations,3440
S.E. type,by: c
R2,-
,surv
,(1)


In [34]:
# R2 measures are currently not supported by pyfixest for weighted (WLS) models,
# so the within R2 of fe_lm is not displayed:
# fe_lm._r2_within

In [35]:
# Same fixed-effects setup (Year and c), adding lngdppc and lnpop as
# regressors; weighted by avgpop, SEs clustered by country.
fe_lm2_data = data_balanced.reset_index()  # bring c and year back as columns
fe_lm2 = pf.feols(
    fml="surv ~ imm + lngdppc + lnpop | Year + c",
    data=fe_lm2_data,
    weights="avgpop",
    vcov={"CRV1": "c"},
)

pf.etable([fe_lm, fe_lm2])

coef,coef.1,coef.2
imm,0.077*** (0.010),0.038*** (0.011)
lngdppc,,1.593*** (0.398)
lnpop,,12.049*** (1.643)
fe,fe,fe
Year,x,x
c,x,x
modelstats,modelstats,modelstats
Observations,3440,3440
S.E. type,by: c,by: c
R2,-,-


In [36]:
# within R2
# (displays nan here: fe_lm2 is estimated with weights="avgpop")
fe_lm2._r2_within


nan

In [37]:
# no weights, not in book
# Same formula and clustering as fe_lm2, but estimated without weights.
fe_lm2_nowts_data = data_balanced.reset_index()  # bring c and year back as columns
fe_lm2_nowts = pf.feols(
    fml="surv ~ imm + lngdppc + lnpop | Year + c",
    data=fe_lm2_nowts_data,
    vcov={"CRV1": "c"},
)

pf.etable([fe_lm, fe_lm2, fe_lm2_nowts])

coef,coef.1,coef.2,coef.3
imm,0.077*** (0.010),0.038*** (0.011),0.038*** (0.009)
lngdppc,,1.593*** (0.398),2.579*** (0.445)
lnpop,,12.049*** (1.643),7.717*** (1.474)
fe,fe,fe,fe
Year,x,x,x
c,x,x,x
modelstats,modelstats,modelstats,modelstats
Observations,3440,3440,3440
S.E. type,by: c,by: c,by: c
R2,-,-,0.955


In [38]:
# Within R2 of the unweighted model, then of the weighted model
# (the weighted model prints nan)
for fitted_model in (fe_lm2_nowts, fe_lm2):
    print(fitted_model._r2_within)


0.35603582860970395
nan
0.35603582860970395
nan


In [39]:
# CLUSTER SE VS BIASED SE
# Same weighted specification as fe_lm2, but with vcov="hetero" instead of
# standard errors clustered by country.
fe_lm3_data = data_balanced.reset_index()  # bring c and year back as columns
fe_lm3 = pf.feols(
    fml="surv ~ imm + lngdppc + lnpop | Year + c",
    data=fe_lm3_data,
    weights="avgpop",
    vcov="hetero",
)

In [40]:
# Side-by-side table: the two models share the formula and weights and differ
# only in the standard-error type (clustered by c vs. "hetero").
pf.etable([fe_lm2, fe_lm3])

coef,coef.1,coef.2
imm,0.038*** (0.011),0.038*** (0.004)
lngdppc,1.593*** (0.398),1.593*** (0.119)
lnpop,12.049*** (1.643),12.049*** (0.501)
fe,fe,fe
Year,x,x
c,x,x
modelstats,modelstats,modelstats
Observations,3440,3440
S.E. type,by: c,hetero
R2,-,-
