#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### CHAPTER 21
**CH21A Founder/family ownership and quality of management**

using the wms-management dataset

version 1.0 2021-05-05

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from IPython.core.display import HTML
from sklearn.neighbors import NearestNeighbors
from stargazer.stargazer import Stargazer

warnings.filterwarnings("ignore")


In [2]:
# Repository root: the part of the working directory that precedes the
# "da_case_studies" folder (the whole path if that folder name is absent)
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input data, working data and output locations
data_in = f"{dirname}da_data_repo/wms-management-survey/clean/"
data_out = f"{dirname}da_case_studies/ch21-ownership-management-quality/"
output = f"{data_out}output/"

# Make the shared helper module folder importable
func = f"{dirname}da_case_studies/ch00-tech-prep/"
sys.path.append(func)

In [3]:
# Import the prewritten helper functions 
from py_helper_functions import *

### Read in data

In [4]:
# Prerequisite: run ch21-wms-01-dataprep.ipynb first; this cell reads the
# work file from the case-study folder.
data = pd.read_csv(f"{data_out}wms_da_textbook-work.csv")

In [5]:
# Mean management score by ownership type.
# Select the column *before* aggregating: averaging the whole frame first
# raises a TypeError on pandas >= 2.0 when non-numeric columns are present
# (and wastes work on every other column in older versions).
data.groupby("foundfam_owned")["management"].mean()

foundfam_owned
0.0    3.079595
1.0    2.683222
Name: management, dtype: float64

### Set variables to use

In [6]:
# Outcome and treatment (causal) variable
y_var, x_var = "management", "foundfam_owned"

# Confounders entered directly in the regressions
control_vars = [
    "degree_nm",
    "degree_nm_sq",
    "compet_moder",
    "compet_strong",
    "lnemp",
    "age_young",
    "age_old",
    "age_unknown",
]

# Confounders that are additionally interacted with the ones above
control_vars_to_interact = ["industry", "countrycode"]

## REGRESSIONS

In [7]:
# Simple regression: outcome on the treatment indicator only
formula1 = f"{y_var} ~ {x_var}"
ols1 = smf.ols(data=data, formula=formula1).fit()

In [8]:
# Multiple regression: treatment plus every confounder entered additively
formula2 = f"{y_var} ~ " + " + ".join(
    [x_var, *control_vars, *control_vars_to_interact]
)
ols2 = smf.ols(data=data, formula=formula2).fit()

In [9]:
# Treatment + industry:country interaction + every control crossed with
# industry and country
formula3 = "{y} ~ {x}+{pair}+({controls})*({factors})".format(
    y=y_var,
    x=x_var,
    pair=":".join(control_vars_to_interact),
    controls="+".join(control_vars),
    factors="+".join(control_vars_to_interact),
)
ols3 = smf.ols(data=data, formula=formula3).fit()

In [10]:
# Side-by-side table of the three OLS specifications, showing only the
# treatment coefficient and the constant
stargazer = Stargazer([ols1, ols2, ols3])
stargazer.custom_columns(
    ["No confounders", "With confounders", "With confounders interacted"],
    [1, 1, 1],
)
stargazer.show_model_numbers(False)
stargazer.significant_digits(2)
stargazer.covariate_order([x_var, "Intercept"])
stargazer.rename_covariates({"Intercept": "Constant"})
HTML(stargazer.render_html())

0,1,2,3
,,,
,Dependent variable:management,Dependent variable:management,Dependent variable:management
,,,
,No confounders,With confounders,With confounders interacted
,,,
foundfam_owned,-0.40***,-0.20***,-0.19***
,(0.01),(0.01),(0.01)
Constant,3.08***,1.73***,0.81***
,(0.01),(0.05),(0.29)
Observations,11672,11672,11672


In [11]:
# Distribution of the outcome, for judging the size of the estimated effects
data.loc[:, "management"].describe()

count    11672.000000
mean         2.903924
std          0.657525
min          1.000000
25%          2.444444
50%          2.888889
75%          3.333333
max          4.888889
Name: management, dtype: float64

In [12]:
# Employment quintile bins, left-closed: [q0, q20), [q20, q40), ...
# With right=False pandas leaves the top edge OUT of the last bin, so firms at
# the maximum employment would get NaN and silently drop out of the exact
# matching below (include_lowest does not close the last bin). The last bin is
# therefore made open-ended so that every firm is assigned to a quintile.
emp_edges = (
    data["emp_firm"].quantile(np.arange(0, 1.01, 0.2)).to_numpy(dtype=float, copy=True)
)
emp_edges[-1] = np.inf
data["empbin5"] = pd.cut(
    data["emp_firm"],
    bins=emp_edges,
    include_lowest=True,
    right=False,
)

# Single firm-age category code built from the age dummies:
# 1 = young, 2 = mid, 3 = old, 4 = unknown
data["agecat"] = (
    (data["age_young"] == True)
    + 2 * (data["age_mid"] == True)
    + 3 * (data["age_old"] == True)
    + 4 * (data["age_unknown"] == True)
)

In [13]:
def functions_for_groupby(x):
    """
    Summarise one exact-matching cell.

    Parameters
    ----------
    x : pd.DataFrame
        Rows of one group; must contain the columns "foundfam_owned"
        (0/1 treatment indicator) and "management" (outcome).

    Returns
    -------
    pd.Series
        Indexed by ["n", "n0", "n1", "y0", "y1"]:
        n  - number of non-missing treatment values,
        n0 - number of untreated rows (sum of 1 - treatment),
        n1 - number of treated rows (sum of treatment),
        y0 - SUM of the outcome over untreated rows,
        y1 - SUM of the outcome over treated rows.
        y0 / y1 are sums, not means; the caller divides by n0 / n1.
    """
    treated = x["foundfam_owned"]
    outcome = x["management"]

    # Vectorised sums instead of Python's builtin sum(); skipna=False keeps the
    # builtin's behaviour of propagating missing values into the result.
    d = {
        "n": treated.count(),
        "n0": (1 - treated).sum(skipna=False),
        "n1": treated.sum(skipna=False),
        "y0": (outcome * (treated == 0)).sum(skipna=False),
        "y1": (outcome * (treated == 1)).sum(skipna=False),
    }
    return pd.Series(d, index=["n", "n0", "n1", "y0", "y1"])

In [14]:
# One row per exact-matching cell: every combination of the binned confounders
data_agg = (
    data.groupby(
        [
            "degree_nm_bins",
            "agecat",
            "competition",
            "empbin5",
            "industry",
            "countrycode",
        ]
    )
    .apply(functions_for_groupby)
    .reset_index()
)

# Keep only cells for which a summary was computed; cells with a missing "n"
# are presumably combinations with no firms in the data.
# Take an explicit copy so that the column assignments below write to an
# independent frame instead of a slice of the groupby result (chained
# assignment, otherwise hidden by the global warnings filter).
data_agg = data_agg.loc[data_agg["n"].notnull()].copy()

# Turn outcome sums into cell means; cells with n0 == 0 (or n1 == 0) get NaN
data_agg["y0"] = data_agg["y0"] / data_agg["n0"]
data_agg["y1"] = data_agg["y1"] / data_agg["n1"]

In [15]:
# Number of cells without untreated firms, without treated firms, and with
# both kinds of firms (i.e. cells where an exact match exists)
pd.Series(
    {
        "n0==0": int((data_agg["n0"] == 0).sum()),
        "n1==0": int((data_agg["n1"] == 0).sum()),
        "n0!=0 & n1!=0 ": int(
            ((data_agg["n0"] != 0) & (data_agg["n1"] != 0)).sum()
        ),
    }
)

n0==0             3528
n1==0             4689
n0!=0 & n1!=0      738
dtype: int64

In [16]:
# random order just for the examples

In [17]:
# Fixed seed so that the illustrative random sample of cells is reproducible
np.random.seed(123)
data_sample = data_agg.sample(n=340).loc[
    :,
    [
        "industry",
        "countrycode",
        "degree_nm_bins",
        "competition",
        "agecat",
        "empbin5",
        "n1",
        "n0",
        "n",
    ],
]

In [18]:
# Sampled cells that contain exactly one founder/family-owned firm and no
# other firm
data_sample.loc[data_sample["n1"].eq(1) & data_sample["n0"].eq(0)].head(8)

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n
98720,petrol,gb,"[0.05, 0.2)",1-4 competitors,3,"[1.0, 120.0)",1.0,0.0,1.0
91642,tobacco,gr,"[0.05, 0.2)",10+ competitors,2,"[1.0, 120.0)",1.0,0.0,1.0
36983,apparel,us,"[0.0, 0.001)",5-9 competitors,4,"[200.0, 350.0)",1.0,0.0,1.0
92655,apparel,mx,"[0.05, 0.2)",10+ competitors,2,"[350.0, 800.0)",1.0,0.0,1.0
8302,furniture,tr,"[0.0, 0.001)",5-9 competitors,1,"[200.0, 350.0)",1.0,0.0,1.0
50961,fabricated_metal,ge,"[0.001, 0.05)",1-4 competitors,2,"[120.0, 200.0)",1.0,0.0,1.0
82540,transport,cl,"[0.05, 0.2)",10+ competitors,1,"[120.0, 200.0)",1.0,0.0,1.0
139354,glass,gr,"[0.2, 1.01)",10+ competitors,3,"[1.0, 120.0)",1.0,0.0,1.0


In [19]:
# Sampled cells that contain exactly one other-owned firm and no
# founder/family-owned firm
data_sample.loc[data_sample["n1"].eq(0) & data_sample["n0"].eq(1)].head(8)

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n
52202,printing,br,"[0.001, 0.05)",1-4 competitors,2,"[350.0, 800.0)",0.0,1.0,1.0
101798,chemical,jp,"[0.05, 0.2)",10+ competitors,3,"[200.0, 350.0)",0.0,1.0,1.0
22160,fabricated_metal,gb,"[0.0, 0.001)",1-4 competitors,3,"[120.0, 200.0)",0.0,1.0,1.0
33023,printing,us,"[0.0, 0.001)",1-4 competitors,4,"[350.0, 800.0)",0.0,1.0,1.0
92988,primary_metal,ir,"[0.05, 0.2)",10+ competitors,2,"[350.0, 800.0)",0.0,1.0,1.0
91722,chemical,po,"[0.05, 0.2)",10+ competitors,2,"[120.0, 200.0)",0.0,1.0,1.0
56072,rubber,gb,"[0.001, 0.05)",5-9 competitors,2,"[120.0, 200.0)",0.0,1.0,1.0
118195,food,pt,"[0.2, 1.01)",1-4 competitors,1,"[120.0, 200.0)",0.0,1.0,1.0


In [20]:
# Sampled single-firm cells among US food manufacturers: similar firms that
# nevertheless have no exact match
data_sample.loc[
    data_sample["countrycode"].eq("us")
    & data_sample["industry"].eq("food")
    & data_sample["n"].eq(1)
]

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n


In [21]:
# Exact-matching estimates: average the within-cell difference in mean
# outcomes over cells that contain both treated and untreated firms.
#   ATE  weights cells by their total number of firms (n)
#   ATET weights cells by their number of treated firms (n1)
data_agg["y1-y0"] = data_agg["y1"] - data_agg["y0"]
matched_cells = data_agg.loc[(data_agg["n0"] > 0) & (data_agg["n1"] > 0)]
pd.DataFrame.from_dict(
    {
        "ATE": np.average(matched_cells["y1-y0"], weights=matched_cells["n"]),
        "ATET": np.average(matched_cells["y1-y0"], weights=matched_cells["n1"]),
    },
    orient="index",
)

Unnamed: 0,0
ATE,-0.179515
ATET,-0.16705


## Matching on the propensity score 

In [22]:
# Complete cases on the outcome, the treatment and all confounders
data_pscore = data.loc[
    :, [y_var, x_var, *control_vars, *control_vars_to_interact]
].dropna()

In [23]:
# Propensity score model 1: treatment on all confounders, no interactions
formula_pscore1 = f"{x_var}~" + "+".join(
    [*control_vars, *control_vars_to_interact]
)

In [24]:
# Logit of the treatment indicator on the confounders
log_reg_model = smf.logit(data=data_pscore, formula=formula_pscore1)
log_reg = log_reg_model.fit()

Optimization terminated successfully.
         Current function value: 0.587516
         Iterations 6


In [25]:
# Predicted probability of founder/family ownership from the first logit
# model, stored as the propensity score column used by the matching function
data_pscore["pscore"] = log_reg.predict()

In [26]:
def get_pscore_matched(data, variable="foundfam_owned", pscore_col="pscore"):
    """
    Single nearest-neighbour matching on the estimated propensity score.

    For every treated row (``variable == 1``) the control row
    (``variable == 0``) with the closest propensity score is looked up.
    Matching is with replacement: one control may serve several treated rows.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the treatment indicator ``variable`` and the propensity
        score column ``pscore_col``.
    variable : str, default "foundfam_owned"
        Name of the 0/1 treatment indicator column.
    pscore_col : str, default "pscore"
        Name of the column holding the estimated propensity score.

    Returns
    -------
    pd.DataFrame
        All treated rows (original index, weight 1) stacked on top of the
        matched control rows (fresh 0..k-1 index). The extra column
        "weights" holds, for a control row, the number of treated rows it was
        matched to. Controls that were never matched are not included.

    Raises
    ------
    ValueError
        If there are no treated rows or no control rows to match.
    """
    treated = data.loc[data[variable] == 1]
    controls = data.loc[data[variable] == 0]

    if treated.empty or controls.empty:
        raise ValueError(
            "Matching needs at least one treated and one control observation "
            f"in column '{variable}'."
        )

    # this is the NN model, n_neighbors set to 1 to find the closest neighbor
    knn = NearestNeighbors(n_neighbors=1, metric="euclidean", n_jobs=1)

    # fit the model on the control group (x=0)
    knn.fit(controls[pscore_col].to_numpy().reshape(-1, 1))

    # position (within `controls`) of the nearest control for each treated row
    nearest = knn.kneighbors(
        treated[pscore_col].to_numpy().reshape(-1, 1), return_distance=False
    )

    # each matched control once, together with how often it was used
    control_pos, n_matches = np.unique(nearest, return_counts=True)

    # Positional selection avoids the reset_index()/merge round trip, which
    # would match on the wrong column if `data` already had an "index" column.
    matched_controls = (
        controls.iloc[control_pos].reset_index(drop=True).assign(weights=n_matches)
    )

    data_matched = pd.concat([treated, matched_controls], axis=0)

    # treated rows carry no weight yet: each counts once
    data_matched["weights"] = data_matched["weights"].fillna(1)

    return data_matched

In [27]:
data_matched = get_pscore_matched(data_pscore, "foundfam_owned")

# NOTE: The regression is weighted to account for control observations that
#       were matched to multiple treated observations.
#       These are not the weights one would use to estimate ATE!

model1 = smf.wls(
    formula="management~foundfam_owned",
    weights=data_matched["weights"],
    data=data_matched,
)
reg_match = model1.fit()
reg_match.summary()

0,1,2,3
Dep. Variable:,management,R-squared:,0.028
Model:,WLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,223.3
Date:,"Mon, 03 Oct 2022",Prob (F-statistic):,8.73e-50
Time:,13:45:07,Log-Likelihood:,-7893.0
No. Observations:,7714,AIC:,15790.0
Df Residuals:,7712,BIC:,15800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8971,0.010,286.290,0.000,2.877,2.917
foundfam_owned,-0.2138,0.014,-14.942,0.000,-0.242,-0.186

0,1,2,3
Omnibus:,474.842,Durbin-Watson:,1.657
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1723.122
Skew:,-0.22,Prob(JB):,0.0
Kurtosis:,5.273,Cond. No.,2.62


In [28]:
# Propensity score model 2: industry:country interaction plus every control
# crossed with industry and country
formula_pscore2 = "{x} ~ {pair}+({controls})*({factors})".format(
    x=x_var,
    pair=":".join(control_vars_to_interact),
    controls="+".join(control_vars),
    factors="+".join(control_vars_to_interact),
)

In [29]:
# Interacted logit, fitted with the Newton conjugate-gradient ("ncg") optimiser
logit_model_2 = smf.logit(data=data_pscore, formula=formula_pscore2)
log_reg2 = logit_model_2.fit(method="ncg")

Optimization terminated successfully.
         Current function value: 0.528371
         Iterations: 20
         Function evaluations: 24
         Gradient evaluations: 24
         Hessian evaluations: 20


In [30]:
# Replace the stored propensity score with the prediction of the second
# (interacted) logit model.
# NOTE(review): this overwrites the "pscore" column previously filled from
# `log_reg`; re-running the first matching cell after this one would match on
# the interacted-model score instead.
data_pscore["pscore"] = log_reg2.predict()

# single nearest-neighbour matching on the new propensity score
data_matched_2 = get_pscore_matched(data_pscore, "foundfam_owned")

In [31]:
# NOTE: The regression is weighted to account for control observations that
#       were matched to multiple treated observations.
#       These are not the weights one would use to estimate ATE!

model2 = smf.wls(
    formula="management~foundfam_owned",
    weights=data_matched_2["weights"],
    data=data_matched_2,
)
reg_match_2 = model2.fit()
reg_match_2.summary()

0,1,2,3
Dep. Variable:,management,R-squared:,0.025
Model:,WLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,192.1
Date:,"Mon, 03 Oct 2022",Prob (F-statistic):,3.82e-43
Time:,13:45:19,Log-Likelihood:,-7650.7
No. Observations:,7395,AIC:,15310.0
Df Residuals:,7393,BIC:,15320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8838,0.010,281.856,0.000,2.864,2.904
foundfam_owned,-0.2005,0.014,-13.860,0.000,-0.229,-0.172

0,1,2,3
Omnibus:,1201.586,Durbin-Watson:,1.681
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11460.515
Skew:,-0.487,Prob(JB):,0.0
Kurtosis:,9.02,Cond. No.,2.62


In [32]:
# Summary table of the two propensity-score matching estimates; only the
# treatment coefficient (the ATET estimate) is shown
stargazer = Stargazer([reg_match, reg_match_2])
stargazer.rename_covariates(
    {"Intercept": "Constant", "foundfam_owned": "ATET estimate"}
)
stargazer.covariate_order([x_var])
stargazer.custom_columns(
    ["All confounders", "All confounders with industry and country"], [1, 1]
)
stargazer.significant_digits(2)
stargazer.show_model_numbers(False)
# Caveats printed together with the table (typo "fac" -> "fact" fixed)
print(
    """
    Note: ATE not calculated here.
    Note: the "number of matched observations" calculated by 
          this code varies marginally from the one on p607 in the textbook.

    
    
    Warning: the standard error estimates are off here because they don't factor 
    in the uncertainty due to the fact that the propensity score is an estimate 
    itself from the same sample.
    
    To get the correct SE estimates you would have to bootstrap the whole 
    procedure of estimating the logit and carrying out matching. We don't 
    do that in this code.
"""
)
stargazer


    Note: ATE not calculated here.
    Note: the "number of matched observations" calculated by 
          this code varies marginally from the one on p607 in the textbook.

    
    
    Warning: the standard error estimates are off here because they don't factor 
    in the uncertainty due to the fac that the propensity score is an estimate 
    itself from the same sample.
    
    To get the correct SE estimates you would have to bootstrap the whole 
    procedure of estimating the logit and carrying out matching. We don't 
    do that in this code.



0,1,2
,,
,Dependent variable:management,Dependent variable:management
,,
,All confounders,All confounders with industry and country
,,
ATET estimate,-0.21***,-0.20***
,(0.01),(0.01)
Observations,7714,7395
R2,0.03,0.03
Adjusted R2,0.03,0.03
