#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### CHAPTER 21
**CH20A Founder/family ownership and quality of management**

using the wms-management dataset

version 1.0 2021-05-05

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import os
import sys
from sklearn.neighbors import NearestNeighbors

from stargazer.stargazer import Stargazer
from IPython.core.display import HTML
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Current script folder
current_path = os.getcwd()
dirname = "/".join(current_path.split("/")[:-2]) + "/"

# location folders
data_in = dirname + "da_data_repo/wms-management-survey/clean/"
data_out = dirname + "da_case_studies/ch21-ownership-management-quality/"
output = dirname + "da_case_studies/ch21-ownership-management-quality/output/"

func = dirname + "da_case_studies/ch00-tech-prep/"
sys.path.append(func)

In [3]:
# Import the prewritten helper functions 
from py_helper_functions import *

### Read in data

In [4]:
data = pd.read_csv(data_out+"wms_da_textbook-work.csv")

In [5]:
data.groupby("foundfam_owned").mean()["management"]

foundfam_owned
0    3.047261
1    2.681602
Name: management, dtype: float64

### Set variables to use

In [6]:
y_var = "management"
x_var = "foundfam_owned"

control_vars = ["degree_nm", "degree_nm_sq", "compet_moder", "compet_strong", 
                  "lnemp", "age_young", "age_old", "age_unknown"]

control_vars_to_interact = ["industry", "countrycode"]

## REGRESSIONS

In [7]:
# OLS with no control vars.
formula1 = y_var + " ~ " + x_var
ols1 = smf.ols(formula=formula1, data=data).fit()

In [8]:
# OLS with all control vars
formula2 = (
    y_var + " ~ " + x_var + " + " + " + ".join(control_vars + control_vars_to_interact)
)
ols2 = smf.ols(formula=formula2, data=data).fit()

In [9]:
# OLS with all controls + interactions
formula3 = (
    y_var
    + " ~ "
    + x_var
    + "+"
    + ":".join(control_vars_to_interact)
    + "+("
    + "+".join(control_vars)
    + ")*("
    + "+".join(control_vars_to_interact)
    + ")"
)
ols3 = smf.ols(formula=formula3,data=data).fit()

In [10]:
stargazer = Stargazer([ols1, ols2, ols3])
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer.covariate_order([x_var,"Intercept"])
stargazer.significant_digits(2)
stargazer.custom_columns(
    ["No confounders", "With confounders", "With confounders interacted"],
    [1, 1, 1],
)
stargazer.show_model_numbers(False)
HTML(stargazer.render_html())

0,1,2,3
,,,
,Dependent variable:management,Dependent variable:management,Dependent variable:management
,,,
,No confounders,With confounders,With confounders interacted
,,,
foundfam_owned,-0.37***,-0.19***,-0.19***
,(0.01),(0.01),(0.01)
Constant,3.05***,1.69***,1.06***
,(0.01),(0.06),(0.33)
Observations,8439,8439,8439


In [11]:
data["management"].describe()

count    8439.000000
mean        2.881352
std         0.636060
min         1.055556
25%         2.444444
50%         2.888889
75%         3.333333
max         4.888889
Name: management, dtype: float64

In [12]:
data["empbin5"] = pd.cut(
    data["emp_firm"],
    bins=data["emp_firm"].quantile(np.arange(0, 1.01, 0.2)),
    include_lowest=True,
    right=False,
)
data["agecat"] = (
    (data["age_young"] == True)
    + 2 * (data["age_mid"] == True)
    + 3 * (data["age_old"] == True)
    + 4 * (data["age_unknown"] == True)
)

In [13]:
def functions_for_groupby(x):
    d = {}
    d["n"] = x["foundfam_owned"].count()
    d["n0"] = sum(1 - x["foundfam_owned"])
    d["n1"] = sum(x["foundfam_owned"])
    d["y0"] = sum(x["management"] * (x["foundfam_owned"] == 0))
    d["y1"] = sum(x["management"] * (x["foundfam_owned"] == 1))
    return pd.Series(d, index=["n", "n0", "n1", "y0", "y1"])

In [14]:
data_agg = (
    data.groupby(
        [
            "degree_nm_bins",
            "agecat",
            "competition",
            "empbin5",
            "industry",
            "countrycode",
        ]
    )
    .apply(functions_for_groupby)
    .reset_index()
)

data_agg = data_agg.loc[data_agg["n"].notnull()]
data_agg["y0"] = data_agg["y0"] / data_agg["n0"]
data_agg["y1"] = data_agg["y1"] / data_agg["n1"]

In [15]:
# firms with/without exact match
pd.Series(
    {
        "n0==0": data_agg.loc[lambda x: x["n0"] == 0].shape[0],
        "n1==0": data_agg.loc[lambda x: x["n1"] == 0].shape[0],
        "n0!=0 & n1!=0 ": data_agg.loc[lambda x: (x["n0"] != 0) & (x["n1"] != 0)].shape[
            0
        ],
    }
)

n0==0             2886
n1==0             3605
n0!=0 & n1!=0      462
dtype: int64

In [16]:
# random order just for the examples

In [17]:
np.random.seed(123)
data_sample = data_agg.sample(n=340)[
    [
        "industry",
        "countrycode",
        "degree_nm_bins",
        "competition",
        "agecat",
        "empbin5",
        "n1",
        "n0",
        "n",
    ]
]

In [18]:
# examples with founder/family only
data_sample.loc[lambda x: (x["n1"] == 1) & (x["n0"] == 0)].head(8)

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n
14092,ind_machinery,cl,"[0,0.001)",1-4 competitors,2,"[760.0, 5000.0)",1.0,0.0,1.0
121450,apparel,gr,"[0.2,1.01)",10+ competitors,1,"[348.8, 760.0)",1.0,0.0,1.0
120175,ind_machinery,fr,"[0.2,1.01)",10+ competitors,1,"[50.0, 120.0)",1.0,0.0,1.0
45127,apparel,fr,"[0.001,0.05)",10+ competitors,1,"[760.0, 5000.0)",1.0,0.0,1.0
57168,electronic,ar,"[0.001,0.05)",5-9 competitors,2,"[760.0, 5000.0)",1.0,0.0,1.0
129851,lumber,in,"[0.2,1.01)",10+ competitors,2,"[50.0, 120.0)",1.0,0.0,1.0
118623,electronic,mx,"[0.2,1.01)",1-4 competitors,1,"[200.0, 348.8)",1.0,0.0,1.0
53690,textile,br,"[0.001,0.05)",10+ competitors,2,"[120.0, 200.0)",1.0,0.0,1.0


In [19]:
# examples with other only:
data_sample.loc[lambda x: (x["n1"] == 0) & (x["n0"] == 1)].head(8)

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n
111749,rubber,cn,"[0.05,0.2)",10+ competitors,4,"[200.0, 348.8)",0.0,1.0,1.0
7592,rubber,gb,"[0,0.001)",5-9 competitors,1,"[50.0, 120.0)",0.0,1.0,1.0
88849,electronic,au,"[0.05,0.2)",1-4 competitors,2,"[50.0, 120.0)",0.0,1.0,1.0
25441,apparel,au,"[0,0.001)",10+ competitors,3,"[348.8, 760.0)",0.0,1.0,1.0
25137,ind_machinery,ge,"[0,0.001)",10+ competitors,3,"[200.0, 348.8)",0.0,1.0,1.0
131192,glass,gb,"[0.2,1.01)",10+ competitors,2,"[348.8, 760.0)",0.0,1.0,1.0
79749,electronic,sw,"[0.05,0.2)",1-4 competitors,1,"[120.0, 200.0)",0.0,1.0,1.0
138266,chemical,br,"[0.2,1.01)",1-4 competitors,3,"[348.8, 760.0)",0.0,1.0,1.0


In [20]:
# examples of similar firms unmatched
data_sample.loc[
    lambda x: (x["countrycode"] == "us") & (x["industry"] == "food") & (x["n"] == 1)
]

Unnamed: 0,industry,countrycode,degree_nm_bins,competition,agecat,empbin5,n1,n0,n
74999,food,us,"[0.001,0.05)",5-9 competitors,4,"[120.0, 200.0)",1.0,0.0,1.0


In [21]:
# ATE/ATET by exact matching:
data_agg["y1-y0"] = data_agg["y1"] - data_agg["y0"]
pd.DataFrame.from_dict(
    {
        "ATE": np.average(
            data_agg.loc[lambda x: (x["n0"] > 0) & (x["n1"] > 0), "y1-y0"],
            weights=data_agg.loc[lambda x: (x["n0"] > 0) & (x["n1"] > 0), "n"],
        ),
        "ATET": np.average(
            data_agg.loc[lambda x: (x["n0"] > 0) & (x["n1"] > 0), "y1-y0"],
            weights=data_agg.loc[lambda x: (x["n0"] > 0) & (x["n1"] > 0), "n1"],
        ),
    },
    orient="index",
)

Unnamed: 0,0
ATE,-0.157672
ATET,-0.147158


## Matching on the propensity score 

In [22]:
data_pscore = data[[y_var] + [x_var] + control_vars + control_vars_to_interact].dropna()

In [23]:
formula_pscore1 = x_var + "~" + "+".join(control_vars + control_vars_to_interact)

In [24]:
log_reg_model = smf.logit(formula=formula_pscore1, data=data_pscore)
log_reg = log_reg_model.fit()

Optimization terminated successfully.
         Current function value: 0.593408
         Iterations 6


In [25]:
data_pscore["pscore"] = log_reg.predict()

In [26]:
def get_pscore_matched(data, variable="foundfam_owned"):
    """
    This function performs Single Nearest Neighbor search on the estimated
    propensity score to find matches to the treated sample.

    """

    # get treatment and control group's estimated pscore
    x_1_pscore = data.loc[lambda x: x[variable] == 1, "pscore"].values
    x_0_pscore = data.loc[lambda x: x[variable] == 0, "pscore"].values

    # this is the NN model, n_neighbors set to 1 to find the closest neightbor
    knn = NearestNeighbors(n_neighbors=1, metric="euclidean", n_jobs=1)

    # fit the model on control (x=0)
    knn.fit(x_0_pscore.reshape(-1, 1))

    # find nearest controls (x=0) to the treated group (x=1)
    indices = knn.kneighbors(x_1_pscore.reshape(-1, 1), return_distance=False)

    x_1 = data[lambda x: x[variable] == 1]

    x_0_matched = (
        pd.DataFrame(indices)
        .merge(
            data.loc[lambda x: x[variable] == 0].reset_index(drop=True).reset_index(),
            left_on=0,
            right_on="index",
            how="left",
        )
        .drop(columns=[0, "index"])
    )

    data_matched = pd.concat([x_1, x_0_matched], axis=0)

    return data_matched

In [27]:
data_matched = get_pscore_matched(data_pscore,"foundfam_owned")

reg_match = smf.ols(formula="management~foundfam_owned", data=data_matched).fit()
reg_match.summary()

0,1,2,3
Dep. Variable:,management,R-squared:,0.024
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,192.1
Date:,"Sat, 08 May 2021",Prob (F-statistic):,3.630000000000001e-43
Time:,18:26:00,Log-Likelihood:,-7191.8
No. Observations:,7658,AIC:,14390.0
Df Residuals:,7656,BIC:,14400.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8777,0.010,287.682,0.000,2.858,2.897
foundfam_owned,-0.1961,0.014,-13.861,0.000,-0.224,-0.168

0,1,2,3
Omnibus:,10.209,Durbin-Watson:,1.809
Prob(Omnibus):,0.006,Jarque-Bera (JB):,9.005
Skew:,0.029,Prob(JB):,0.0111
Kurtosis:,2.842,Cond. No.,2.62


In [28]:
formula_pscore2 = (
    x_var
    + " ~ "
    + ":".join(control_vars_to_interact)
    + "+("
    + "+".join(control_vars)
    + ")*("
    + "+".join(control_vars_to_interact)
    + ")"
)

In [29]:
logit_model_2 = smf.logit(formula=formula_pscore2, data=data_pscore)
log_reg2 = logit_model_2.fit(method="ncg")

Optimization terminated successfully.
         Current function value: 0.529673
         Iterations: 24
         Function evaluations: 25
         Gradient evaluations: 25
         Hessian evaluations: 24


In [30]:
data_pscore["pscore"] = log_reg2.predict()

data_matched_2 = get_pscore_matched(data_pscore,"foundfam_owned")

In [31]:
model2 = smf.ols(formula="management~foundfam_owned", data=data_matched_2)
reg_match_2 = model2.fit()
reg_match_2.summary()

0,1,2,3
Dep. Variable:,management,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,221.8
Date:,"Sat, 08 May 2021",Prob (F-statistic):,1.8e-49
Time:,18:26:34,Log-Likelihood:,-7054.7
No. Observations:,7658,AIC:,14110.0
Df Residuals:,7656,BIC:,14130.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8885,0.010,293.983,0.000,2.869,2.908
foundfam_owned,-0.2069,0.014,-14.893,0.000,-0.234,-0.180

0,1,2,3
Omnibus:,26.801,Durbin-Watson:,1.859
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.345
Skew:,0.098,Prob(JB):,5.17e-06
Kurtosis:,2.806,Cond. No.,2.62


In [32]:
stargazer = Stargazer([reg_match, reg_match_2])
stargazer.rename_covariates(
    {"Intercept": "Constant", "foundfam_owned": "ATET estimate"}
)
stargazer.covariate_order([x_var])
stargazer.custom_columns(
    ["All confounders", "All confounders with industry and country"], [1, 1]
)
stargazer.add_line(
    "Number of observations used in logit",
    [
        data_matched.drop("pscore", axis=1).drop_duplicates().shape[0],
        data_matched_2.drop("pscore", axis=1).drop_duplicates().shape[0],
    ],
)
stargazer.significant_digits(2)
stargazer.show_model_numbers(False)
print(
    """
    Note: ATE not calcuclated here.
    
    
    Warning: the standard error estimates are off here because they don't factor 
    in the uncertainty due to the fac that the propensity score is an estimate 
    itself from the same sample.
    
    To get the correct SE estimates you would have to bootstrap the whole 
    procedure of estimating the logit and carrying out matching. We don't 
    do that in this code.
"""
)
stargazer


    Note: ATE not calcuclated here.
    
    
    in the uncertainty due to the fac that the propensity score is an estimate 
    itself from the same sample.
    
    To get the correct SE estimates you would have to bootstrap the whole 
    procedure of estimating the logit and carrying out matching. We don't 
    do that in this code.



0,1,2
,,
,Dependent variable:management,Dependent variable:management
,,
,All confounders,All confounders with industry and country
,,
ATET estimate,-0.20***,-0.21***
,(0.01),(0.01)
Number of observations used in logit,5714,5477
Observations,7658,7658
R2,0.02,0.03
