#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### CHAPTER 21
**CH21A Founder/family ownership and quality of management**

 using the wms-management dataset
 
 version 1.0 2021-05-05

In [94]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

# Silence every warning for the rest of the session (no category filter is
# given), so library deprecation notices will not be shown either.
warnings.filterwarnings("ignore")


In [95]:
# Locate the repository root: everything in the working directory path that
# precedes the first "da_case_studies" component. If that marker is absent,
# dirname is simply the whole working directory path (no trailing separator).
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input data folder
data_in = dirname + "da_data_repo/wms-management-survey/clean/"

# Case-study folder (work files) and its output sub-folder
case_study_dir = "da_case_studies/ch21-ownership-management-quality/"
data_out = dirname + case_study_dir
output = dirname + case_study_dir + "output/"

# Folder holding the shared helper module; put it on the import path
func = dirname + "da_case_studies/ch00-tech-prep/"
sys.path.append(func)


In [96]:
# Import the prewritten helper functions
from py_helper_functions import *


## PART I

### Data prep

In [97]:
# Read the wms_da_textbook-xsec.csv file from the local data folder.
# Alternative source (download instead of a local copy):
# data = pd.read_csv("https://osf.io/zy9j8/download")
wms_xsec_path = data_in + "wms_da_textbook-xsec.csv"
data = pd.read_csv(wms_xsec_path)


In [98]:
# Ownership: define founder/family owned and drop ownership that's missing or not relevant
# First, tabulate the raw ownership categories


In [99]:
# Frequency table of the ownership categories. dropna=False keeps a separate
# group for observations whose ownership is missing.
ownership_counts = data.groupby("ownership", dropna=False)["firmid"].count()
ownership = ownership_counts.to_frame("Freq")

# Share of each category in percent, and the running (cumulative) share
ownership["Percent"] = 100 * ownership["Freq"] / ownership["Freq"].sum()
ownership["Cum"] = ownership["Percent"].cumsum()
ownership.round(2)


Unnamed: 0_level_0,Freq,Percent,Cum
ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dispersed Shareholders,2745,26.7,26.7
"Family owned, CEO unknown",55,0.53,27.23
"Family owned, external CEO",346,3.37,30.6
"Family owned, family CEO",1755,17.07,47.67
"Founder owned, CEO unknown",41,0.4,48.06
"Founder owned, external CEO",300,2.92,50.98
"Founder owned, founder CEO",1856,18.05,69.03
Government,170,1.65,70.69
Other,527,5.13,75.81
Private Equity/Venture Capital,353,3.43,79.25


In [100]:
# Define foundfam owned


In [101]:
# Ownership categories that count as founder- or family-owned
foundfam_categories = [
    "Family owned, external CEO",
    "Family owned, family CEO",
    "Family owned, CEO unknown",
    "Founder owned, external CEO",
    "Founder owned, CEO unknown",
    "Founder owned, founder CEO",
]

# 1 for founder/family ownership, 0 for any other known ownership,
# NaN where ownership itself is missing.
is_foundfam = data["ownership"].isin(foundfam_categories)
data["foundfam_owned"] = np.where(
    data["ownership"].isnull(), np.nan, is_foundfam.astype(float)
)


In [102]:
# Foundfam owned


In [103]:
# Frequency table of the founder/family indicator. dropna=False keeps a
# separate group for observations where the indicator is missing.
foundfam_counts = data.groupby("foundfam_owned", dropna=False)["firmid"].count()
ownership = foundfam_counts.to_frame("Freq")

# Share of each group in percent, and the running (cumulative) share
ownership["Percent"] = 100 * ownership["Freq"] / ownership["Freq"].sum()
ownership["Cum"] = ownership["Percent"].cumsum()
ownership.round(2)


Unnamed: 0_level_0,Freq,Percent,Cum
foundfam_owned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,5913,57.51,57.51
1.0,4353,42.34,99.84
,16,0.16,100.0


In [104]:
# Proportion of managers/non-managers with a college degree
# need correction: -44 means do not know, -99 means missing


In [105]:
# Rescale both degree variables by 1/100 and blank out negative values
# (negative codes become missing).
# NOTE: this cell rescales the columns in place, so run it only once per load.
for degree_col in ("degree_m", "degree_nm"):
    rescaled = data[degree_col] / 100
    # keep non-negative values; everything else (negative or already missing) is NaN
    data[degree_col] = rescaled.where(rescaled >= 0)


In [106]:
# Generate bins from degree_nm


In [107]:
# Percentiles of degree_nm, shown as a single row
degree_nm_summary = data["degree_nm"].describe(
    percentiles=[0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]
)
# Drop the first four entries and the last one of describe(), keeping only
# the percentile entries in between
degree_nm_summary.iloc[4:-1].to_frame().T


Unnamed: 0,0%,10%,25%,50%,75%,90%,100%
degree_nm,0.0,0.0,0.0,0.05,0.14,0.3,1.0


In [108]:
# Left-closed bins ([a, b)) for degree_nm. The last edge is 1.01 rather than
# 1 so that a value of exactly 1.0 still falls into the top bin.
degree_nm_edges = [0, 0.001, 0.05, 0.20, 1.01]
data["degree_nm_bins"] = pd.cut(data["degree_nm"], bins=degree_nm_edges, right=False)

# Squared term of degree_nm
data["degree_nm_sq"] = data["degree_nm"].pow(2)


In [109]:
# Check the bins: smallest value, largest value and number of non-missing
# degree_nm observations within each bin (dropna=False also shows a group for
# observations without a bin).
# All aggregations are given as string aliases; passing the builtin `max`
# callable is deprecated in pandas in favour of the "max" alias.
# observed=False spells out the grouping behaviour for categorical keys
# instead of relying on a default that pandas is changing.
data.groupby("degree_nm_bins", dropna=False, observed=False).agg(
    min=("degree_nm", "min"), max=("degree_nm", "max"), n=("degree_nm", "count")
)


Unnamed: 0_level_0,min,max,n
degree_nm_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[0.0, 0.001)",0.0,0.0,3034
"[0.001, 0.05)",0.01,0.04,1434
"[0.05, 0.2)",0.05,0.19,3107
"[0.2, 1.01)",0.2,1.0,1957
,,,0


In [110]:
# Natural log of firm employment
data["lnemp"] = data["emp_firm"].pipe(np.log)


In [80]:
# Competition: trim whitespace from the labels, merge the two lowest
# categories into "0-4 competitors", then build one indicator per level.
competition_recode = {
    "0 competitors": "0-4 competitors",
    "1-4 competitors": "0-4 competitors",
    "5-9 competitors": "5-9 competitors",
    "10+ competitors": "10+ competitors",
}
competition_trimmed = data["competition"].str.strip()
data["competition"] = competition_trimmed.replace(competition_recode)

competition_flags = {
    "compet_weak": "0-4 competitors",
    "compet_moder": "5-9 competitors",
    "compet_strong": "10+ competitors",
}
for flag_name, level in competition_flags.items():
    # True where the recoded label equals this level (missing labels give False)
    data[flag_name] = data["competition"].eq(level)

In [81]:
# Industry in 2 digits

# Industry labels, one per line group below. The order matters: the labels are
# paired position-by-position with the sorted distinct values of the sic column.
industry_names = """
    food tobacco textile apparel lumber
    furniture paper printing chemical petrol
    rubber leather glass primary_metal fabricated_metal
    ind_machinery electronic transport instrument misc_manuf
""".split()


In [82]:
# Sorted distinct (non-missing) values of sic, paired by position with the
# industry labels.
_, index = pd.factorize(data["sic"], sort=True)

# zip() stops at the shorter input, so a mismatch in length would either shift
# labels onto the wrong codes or leave some codes without a label. Fail loudly.
if len(index) != len(industry_names):
    raise ValueError(
        f"Found {len(index)} distinct sic codes but {len(industry_names)} "
        "industry names; cannot pair them by position."
    )

dic = dict(zip(index, industry_names))

# Rows with a missing sic get a missing industry
data["industry"] = data["sic"].map(dic)


In [83]:
# Country identifier: copy of the cty column under the name used later on
data["countrycode"] = data.loc[:, "cty"]


In [84]:
# Firm age categories: unknown (missing), young (< 30), old (> 80) and
# mid (everything else, i.e. known age that is neither young nor old).
firm_age = data["firmage"]
data["age_unknown"] = firm_age.isna()
data["age_young"] = firm_age.lt(30)
data["age_old"] = firm_age.gt(80)
data["age_mid"] = ~(data["age_young"] | data["age_old"] | data["age_unknown"])


In [85]:
# Size of the data (rows, columns) after the variables above were added
data.shape


(10282, 258)

In [86]:
# Number of observations per ownership category, before sample selection
data["ownership"].value_counts()


ownership
Dispersed Shareholders            2745
Private Individuals               2118
Founder owned, founder CEO        1856
Family owned, family CEO          1755
Other                              527
Private Equity/Venture Capital     353
Family owned, external CEO         346
Founder owned, external CEO        300
Government                         170
Family owned, CEO unknown           55
Founder owned, CEO unknown          41
Name: count, dtype: int64

### SAMPLE SELECTION
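 Keep observations with:
     ownership that is not employee/research/government/other (the code drops "Government" and "Other", and rows with missing ownership)
     non-missing values for all variables used in the analysis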

In [87]:
# Keep rows whose ownership is known and is neither "Government" nor "Other"
excluded_ownership = ["Government", "Other"]
keep_ownership = data["ownership"].notna() & ~data["ownership"].isin(
    excluded_ownership
)
data = data.loc[keep_ownership]


In [88]:
# Keep only rows where every one of these variables is non-missing
required_columns = [
    "ownership",
    "management",
    "foundfam_owned",
    "degree_nm",
    "competition",
    "industry",
    "countrycode",
    "lnemp",
]
has_all_required = data[required_columns].notna().all(axis=1)
data = data.loc[has_all_required]


In [89]:
# Summary of firm employment, with the 1st, 50th and 99th percentiles,
# shown as a single row
emp_firm_summary = data["emp_firm"].describe(percentiles=[0.01, 0.5, 0.99])
emp_firm_summary.to_frame().T


Unnamed: 0,count,mean,std,min,1%,50%,99%,max
emp_firm,8666.0,838.88276,3772.303028,1.0,41.0,250.0,8000.0,176000.0


In [90]:
# Drop tiny and large firms

# Number of firms with fewer than 50 employees
len(data.loc[data["emp_firm"].lt(50)])


103

In [91]:
# Number of firms with more than 5000 employees
len(data.loc[data["emp_firm"].gt(5000)])


124

In [92]:
# Keep firms with 50 to 5000 employees (both limits included)
data = data.loc[data["emp_firm"].between(50, 5000)]

In [93]:
# Save the selected sample as the work file of this case study
work_file_path = data_out + "wms_da_textbook-work.csv"
data.to_csv(work_file_path)
