#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### CHAPTER 21
**CH21A Founder/family ownership and quality of management**

 using the wms-management dataset
 
 version 1.0 2021-05-05

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from plotnine import *

warnings.filterwarnings("ignore")


In [2]:
# Repository root: the part of the working directory path that precedes
# the "da_case_studies" folder name
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input data folder, case-study folder and its output sub-folder
data_in = f"{dirname}da_data_repo/wms-management-survey/clean/"
data_out = f"{dirname}da_case_studies/ch21-ownership-management-quality/"
output = f"{data_out}output/"

# Folder holding the shared helper module; appended to the import search path
func = f"{dirname}da_case_studies/ch00-tech-prep/"
sys.path.append(func)


In [3]:
# Import the prewritten helper functions
from py_helper_functions import *


## PART I

### Data prep

In [4]:
# Read the survey file from the local clean-data folder
# (alternative source: pd.read_csv("https://osf.io/zy9j8/download"))
data = pd.read_csv(f"{data_in}wms_da_textbook-xsec.csv")


In [5]:
# Ownership: define founder/family owned and drop ownership that's missing or not relevant
# Ownership


In [6]:
# Frequency table by ownership type; rows with missing ownership form their own group
ownership = data.groupby("ownership", dropna=False).agg(Freq=("firmid", "count"))
# Share of each group in percent, and the running total of those shares
ownership["Percent"] = 100 * ownership["Freq"] / ownership["Freq"].sum()
ownership["Cum"] = ownership["Percent"].cumsum()
ownership.round(2)


Unnamed: 0_level_0,Freq,Percent,Cum
ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dispersed Shareholders,3772,26.42,26.42
"Family owned, CEO unknown",176,1.23,27.65
"Family owned, external CEO",446,3.12,30.78
"Family owned, family CEO",2370,16.6,47.38
"Founder owned, CEO unknown",124,0.87,48.25
"Founder owned, external CEO",339,2.37,50.62
"Founder owned, founder CEO",2427,17.0,67.62
Government,261,1.83,69.45
Other,968,6.78,76.23
Private Equity/Venture Capital,492,3.45,79.67


In [7]:
# Define foundfam owned


In [8]:
# Ownership labels that count as founder- or family-owned
foundfam_categories = [
    "Family owned, external CEO",
    "Family owned, family CEO",
    "Family owned, CEO unknown",
    "Founder owned, external CEO",
    "Founder owned, CEO unknown",
    "Founder owned, founder CEO",
]
is_foundfam = data["ownership"].isin(foundfam_categories)

# 1 = founder/family owned, 0 = any other known ownership, NaN = ownership missing
data["foundfam_owned"] = np.where(
    is_foundfam, 1, np.where(data["ownership"].isnull(), np.nan, 0)
)


In [9]:
# Foundfam owned


In [10]:
# Frequency table of the founder/family indicator (missing values kept as a group)
ownership = (
    data.groupby("foundfam_owned", dropna=False)["firmid"].count().to_frame("Freq")
)
# Percentage share of each group and the cumulative percentage
ownership["Percent"] = 100 * ownership["Freq"] / ownership["Freq"].sum()
ownership["Cum"] = ownership["Percent"].cumsum()
ownership.round(2)


Unnamed: 0_level_0,Freq,Percent,Cum
foundfam_owned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,8378,58.68,58.68
1.0,5882,41.2,99.88
,17,0.12,100.0


In [11]:
# Proportion of managers/non-managers with a college degree
# need correction: -44 means do not know, -99 means missing


In [12]:
# Rescale both degree variables by 1/100, then set the negative codes
# (-44 = do not know, -99 = missing) to missing
for col in ["degree_m", "degree_nm"]:
    share = data[col] / 100
    data[col] = share.mask(share < 0)


In [13]:
# Generate bins from degree_nm


In [14]:
# Percentiles of degree_nm as a one-row table; the positional slice keeps
# only the percentile columns (0% ... 100%) of the describe() output
degree_nm_summary = data["degree_nm"].describe(
    percentiles=[0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]
)
degree_nm_summary.to_frame().T.iloc[:, 4:-1]


Unnamed: 0,0%,10%,25%,50%,75%,90%,100%
degree_nm,0.0,0.0,0.0,0.05,0.15,0.3,1.0


In [15]:
# Left-closed intervals: [0, 0.001), [0.001, 0.05), [0.05, 0.2), [0.2, 1.01)
degree_nm_edges = [0, 0.001, 0.05, 0.20, 1.01]
data["degree_nm_bins"] = pd.cut(data["degree_nm"], bins=degree_nm_edges, right=False)

# Squared term of degree_nm
data["degree_nm_sq"] = data["degree_nm"].pow(2)


In [16]:
# Check the bins: smallest and largest degree_nm and the number of non-missing
# observations in each bin (a missing bin is kept as its own row).
# All three aggregations are named by string: passing the Python builtin `max`
# as a callable is deprecated in recent pandas and was inconsistent with the
# "min" and "count" entries next to it.
data.groupby("degree_nm_bins", dropna=False).agg(
    min=("degree_nm", "min"), max=("degree_nm", "max"), n=("degree_nm", "count")
)


Unnamed: 0_level_0,min,max,n
degree_nm_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[0.0, 0.001)",0.0,0.0,3974
"[0.001, 0.05)",0.01,0.04,1939
"[0.05, 0.2)",0.05,0.19,4207
"[0.2, 1.01)",0.2,1.0,2679
,,,0


In [17]:
# Natural logarithm of firm employment
data["lnemp"] = data["emp_firm"].pipe(np.log)


In [18]:
# Competition: trim surrounding whitespace from the labels, then build
# three indicator columns from the cleaned labels
competition = data["competition"].str.strip()
data["competition"] = competition
data["compet_weak"] = competition.isin(["0 competitors", "1-4 competitors"])
data["compet_moder"] = competition.eq("5-9 competitors")
data["compet_strong"] = competition.eq("10+ competitors")


In [19]:
# Industry labels for the two-digit industry codes. The order matters: the
# labels are later paired position by position with the sorted codes.
industry_names = [
    "food", "tobacco", "textile", "apparel",
    "lumber", "furniture", "paper", "printing",
    "chemical", "petrol", "rubber", "leather",
    "glass", "primary_metal", "fabricated_metal", "ind_machinery",
    "electronic", "transport", "instrument", "misc_manuf",
]


In [20]:
# Pair every distinct SIC code, in ascending order, with the label at the
# same position in industry_names.
_, index = pd.factorize(data["sic"], sort=True)

# zip() stops at the shorter input, so a difference between the number of
# distinct codes and the number of labels would go unnoticed and the
# positional pairing could no longer be trusted - fail loudly instead.
if len(index) != len(industry_names):
    raise ValueError(
        f"Expected {len(industry_names)} distinct SIC codes, found {len(index)}"
    )

dic = dict(zip(index, industry_names))
# Rows with a missing SIC code get a missing industry
data["industry"] = data["sic"].map(dic)


In [21]:
# Country identifier: countrycode is a copy of the cty column
# (no dtype conversion is applied at this point)
data["countrycode"] = data["cty"].copy()


In [22]:
# Firm age groups: unknown (missing), young (< 30), old (> 80), mid (the rest)
firmage = data["firmage"]
data["age_unknown"] = firmage.isnull()
data["age_young"] = firmage.lt(30)
data["age_old"] = firmage.gt(80)
# Mid = age is known and the firm is in neither the young nor the old group
data["age_mid"] = ~(data["age_young"] | data["age_old"] | data["age_unknown"])


In [23]:
# Dimensions (rows, columns) of the data before sample selection
data.shape


(14277, 258)

In [24]:
# Number of observations per ownership type, most frequent first
data["ownership"].value_counts()


Dispersed Shareholders            3772
Private Individuals               2885
Founder owned, founder CEO        2427
Family owned, family CEO          2370
Other                              968
Private Equity/Venture Capital     492
Family owned, external CEO         446
Founder owned, external CEO        339
Government                         261
Family owned, CEO unknown          176
Founder owned, CEO unknown         124
Name: ownership, dtype: int64

### SAMPLE SELECTION
 Keep observations with:
     ownership that is not of the employee/research/government/other type
     non-missing values for the variables used

In [25]:
# Exclude "Government" and "Other" ownership, then rows with missing ownership
excluded_ownership = ["Government", "Other"]
data = data.loc[~data["ownership"].isin(excluded_ownership)].dropna(
    subset=["ownership"]
)


In [26]:
# Keep only rows that have no missing value in any of these columns
required_columns = [
    "ownership",
    "management",
    "foundfam_owned",
    "degree_nm",
    "competition",
    "industry",
    "countrycode",
    "lnemp",
]
data = data.dropna(subset=required_columns)


In [27]:
# Summary of firm employment as a one-row table, with the 1st, 50th and
# 99th percentiles
data["emp_firm"].describe(percentiles=[0.01, 0.5, 0.99]).to_frame().T


Unnamed: 0,count,mean,std,min,1%,50%,99%,max
emp_firm,11672.0,797.526474,3347.958543,1.0,50.0,260.0,7000.0,176000.0


In [28]:
# Drop tiny and large firms
# Step 1: count the firms with fewer than 50 employees
int((data["emp_firm"] < 50).sum())


113

In [29]:
# Count the firms with more than 5000 employees
int((data["emp_firm"] > 5000).sum())


142

In [30]:
# Keep firms with 50 to 5000 employees, both bounds included, so that exactly
# the firms with emp_firm < 50 or emp_firm > 5000 are removed.
# Both bounds have to hold at the same time: joining the two comparisons with
# "|" is true for every non-missing value and therefore drops nothing.
data = data.loc[data["emp_firm"].between(50, 5000)]


In [31]:
# Save the working sample; to_csv also writes the DataFrame index as the first column
work_file = data_out + "wms_da_textbook-work.csv"
data.to_csv(work_file)
