In [112]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [56]:
#importing the data
age_data = pd.read_csv("../../../../ceo_turnover/CEO_v2/age_data.csv")
tenure_data = pd.read_csv("../../../../ceo_turnover/CEO_v2/tenure_data.csv")
conversion_data = pd.read_csv("../../../../ceo_turnover/CEO Turnover Project/Raw Data Files/conversion_dataset.csv")

  interactivity=interactivity, compiler=compiler, result=result)


First we're going to clean the age data

In [57]:
#Creating NaN consitency
age_data.fillna("NaN", inplace=True)
age_data["DOB"].replace("n.a.", "NaN", inplace=True)

In [58]:
#Creating a function that will get the "DOB" column in the format we want. "DOB" column has
#4 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

def correct_dob(date):
    if date == "NaN":
        return date
    elif type(date) == int:
        return pd.to_datetime(date, format="%Y")
    elif len(date.split(" ")) == 1:
        return pd.to_datetime(date, format="%Y")
    elif len(date.split(" ")) == 2:
        return pd.to_datetime(date, format="%b %Y")
    elif len(date.split(" ")) == 3:
        return pd.to_datetime(date, format="%d %b %Y")
    else:
        return "ERROR"

dob_vector = np.vectorize(correct_dob)

In [59]:
#Updating the DOB column
age_data["DOB"] = dob_vector(age_data["DOB"])

In [60]:
#Age NANs are being left to be dealt with later

Now we're cleaning the tenure data

In [61]:
#Dropping the entries that either have an "N" in the StartDate or EndDate and dropping directors who have remained in their roles
tenure_data.drop(tenure_data[tenure_data["DateEndRole"] == "N"].index, inplace=True)
tenure_data.drop(tenure_data[tenure_data["DateStartRole"] == "N"].index, inplace=True)
tenure_data.drop(tenure_data[tenure_data["DateEndRole"] == "C"].index, inplace=True)

In [62]:
#Converting to "datetime" objects
tenure_data["DateStartRole"] = pd.to_datetime(tenure_data["DateStartRole"], format="%Y%m%d")
tenure_data["DateEndRole"] = pd.to_datetime(tenure_data["DateEndRole"], format="%Y%m%d")

In [63]:
#Dealing with the columns that have the Start Date and End Date reversed
fixed_entries = tenure_data[tenure_data["DateStartRole"] > tenure_data["DateEndRole"]]
fixed_entries.columns = ['CompanyID', 'DirectorID', 'DirectorName', 'CompanyName', 'RoleName', 'DateEndRole', 'DateStartRole', 'Seniority']
tenure_data.drop(fixed_entries.index, inplace=True)
tenure_data = tenure_data.append(fixed_entries)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [91]:
#Adding the "DateStartCompany" and "DateEndCompany" columns to facilitate creating the "company_tenure" column later on
company_starts = tenure_data.groupby(by=["CompanyName","DirectorName", "RoleName"])["DateStartRole"].first().groupby(by="DirectorName").min().rename("DateStartCompany")
company_ends = tenure_data.groupby(by=["CompanyName","DirectorName", "RoleName"])["DateEndRole"].first().groupby(by="DirectorName").max().rename("DateEndCompany")
tenure_data = pd.merge(left=tenure_data, right=company_starts, left_on="DirectorName", right_on=company_starts.index)
tenure_data = pd.merge(left=tenure_data, right=company_ends, left_on="DirectorName", right_on=company_ends.index)

Now we can join the two datasets

In [93]:
#Executing the inner join
joined_data = pd.merge(left=tenure_data, right=age_data)

Now we're going to join in the dataset that links BoardEx's "ComapnyID" with Compustat's "GVKey". We're going to do this now because it will spare us from running our script on thousands of rows that we'll eventually drop anyway because we have no financial performance data on them

In [94]:
#Joining in the conversion dataset
joined_data = pd.merge(left=joined_data, right=conversion_data, left_on="CompanyID", right_on="COMPANYID")

Defining functions that will help us create the panel dataset

In [96]:
#Creating a function that will give us "age" in the desired format - Y.MMM
def age_calculator(DOB, date):
    months = np.datetime64(date, "M") - np.datetime64(DOB, "M")
    age = months / np.timedelta64(12,'M')
    return age

#Vectorizing 
age_vector = np.vectorize(age_calculator, otypes=[np.float])

In [97]:
def age_column(DOB, date_range):
    #handling "NaN" value
    if pd.isnull(DOB):
        ages = ["NaN"] * len(date_range)
    else:
        ages = age_vector(DOB, date_range)
    return ages

In [98]:
def make_panel(idx, row):

    #Declaring key variables
    start = row["DateStartRole"]
    end = row["DateEndRole"]
    dob = row["DOB"]
    date_range = pd.period_range(start=start, end=end, freq='M')

    #Ages column
    ages = age_column(dob, date_range)
  
    #Makeing the panel
    current_panel = pd.DataFrame([row]*len(date_range))
    current_panel["Age"] = ages
    current_panel["date"] = date_range
    
    return current_panel

Now we're putting it all together

In [99]:
#25 minutes to execute - takes in a df with 300,000 entries and outputs a df with 13,000,000
dfs = []

for idx, row in tqdm(joined_data.iterrows(), total=joined_data.shape[0]):
    curr_df = make_panel(idx, row)
    dfs.append(curr_df)
    
panel_data = pd.concat(dfs)

100%|██████████| 274381/274381 [21:40<00:00, 210.93it/s]


Calculating both role_tenure and company_tenure for each director in our dataset

In [103]:
#Adds both comapany and role tenure for each director
def add_tenures(df):

    role_tenure = []
    company_tenure = []

    for role_start, company_start, current_date in tqdm(zip(df["DateStartRole"], df["DateStartCompany"], df["date"]), total=df.shape[0]):
        role_tenure.append(np.datetime64(current_date, "M") - np.datetime64(role_start, "M"))
        company_tenure.append(np.datetime64(current_date, "M") - np.datetime64(company_start, "M"))
    
    df["role_tenure"] = role_tenure / np.timedelta64(12,'M')
    df["company_tenure"] = company_tenure / np.timedelta64(12,'M')

    return df

#Creating the "role_tenure" and "company_tenure" columns for our panel_data
panel_data = add_tenures(panel_data)

100%|██████████| 13621529/13621529 [07:44<00:00, 29347.35it/s]


In [None]:
#We can drop all non-CEO entries now as the tenures have been calculated - reduces df size from 13,000,000 to 700,000
panel_data = panel_data[panel_data["RoleName"].str.contains("CEO")]

Now we can add the True/False variable that describes if a CEO has been removed from his position or not

In [250]:
#Adds "True" to a row if a turnover occured within a specified window of months before the event
#Can't be forward looking because we don't have data on directors after a turnover event occured
def add_turnover_indicator(df, window):
    
    turnover_data = []

    for end_date, current_date in tqdm(zip(df["DateEndRole"], df["date"]), total=df.shape[0]):
        upper_bound = end_date 
        lower_bound = end_date - pd.DateOffset(months=window)
        date_range = pd.period_range(start=lower_bound, end=upper_bound, freq="M")
        if current_date in date_range:
            turnover_data.append(True)
        else:
            turnover_data.append(False)
            
    df["Turnover"] = turnover_data

#Creating the "Turnover" column for our panel_data with a window of 12 months looking back
add_turnover_indicator(panel_data, 12)

100%|██████████| 13621529/13621529 [1:13:51<00:00, 3073.79it/s]


In [256]:
#Final sorting so that the df is in the order we want
panel_data.sort_values(by=["CompanyName","DirectorName","date"], inplace=True)

In [261]:
#Exporting - exporting to pkl because csv was taking far too long to export
panel_data.to_pickle("../../../../ceo_turnover/CEO_v2/directorships.pkl")