In [161]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [162]:
#importing the data
age_data = pd.read_csv("../../../../ceo_turnover/CEO_v2/age_data.csv")
tenure_data = pd.read_csv("../../../../ceo_turnover/CEO_v2/tenure_data.csv")
conversion_data = pd.read_csv("../../../../ceo_turnover/CEO Turnover Project/Raw Data Files/conversion_dataset.csv")

First we're going to clean the age data

In [163]:
#Creating NaN consitency
age_data.fillna("NaN", inplace=True)
age_data["DOB"].replace("n.a.", "NaN", inplace=True)

In [164]:
#Creating a function that will get the "DOB" column in the format we want. "DOB" column has
#4 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

def correct_dob(date):
    if date == "NaN":
        return "NaN"
    elif type(date) == int:
        return pd.to_datetime(date, format="%Y")
    elif len(date.split(" ")) == 1:
        return pd.to_datetime(date, format="%Y")
    elif len(date.split(" ")) == 2:
        return pd.to_datetime(date, format="%b %Y")
    elif len(date.split(" ")) == 3:
        return pd.to_datetime(date, format="%d %b %Y")
    else:
        return "ERROR"

dob_vector = np.vectorize(correct_dob)

In [165]:
#Updating the DOB column
age_data["DOB"] = dob_vector(age_data["DOB"])

Now we're cleaning the tenure data

In [169]:
#Dropping the entries that either have an "N" in the StartDate or EndDate and dropping directors who have remained in their roles
tenure_data.drop(tenure_data[tenure_data["DateEndRole"] == "N"].index, inplace=True)
tenure_data.drop(tenure_data[tenure_data["DateStartRole"] == "N"].index, inplace=True)
tenure_data.drop(tenure_data[tenure_data["DateEndRole"] == "C"].index, inplace=True)

In [177]:
#Converting to "datetime" objects
tenure_data["DateStartRole"] = tenure_data["DateStartRole"].astype(np.datetime64)
tenure_data["DateEndRole"] = tenure_data["DateEndRole"].astype(np.datetime64)

In [171]:
#Dealing with the columns that have the Start Date and End Date reversed
fixed_entries = tenure_data[tenure_data["DateStartRole"] > tenure_data["DateEndRole"]]
fixed_entries.columns = ['CompanyID', 'DirectorID', 'DirectorName', 'CompanyName', 'RoleName', 'DateEndRole', 'DateStartRole', 'Seniority']
tenure_data.drop(fixed_entries.index, inplace=True)
tenure_data = tenure_data.append(fixed_entries)

In [172]:
#Adding the "DateStartCompany" and "DateEndCompany" columns to facilitate creating the "company_tenure" column later on
company_starts = tenure_data.groupby(by=["CompanyName","DirectorName", "RoleName"])["DateStartRole"].first().groupby(by="DirectorName").min().rename("DateStartCompany")
company_ends = tenure_data.groupby(by=["CompanyName","DirectorName", "RoleName"])["DateEndRole"].first().groupby(by="DirectorName").max().rename("DateEndCompany")
tenure_data = pd.merge(left=tenure_data, right=company_starts, left_on="DirectorName", right_on=company_starts.index)
tenure_data = pd.merge(left=tenure_data, right=company_ends, left_on="DirectorName", right_on=company_ends.index)

Now we can join the two datasets

In [173]:
#Executing the inner join
joined_data = pd.merge(left=tenure_data, right=age_data)

We're going to join in the dataset that links BoardEx's "ComapnyID" with Compustat's "GVKey" now because it will spare us from running our script on thousands of rows that we'll eventually drop anyway because we have no financial performance data on them

In [174]:
#Joining in the conversion dataset
joined_data = pd.merge(left=joined_data, right=conversion_data, left_on="CompanyID", right_on="COMPANYID")

We can also safely drop all the non-CEO roles from our dataset because, now that we've calcualted DateStartCompany and DateEndCompany, the company_tenure column can still be calculated

In [175]:
#We can drop all non-CEO entries now as the tenures have been calculated - reduces df size from 13,000,000 to 700,000
joined_data = joined_data[joined_data["RoleName"].str.contains("CEO")]

Here we expand every individual row in the dataframe into a time series and add colums for age, role_tenure, and company_tenure

In [176]:
#This function converts our df into a time series, with an entry every month with updated ages, role tenures, and company tenures
def make_ts(df):

    #Storing all the blown up rows
    dfs = []

    #Helper function that expands each individual row (i.e. a given director in a given role)
    def expand_row(idx, row):

        #Declaring key variables for each row
        role_start = np.datetime64(row["DateStartRole"], "M")
        role_end = np.datetime64(row["DateEndRole"], "M")
        company_start = np.datetime64(row["DateStartCompany"], "M")
        dob = np.datetime64(row["DOB"], "M") if row["DOB"] is not pd.NaT else pd.NaT

        #Creating an array that has every month this particular director served in his role
        role_range = np.arange(role_start, role_end, dtype="M8[M]")

        #Making the panel
        current_panel = pd.DataFrame([row]*len(role_range))
        current_panel["date"] = role_range
        current_panel["ages"] = (role_range - dob) / np.timedelta64(12,'M') if not pd.isna(dob) else pd.NaT
        current_panel["role_tenure"] = (role_range - role_start) / np.timedelta64(12,'M')
        current_panel["company_tenure"] = (role_range - company_start) / np.timedelta64(12,'M')

        return current_panel

    #Now we apply expand_row on every row of the df and concatenate them all together at the end
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        curr_df = expand_row(idx, row)
        dfs.append(curr_df)
    
    #Aggregating
    return pd.concat(dfs)

In [None]:
#Applying the function to create the time series
time_series = make_ts(joined_data)

Finally, we set the target variable - "Turnover(T/F)" - to True if a turnover occured within a given window

In [183]:
#Adds "True" to a row if a turnover occured within a specified window of months before the event
#Can't be forward looking because we don't have data on directors after a turnover event occured
def add_turnover_indicator(df, window):
    
    turnover_data = []

    for end_date, current_date in tqdm(zip(df["DateEndRole"], df["date"]), total=df.shape[0]):
        upper_bound = end_date 
        lower_bound = end_date - pd.DateOffset(months=window)
        date_range = pd.period_range(start=lower_bound, end=upper_bound, freq="M")
        if current_date in date_range:
            turnover_data.append(True)
        else:
            turnover_data.append(False)
            
    df["Turnover"] = turnover_data

#Creating the "Turnover" column for our panel_data with a window of 12 months looking back
add_turnover_indicator(time_series, 12)

100%|██████████| 757760/757760 [04:48<00:00, 2627.34it/s]


In [185]:
time_series.to_csv("../../../../ceo_turnover/CEO_v2/directorships.csv")