In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

In [None]:
#Reading the three input CSV files (age, tenure and conversion tables) into DataFrames
age_data = pd.read_csv(filepath_or_buffer="../../../../ceo_turnover/CEO_v2/age_data.csv")
tenure_data = pd.read_csv(filepath_or_buffer="../../../../ceo_turnover/CEO_v2/tenure_data.csv")
conversion_data = pd.read_csv(
    filepath_or_buffer="../../../../ceo_turnover/CEO Turnover Project/Raw Data Files/conversion_dataset.csv"
)

First we're going to clean the age data

In [None]:
#Creating NaN consistency: every missing value is represented by the string "NaN"
age_data = age_data.fillna("NaN")
#The result is assigned back to the column: an inplace replace on the selection
#age_data["DOB"] can end up modifying a temporary copy and leave age_data unchanged
age_data["DOB"] = age_data["DOB"].replace("n.a.", "NaN")

In [None]:
#Creating a function that will get the "DOB" column in the format we want. "DOB" column has
#5 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

def correct_dob(date):
    '''
    Convert one raw "DOB" entry into a pandas Timestamp.

    Returns
        - the string "NaN" for missing input ("NaN", None, NaN, NaT or a blank string)
        - a Timestamp for a year (int, whole float or string), "Mon YYYY" or "DD Mon YYYY"
        - the input itself when it already is a Timestamp (so the cell can be re-run)
        - the string "ERROR" for anything that matches none of the known layouts
    '''
    if not isinstance(date, str):
        if pd.isnull(date):
            return "NaN"
        if isinstance(date, pd.Timestamp):
            return date
        #Years read from a CSV are usually NumPy integers (or floats such as 1945.0), not plain int
        is_whole_float = isinstance(date, (float, np.floating)) and float(date).is_integer()
        if isinstance(date, (int, np.integer)) or is_whole_float:
            return pd.to_datetime(str(int(date)), format="%Y")
        return "ERROR"

    if date == "NaN":
        return date

    #split() without an argument ignores leading, trailing and repeated spaces
    parts = date.split()
    if len(parts) == 0:
        return "NaN"

    formats = {1: "%Y", 2: "%b %Y", 3: "%d %b %Y"}
    if len(parts) not in formats:
        return "ERROR"
    return pd.to_datetime(" ".join(parts), format=formats[len(parts)])

#otypes=[object] keeps Timestamps and the "NaN"/"ERROR" strings intact. Without it the output
#dtype is guessed from the first element (a leading "NaN" would turn every later Timestamp
#into a 3-character string) and empty input raises
dob_vector = np.vectorize(correct_dob, otypes=[object])

In [None]:
#Running every DOB entry through the vectorized parser, then storing the result in the column
parsed_dob = dob_vector(age_data["DOB"])
age_data["DOB"] = parsed_dob

In [None]:
#Age NANs are being left to be dealt with later

Now we're cleaning the tenure data

In [None]:
#Removing the entries that have an "N" in DateStartRole or DateEndRole, and the entries with a
#"C" in DateEndRole (directors who have remained in their roles)
placeholder_rows = (
    tenure_data["DateEndRole"].isin(["N", "C"])
    | (tenure_data["DateStartRole"] == "N")
)
tenure_data.drop(index=tenure_data.index[placeholder_rows], inplace=True)

In [None]:
#Parsing both role-date columns (format %Y%m%d) into "datetime" objects
for date_col in ("DateStartRole", "DateEndRole"):
    tenure_data[date_col] = pd.to_datetime(tenure_data[date_col], format="%Y%m%d")

In [None]:
#Dealing with the rows where the Start Date is later than the End Date (the two were swapped)
swapped = tenure_data["DateStartRole"] > tenure_data["DateEndRole"]

#Exchanging the two labels by name, so the fix does not rely on a hard-coded column order
#and does not assign to the columns of a slice
fixed_entries = tenure_data[swapped].rename(
    columns={"DateStartRole": "DateEndRole", "DateEndRole": "DateStartRole"}
)

#DataFrame.append was removed in pandas 2.0; pd.concat aligns on column names in the same way.
#As before, the corrected rows end up after the untouched ones
tenure_data = pd.concat([tenure_data[~swapped], fixed_entries])

Now we can join the two datasets

In [None]:
#Inner join of the tenure rows with the age rows on the column names the two frames share
joined_data = tenure_data.merge(right=age_data, how="inner")

Now we're going to join in the dataset that links BoardEx's "CompanyID" with Compustat's "GVKey". We do this now because it spares us from running our script on thousands of rows that we would eventually drop anyway, since we have no financial performance data for them.

In [None]:
#Attaching the conversion dataset: inner join of "CompanyID" against its "COMPANYID" column
joined_data = joined_data.merge(
    right=conversion_data,
    how="inner",
    left_on="CompanyID",
    right_on="COMPANYID",
)

Defining functions that will help us create the panel dataset

In [None]:
#Creating a function that will give us "age" in fractional years (whole months / 12)
def age_calculator(DOB, date):
    '''
    Return the age at `date` for someone born on `DOB`, in years.

    Both arguments are truncated to month precision before subtracting, so the
    result is always a whole number of months divided by 12 (e.g. 50.5).
    '''
    months = np.datetime64(date, "M") - np.datetime64(DOB, "M")
    return months / np.timedelta64(12, 'M')

#Vectorizing. The np.float alias was removed in NumPy 1.24; the builtin float is the same type
age_vector = np.vectorize(age_calculator, otypes=[float])

In [None]:
def age_column(DOB, date_range):
    '''
    Return one age per entry of `date_range` for a single date of birth.

    When the date of birth is missing, a list of "NaN" strings of the same
    length is returned instead of the array produced by age_vector.
    '''
    #handling "NaN" value: the DOB cleaning step stores missing dates as the string "NaN",
    #which pd.isnull does not recognise, so both representations are checked
    is_missing = (isinstance(DOB, str) and DOB == "NaN") or pd.isnull(DOB)
    if is_missing:
        return ["NaN"] * len(date_range)
    return age_vector(DOB, date_range)

In [None]:
def make_panel(idx, row):
    '''
    Expand one row into a monthly panel.

    The row is repeated once for every monthly period from its "DateStartRole"
    to its "DateEndRole", and two columns are added: "Age" (from age_column,
    based on the row's "DOB") and "date" (the period itself).
    `idx` is accepted but not used.
    '''
    #One monthly period per month of the role
    months = pd.period_range(start=row["DateStartRole"], end=row["DateEndRole"], freq='M')

    #Repeating the record once per month, then attaching the age and the month
    panel = pd.DataFrame([row] * len(months))
    panel["Age"] = age_column(row["DOB"], months)
    panel["date"] = months

    return panel

Now we're putting it all together

In [None]:
#25 minutes to execute - takes in a df with 300,000 entries and outputs a df with 13,000,000
#Each row of joined_data becomes its own monthly panel; the panels are stacked at the end
dfs = [
    make_panel(idx, row)
    for idx, row in tqdm(joined_data.iterrows(), total=len(joined_data))
]

panel_data = pd.concat(dfs)

In [None]:
def add_role_tenure(df):

    '''
    Calculate role_tenure by simply taking the current date minus the date_start_role

    Both "date" and "DateStartRole" are truncated to month precision, and the
    difference is stored in years (months / 12) in the "role_tenure" column.
    df is modified in place and also returned.
    '''

    month_gaps = [
        np.datetime64(current_date, "M") - np.datetime64(start_date, "M")
        for start_date, current_date in tqdm(zip(df["DateStartRole"], df["date"]), total=df.shape[0])
    ]

    #Building an explicit month-unit array keeps the division well defined even when
    #df has no rows (an empty Python list carries no time unit)
    df["role_tenure"] = np.array(month_gaps, dtype="timedelta64[M]") / np.timedelta64(12, 'M')

    return df

#Creating the role tenure for our panel_data
panel_data = add_role_tenure(panel_data)

The add_company_tenure function takes an incredibly long time to run. Everything after this cell is WIP - we are trying to find a more efficient way to implement the function.

In [None]:
#Loads a saved copy of panel_data that has all the changes we want up to "add_role_tenure" -
#this file can be found in the S3
#NOTE(review): the file name ends in .csv but it is read with pd.read_pickle - confirm that it
#really is a pickle. Unpickling can execute arbitrary code, so only load files from a trusted source
panel_data = pd.read_pickle("../../../../ceo_turnover/CEO_v2/directorships_v4.csv")

In [None]:
#This was the implementation of add_company_tenure we came up with originally
def add_company_tenure(df):
    '''
    Add a "company_tenure" column, starting from a copy of "role_tenure".

    df is sorted in place (by role_tenure descending, then by date, DirectorID
    and CompanyID) and every row is compared with the row directly before it:
        - different director or company: the row keeps its role_tenure
        - same director and company, different date: previous company_tenure + 1/12
        - same director, company and date: previous company_tenure
    df is modified in place and also returned.
    '''

    df["company_tenure"] = df["role_tenure"]

    df.sort_values(by=['role_tenure'], inplace=True, ascending=False)
    df.sort_values(by=['date', 'DirectorID', 'CompanyID'], inplace=True)
    #NOTE(review): 'date' is the first sort key, so the row before a given row will usually
    #carry the same date; the "+ 1/12" branch below may rarely be reached. Confirm whether
    #the intended order is DirectorID, CompanyID, date

    #Working on plain arrays keeps reads and writes strictly positional. Writing with
    #df.at[idx, ...] addresses index *labels*, which stop matching row positions once the
    #frame has been sorted (and which are not unique after the panels were concatenated)
    directors = df["DirectorID"].to_numpy()
    companies = df["CompanyID"].to_numpy()
    dates = df["date"].to_numpy()
    tenures = df["company_tenure"].to_numpy(dtype=float, copy=True)

    for pos in tqdm(range(1, len(df))):
        prev = pos - 1
        if directors[pos] != directors[prev] or companies[pos] != companies[prev]:
            continue

        if dates[pos] != dates[prev]:
            tenures[pos] = tenures[prev] + (1 / 12)
        else:
            tenures[pos] = tenures[prev]

    df["company_tenure"] = tenures

    return df

#Updating
panel_data = add_company_tenure(panel_data)

Work in progress

In [73]:
#Rows whose company name contains "Costco"; na=False treats a missing name as "no match"
#so the mask stays purely boolean
costco_data = panel_data[panel_data["CompanyName"].str.contains("Costco", na=False)]

In [264]:
#Largest role_tenure per director and role. The aggregation is named by string because
#passing the builtin max to pandas is deprecated
costco_table = costco_data.pivot_table(values=["role_tenure"], index="DirectorName", columns="RoleName", aggfunc="max")
costco_table

Unnamed: 0_level_0,role_tenure,role_tenure,role_tenure,role_tenure,role_tenure,role_tenure
RoleName,CEO,Chairman (Executive),Director - SD,President/CEO,President/COO,Senior Executive VP/Division COO
DirectorName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Jeff Brotman,,22.666667,,,,
Jim Sinegal,1.916667,,6.0,16.25,,
Richard DiCerchio,,,0.583333,,,6.416667
W Jelinek,,,,,1.833333,


In [265]:
#Summing only the role_tenure columns (missing roles are skipped), so re-running this cell
#does not add the previous company_tenure into the total
costco_table["company_tenure"] = costco_table["role_tenure"].sum(axis=1).to_numpy()


In [266]:
#Display only: reset_index() returns a new frame and the result is not assigned,
#so costco_table itself keeps DirectorName as its index
costco_table.reset_index()

Unnamed: 0_level_0,DirectorName,role_tenure,role_tenure,role_tenure,role_tenure,role_tenure,role_tenure,company_tenure
RoleName,Unnamed: 1_level_1,CEO,Chairman (Executive),Director - SD,President/CEO,President/COO,Senior Executive VP/Division COO,Unnamed: 8_level_1
0,Jeff Brotman,,22.666667,,,,,22.666667
1,Jim Sinegal,1.916667,,6.0,16.25,,,24.166667
2,Richard DiCerchio,,,0.583333,,,6.416667,7.0
3,W Jelinek,,,,,1.833333,,1.833333


In [None]:
#Determines whether or not a director turned over in a given window
def add_turnover_indicator(df, window_before=12):
    '''
    Flag the rows whose role ends less than `window_before` months after the row's date.

    Reads the 'Date' and 'role_end_date' columns and writes the boolean column
    'turnover_next_12_mo' (the column name is fixed, whatever `window_before` is).
    A 'role_end_date' that is not a datetime object never counts as a turnover.
    df is modified in place and also returned.
    '''
    #Imported locally so the type check works however the notebook imported datetime
    #(with "from datetime import datetime", the expression datetime.datetime does not exist)
    import datetime as _dt

    #Average length of a calendar month, used in place of np.timedelta64(1, 'M');
    #NOTE(review): recent pandas is believed to reject month-unit divisors - confirm
    one_month = pd.Timedelta(days=365.2425 / 12)

    def months_until(current_date, end_date):
        #Anything that is not a real date is pushed just outside the window
        if not isinstance(end_date, _dt.datetime):
            return window_before + 1
        return (end_date - current_date) / one_month

    #Iterating over the two columns directly avoids looking rows up with df.iloc using
    #index labels, which only works when the index happens to be 0..n-1
    df['turnover_next_12_mo'] = [
        bool(months_until(current_date, end_date) < window_before)
        for current_date, end_date in zip(df['Date'], df['role_end_date'])
    ]

    return df

In [None]:
#Only keeping CEOs - RUN THIS WHEN COMPLETELY DONE
#na=False treats a missing RoleName as "not a CEO", so the mask stays purely boolean
#NOTE(review): this filters tenure_data rather than panel_data - confirm which frame is meant
tenure_data = tenure_data[tenure_data["RoleName"].str.contains("CEO", na=False)]

In [None]:
#Writes tenure_data to "directorships_final.csv" (a path relative to the working directory)
#NOTE(review): this saves tenure_data, not panel_data - confirm that is the intended output
tenure_data.to_csv("directorships_final.csv")