In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

In [None]:
#importing the data
#both raw CSV files are read in one pass and unpacked in order: ages first, tenures second
age_data, tenure_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../../../ceo_turnover/CEO_v2/age_data.csv",
        "../../../../ceo_turnover/CEO_v2/tenure_data.csv",
    )
)

First we're going to clean the age data

In [None]:
#Creating NaN consistency: every missing value in age_data becomes the string "NaN"
age_data.fillna("NaN", inplace=True)
#"n.a." in the DOB column is mapped to the same placeholder. The result is assigned back to
#the column: calling replace(..., inplace=True) on a column selection can act on a temporary
#copy and leave age_data unchanged.
age_data["DOB"] = age_data["DOB"].replace("n.a.", "NaN")

In [None]:
#Creating a function that will get the "DOB" column in the format we want. The "DOB" column has
#5 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

#strptime format for a DOB string, keyed by its number of whitespace-separated tokens
_DOB_FORMATS = {1: "%Y", 2: "%b %Y", 3: "%d %b %Y"}


def correct_dob(date):
    """Convert one raw "DOB" entry into a pandas Timestamp.

    Accepted inputs
    ---------------
    * ``"01 Apr 1918"`` - day, abbreviated month name, year
    * ``"Sep 1945"``    - abbreviated month name, year
    * ``"1945"``        - year as text
    * ``1945``          - year as a number (Python or NumPy integer, or a whole float)
    * ``"NaN"``, an empty string or any null value - missing

    Returns
    -------
    pd.Timestamp for a recognised date, ``pd.NaT`` for a missing entry (so that
    ``pd.isnull`` detects it), or the string ``"ERROR"`` when the entry does not
    have one of the layouts above.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when the text has the right number of
        tokens but does not match the corresponding format.
    """
    if not isinstance(date, str):
        if pd.isnull(date):
            return pd.NaT
        #numeric years: NumPy integers and whole floats are accepted as well as plain ints
        if isinstance(date, (int, np.integer)):
            return pd.to_datetime(str(int(date)), format="%Y")
        if isinstance(date, (float, np.floating)) and float(date).is_integer():
            return pd.to_datetime(str(int(date)), format="%Y")
        return "ERROR"

    tokens = date.split()
    if not tokens or tokens == ["NaN"]:
        return pd.NaT

    date_format = _DOB_FORMATS.get(len(tokens))
    if date_format is None:
        return "ERROR"
    #re-joining with single spaces makes the text match the format even with irregular spacing
    return pd.to_datetime(" ".join(tokens), format=date_format)

#otypes is fixed to object: otherwise np.vectorize infers the output dtype from the first
#element only, and a leading missing value would change the dtype of the whole result
dob_vector = np.vectorize(correct_dob, otypes=[object])

In [None]:
#Updating the DOB column
#dob_vector applies correct_dob to every entry; the array it returns replaces the raw column
age_data["DOB"] = dob_vector(age_data["DOB"])

In [None]:
#Age NANs are being left to be dealt with later

Now we're cleaning the tenure data

In [None]:
#Dropping the entries that either have an "N" in the StartDate or EndDate and dropping directors who have remained in their roles
#(one combined mask: end date is "N" or "C", or start date is "N")
unusable_rows = (
    tenure_data["DateEndRole"].isin(["N", "C"])
    | (tenure_data["DateStartRole"] == "N")
)
tenure_data.drop(tenure_data.index[unusable_rows], inplace=True)

In [None]:
#Converting to "datetime" objects
#both role-date columns are stored as YYYYMMDD text and are parsed the same way
for date_column in ("DateStartRole", "DateEndRole"):
    tenure_data[date_column] = pd.to_datetime(tenure_data[date_column], format="%Y%m%d")

In [None]:
#Dealing with the rows that have the Start Date and End Date swapped (start later than end):
#the two column labels are exchanged on those rows only, so that re-joining them by column
#name puts each value under the correct heading. Renaming avoids hard-coding the full
#column list, which would break if the CSV layout changed.
fixed_entries = tenure_data[tenure_data["DateStartRole"] > tenure_data["DateEndRole"]].rename(
    columns={"DateStartRole": "DateEndRole", "DateEndRole": "DateStartRole"}
)
tenure_data.drop(fixed_entries.index, inplace=True)
#pd.concat replaces DataFrame.append (removed in pandas 2.0); columns are aligned by name and
#the corrected rows end up at the bottom of the frame, as before
tenure_data = pd.concat([tenure_data, fixed_entries])

Now we can join the two datasets

In [None]:
#Executing the inner join
#no key is given, so the join uses every column name the two frames have in common
joined_data = tenure_data.merge(age_data, how="inner")

Some rows had the Start Date and End Date swapped; these were corrected in the tenure-cleaning step above.

Defining functions that will help us create the panel dataset

In [None]:
#Creating a function that will give us "age" in the desired format - years as a decimal number
def _to_month(value):
    """Truncate a date-like value to a month-resolution ``np.datetime64``."""
    #pd.Period objects (the elements of pd.period_range) are converted to a Timestamp first
    if isinstance(value, pd.Period):
        value = value.to_timestamp()
    return np.datetime64(value, "M")


def age_calculator(DOB, date):
    """Return the age, in years, of someone born on ``DOB`` at ``date``.

    Both arguments are truncated to calendar months, so the result is the number
    of whole months between them divided by 12 (e.g. 18 months -> 1.5).

    Parameters
    ----------
    DOB : date-like
        Date of birth; anything ``np.datetime64`` accepts, or a ``pd.Period``.
    date : date-like
        Date at which the age is wanted; same accepted types as ``DOB``.

    Returns
    -------
    float
        Age in years; negative when ``date`` is earlier than ``DOB``.
    """
    months = _to_month(date) - _to_month(DOB)
    age = months / np.timedelta64(12, 'M')
    return age

#Vectorizing
#the builtin float is used as output type (the np.float alias no longer exists in recent NumPy)
age_vector = np.vectorize(age_calculator, otypes=[float])

In [None]:
def age_column(DOB, date_range):
    """Build the "Age" values for one person across ``date_range``.

    Parameters
    ----------
    DOB : date-like, null, or the string "NaN"
        Date of birth. Real nulls (None / NaN / NaT) and the "NaN" placeholder
        string that the cleaning cells write for missing values are both treated
        as an unknown date of birth.
    date_range : sized iterable of date-likes
        The dates at which the age is wanted.

    Returns
    -------
    A list of "NaN" strings (one per date) when the date of birth is unknown,
    otherwise the array produced by ``age_vector(DOB, date_range)``.
    """
    #handling "NaN" value
    dob_is_missing = pd.isnull(DOB) or (isinstance(DOB, str) and DOB == "NaN")
    if dob_is_missing:
        return ["NaN"] * len(date_range)
    return age_vector(DOB, date_range)

In [None]:
def make_panel(idx, row):
    """Expand one role record into one row per month of that role.

    Parameters
    ----------
    idx : index label
        Label of the record inside ``row``.
    row : pd.DataFrame
        Single-row frame containing "DateStartRole", "DateEndRole" and "DOB".

    Returns
    -------
    pd.DataFrame
        ``row`` repeated once for every month from the start date to the end
        date, with an added "Age" column computed by ``age_column``. When the
        end date lies before the start date there are no months to cover and an
        empty frame with the same columns plus "Age" is returned.
    """

    #Declaring key variables
    start = row["DateStartRole"][idx]
    end_value = row["DateEndRole"][idx]
    #an end value of -1 is replaced by today's date
    #(presumably a marker for a role that is still open - TODO confirm)
    end = pd.to_datetime('today') if end_value == -1 else end_value
    date_range = pd.period_range(start=start, end=end, freq='M')
    dob = row["DOB"][idx]

    #Nothing to expand: return an empty panel rather than concatenating an empty list
    if len(date_range) == 0:
        empty_panel = row.iloc[0:0].copy()
        empty_panel["Age"] = np.array([], dtype=float)
        return empty_panel

    #Ages
    ages = age_column(dob, date_range)

    #Making the panel
    current_panel = pd.concat([row] * len(date_range))
    current_panel["Age"] = ages

    return current_panel

Now we're putting it all together

In [None]:
#30 minutes to execute
dfs = []
n_records = len(joined_data)

for i in tqdm(range(n_records)):
    #each record is passed on as a one-row frame together with its position
    single_record = joined_data.iloc[i:i + 1]
    curr_df = make_panel(i, single_record)
    dfs.append(curr_df)

#stacking the per-record panels into one long frame
panel_data = pd.concat(dfs)

In [None]:
def _month_number(column):
    '''Return year * 12 + month for every entry of column as a float array (NaN where missing).'''
    values = pd.Series(column).reset_index(drop=True)
    if isinstance(values.dtype, pd.PeriodDtype):
        values = values.dt.to_timestamp()
    stamps = pd.to_datetime(values)
    return (stamps.dt.year * 12 + stamps.dt.month).to_numpy(dtype=float)


def add_role_tenure(df):

    '''
    Calculate role_tenure by simply taking the current date minus the date_start_role

    Both dates are truncated to calendar months; the tenure is the number of months
    between them divided by 12, i.e. a length in years.

    Parameters:
        df: DataFrame with a "date_start_role" and a "date" column holding date-like values

    Returns:
        The same DataFrame object, with a float "role_tenure" column added or overwritten.
        Rows with a missing date get NaN. An empty frame is handled and simply gains the column.
    '''

    elapsed_months = _month_number(df["date"]) - _month_number(df["date_start_role"])

    #assigned as a plain array so the values are written by position, whatever the index looks like
    df["role_tenure"] = elapsed_months / 12

    return df

#Creating the role tenure for our panel_data
#(the frame returned by add_role_tenure is bound back to the same name)
panel_data = add_role_tenure(panel_data)

In [None]:
#Calculates the company tenure for a given director
def add_company_tenure(df):
    """Add a "company_tenure" column: time a director has spent at a company across roles.

    Rows are grouped by ("DirectorID", "CompanyID") and walked in date order:

    * the first row of a group keeps its "role_tenure" (when several rows share
      that first date, the largest role tenure is used),
    * a row with a new date gets the previous row's company tenure plus 1/12,
    * a row with the same date as the previous one (several roles held in the
      same month) gets the previous row's company tenure.

    Each distinct date adds exactly one month, whatever the gap between dates.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns "DirectorID", "CompanyID", "date" and a numeric
        "role_tenure".

    Returns
    -------
    pd.DataFrame
        The same object, modified in place: "company_tenure" added and rows
        sorted by "date", "DirectorID", "CompanyID".
    """

    df["company_tenure"] = df["role_tenure"]

    #Group ordering: all rows of one director at one company are adjacent, oldest date first,
    #and within a date the longest-running role comes first
    df.sort_values(by=['DirectorID', 'CompanyID', 'date', 'role_tenure'],
                   ascending=[True, True, True, False], inplace=True)

    #Plain arrays are used so that reads and writes are both positional; index labels
    #(which may be shuffled or duplicated) play no part
    directors = df["DirectorID"].to_numpy()
    companies = df["CompanyID"].to_numpy()
    dates = df["date"].to_numpy()
    tenure = df["company_tenure"].to_numpy(dtype=float, copy=True)

    for pos in range(1, len(tenure)):
        if directors[pos] != directors[pos - 1] or companies[pos] != companies[pos - 1]:
            #first row of a new director/company group keeps its role tenure
            continue

        if dates[pos] != dates[pos - 1]:
            tenure[pos] = tenure[pos - 1] + (1 / 12)
        else:
            tenure[pos] = tenure[pos - 1]

    df["company_tenure"] = tenure

    #Final ordering of the returned frame; the sort is stable, so rows sharing a key keep
    #the longest-role-first order established above
    df.sort_values(by=['date', 'DirectorID', 'CompanyID'], inplace=True)

    return df

#Updating
#(the frame returned by add_company_tenure is bound back to the same name)
panel_data = add_company_tenure(panel_data)

In [None]:
#Determines whether or not a director turned over in a given window
def add_turnover_indicator(df, window_before=12):
    """Flag the rows whose role ends within ``window_before`` months of the row's date.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a "Date" column (the row's date) and a "role_end_date" column.
    window_before : number, default 12
        Size of the look-ahead window in months.

    Returns
    -------
    pd.DataFrame
        The same object, modified in place: a boolean "turnover_next_12_mo"
        column is added (the column keeps this name whatever the window size).
        A row is True when the time from "Date" to "role_end_date", measured in
        average-length months, is below ``window_before``. Rows whose end date
        is missing or is not a datetime value are False.
    """

    #average Gregorian month: 365.2425 days / 12, expressed in seconds
    average_month = pd.Timedelta(seconds=2629746)

    def months_until(current_date, end_date):
        #None means "no usable end date", which never counts as a turnover
        if not isinstance(end_date, datetime) or pd.isnull(end_date) or pd.isnull(current_date):
            return None
        return (end_date - current_date) / average_month

    #the two columns are walked together by position, so index labels are never used for lookup
    turnover_flags = []
    for current_date, end_date in zip(df['Date'], df['role_end_date']):
        months_left = months_until(current_date, end_date)
        turnover_flags.append(months_left is not None and bool(months_left < window_before))

    df['turnover_next_12_mo'] = turnover_flags

    return df

In [None]:
#Only keeping CEOs - RUN THIS WHEN COMPLETELY DONE
#na=False: rows with a missing RoleName count as "not a CEO"; without it the mask would
#contain NaN and could not be used to filter the frame
tenure_data = tenure_data[tenure_data["RoleName"].str.contains("CEO", na=False)]

In [None]:
#Writing tenure_data to a CSV file in the current working directory
#(no index argument is given, so the row index is written as the first column)
tenure_data.to_csv("directorships_v3.csv")