In [None]:
# General imports

import gc, warnings, random, datetime, math, awscli

import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, RobustScaler

import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')
gc.enable()

### Saving and opening files

In [None]:
df_ceos = pd.read_pickle("./data/df_ceos.pkl")

In [None]:
df_ceos["place_time_hash"] = hash_pandas_object(df_ceos[["date","company_id"]], index=False)

In [None]:
# compression_opts = dict(method='gzip', archive_name='pre_processed_data.csv')
# df_ceos.to_csv('./data/csv_files/pre_processed_data.csv.gz', index=False, compression=compression_opts)
# df_ceos.to_pickle("./data/df_ceos.pkl")
# df_directors.to_pickle("./data/df_directors.pkl")
# df_share_prices.to_pickle("./data/df_share_prices.pkl")
# df_directors = pd.read_pickle("./data/df_directors.pkl")
# df_ceos.sort_values(['company_name', 'director_name'], ascending=[True,True], inplace = True)

### TO-DO
Add this code somewhere in the pipeline to run a cohort analysis.

In [None]:
# Run this to download the CSV files to your local ./data/csv_files directory
# Make sure you have configured your AWS Credential File
!aws s3 sync s3://sagemaker-us-east-1-936165954724/ml-turnover/ ./data/csv_files

In [3]:
# You can run this command to upload new CSV files to the remote directory
!aws s3 sync ./data/csv_files s3://sagemaker-us-east-1-936165954724/ml-turnover/ 

Completed 167.6 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 167.8 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 168.1 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 168.3 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 168.6 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 168.8 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 169.1 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 169.3 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 169.6 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 169.8 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 170.1 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 170.3 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 170.6 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 170.8 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 171.1 MiB/171.8 MiB (2.7 MiB/s) with 1 file(s) remainingCompleted 

### Step 1: Load files

In [None]:
### Load share price data for companies
def load_gzip_csv_file(name, encoding='utf8'):
    """
    Load a gzip-compressed CSV from the data/csv_files directory.
    :param name: file name without the ``.csv.gz`` extension
    :param encoding: text encoding handed to pandas (utf8 unless overridden)
    :return: dataframe
    """
    return pd.read_csv(
        f'./data/csv_files/{name}.csv.gz',
        compression='gzip',
        encoding=encoding,
    )

In [None]:
def convert_datetime(df):
    """
    Parse the ``datadate`` column (YYYYMMDD values) into pandas datetimes.

    The parsed column is written back onto the frame that was passed in,
    and that same frame is returned.
    """
    parsed = pd.to_datetime(df["datadate"], format="%Y%m%d")
    df["datadate"] = parsed
    return df

In [None]:
### Load csv file
def load_csv_file(name):
    """
    Load an uncompressed CSV from the data/csv_files directory.
    :param name: file name without the ``.csv`` extension
    :return: dataframe
    """
    return pd.read_csv(f'./data/csv_files/{name}.csv')

In [None]:
def calculate_returns(df, type, periods=(1, 3, 6, 12, 24, 36)):
    """
    Add trailing percentage-return columns computed from the ``price`` column.

    One column named ``<type>_rtn_<period>m`` is added per entry in ``periods``.
    When the frame has a ``cusip`` column the change is computed within each
    cusip; otherwise it is computed over the whole ``price`` column (this is the
    path taken by frames without a cusip, such as a single index series).
    Missing prices are forward filled before the change is taken.

    Rows are used in their existing order; nothing here sorts them, so the
    caller is expected to pass chronologically ordered rows.

    :param df: dataframe with a ``price`` column and, optionally, ``cusip``;
        it is modified in place
    :param type: label used as the column prefix, e.g. firm, sector, market
    :param periods: look-back lengths, in rows, to compute returns for
    :return: the same dataframe with the return columns added
    """
    # Pick the code path explicitly. The previous bare ``except`` fell back to the
    # ungrouped calculation on *any* error, silently mixing prices across firms.
    if 'cusip' in df.columns:
        # Forward fill inside each cusip, then take the change inside each cusip.
        filled = df.groupby('cusip')['price'].ffill()
        prices = filled.groupby(df['cusip'])
    else:
        prices = df['price'].ffill()

    for period in periods:
        key = f'{type}_rtn_{period}m'
        # fill_method=None: filling was done above; the built-in fill option of
        # pct_change is deprecated in recent pandas releases.
        df[key] = prices.pct_change(periods=period, fill_method=None)

    return df

In [None]:
def calculate_sector_returns(df, periods=[1, 3, 6, 12, 24, 36]):
    """
    Add sector-level return columns by averaging firm returns.

    For every period, the mean of ``firm_rtn_<period>m`` over all rows sharing
    the same ``sic`` and ``date`` is stored in ``sector_rtn_<period>m``.
    :param df: dataframe holding ``sic``, ``date`` and the firm return columns;
        it is modified in place
    :param periods: periods for which firm return columns exist
    :return: the same dataframe with the sector return columns added
    """
    for months in periods:
        firm_col = 'firm_rtn_' + str(months) + 'm'
        sector_col = f'sector_rtn_{months}m'
        by_sector_date = df.groupby(['sic', 'date'])
        df[sector_col] = by_sector_date[firm_col].transform('mean')

    return df

In [None]:
def load_director_age():
    """ Load the director age / nationality CSV.

    Reads ./data/csv_files/boardex_director_age_nationality.csv using the
    latin1 encoding. Takes no arguments.

    Returns:
        dataframe with the file contents
    """
    return pd.read_csv(
        './data/csv_files/boardex_director_age_nationality.csv',
        encoding='latin1',
    )

In [None]:
def partition_df(dict, n):
    """
    Recursively split a mapping into halves whose values carry roughly equal sums.

    Entries are popped from the end of ``dict`` (most recently inserted first)
    into a new mapping until the moved values add up to at least half of the
    total; both halves are then split again with ``n`` halved. The result is a
    nested tuple of dicts (a single dict when ``n <= 1``).

    Note: the mapping passed in is consumed in place — after the call it only
    holds the entries of its last partition.

    :param dict: mapping of key -> numeric weight (name kept for compatibility,
        it shadows the builtin inside this function)
    :param n: number of partitions requested; halved with integer division at
        each level, so powers of two give exactly ``n`` leaves
    :return: ``dict`` itself when ``n <= 1``, otherwise a 2-tuple of partitions
    """
    # ``n <= 1`` together with ``n // 2`` guarantees termination; the previous
    # ``n == 1`` / ``n / 2`` pair never hit the base case for n = 0, 3, 5, 6, ...
    if n <= 1:
        return dict

    half_total = sum(dict.values()) / 2
    moved_total = 0
    moved = {}

    # Stop early if the mapping runs dry so popitem() cannot raise KeyError.
    while moved_total < half_total and dict:
        key, weight = dict.popitem()
        moved[key] = weight
        moved_total += weight

    return (partition_df(moved, n // 2), partition_df(dict, n // 2))

# Split the row-label -> director_id mapping into nested partitions,
# then flatten one level of that nesting into a single list.
_dict = df_ceos.director_id.to_dict()
keyz = partition_df(_dict, 4)
flat_keyz = []
for sublist in keyz:
    flat_keyz.extend(sublist)

In [None]:
df_ceos['nationality'] = df_ceos['nationality'].fillna('NaN')

In [None]:
# Reference pattern this cell was modelled on:
# df['first_bid'] = df.assign(date = df['date'].\
# where(df['event'] == 'bid')).groupby('user_id')['date'].transform('min')
#
# Preview only: builds a copy of df_ceos with company_tenure_at_ceo_role_start set to
# company_roles_tenure on rows where start_date equals date and role_extension is False
# (NaN on every other row).
# NOTE(review): the frame returned by `assign` is not bound to a name, so df_ceos itself is
# unchanged by this cell. The commented-out draft lines suggest the value was meant to be
# stored as a column (possibly per director/company) — confirm the intent.
# df_ceos['company_tenure_at_ceo_role_start'] = df_ceos.assign(company_tenure_at_ceo_role_start = \
df_ceos.assign(company_tenure_at_ceo_role_start = df_ceos['company_roles_tenure'].\
    where(\
    (df_ceos['start_date'] == df_ceos['date'])\
    &(df_ceos['role_extension']==False)\
    ))
    # .groupby(['director_id', 'company_id'])['date'].transform('min')

In [None]:
df_ceos = df_ceos.dropna(subset=['firm_rtn_36m', 'previous_ceo_tenure'])

In [None]:
df_ceos[df_ceos.director_id == 181726][60:120][["date","role_id", "role_duration","role_tenure", "previous_ceo_tenure"]]

In [None]:
def check_nans(df, only_missing=False):
    """
    Print the NaN count and NaN percentage of the columns in ``df``.

    :param df: dataframe to inspect
    :param only_missing: when True, columns without any NaN are skipped. The
        default (False) reports every column, which is what the previous
        ``count >= 0`` check always did.
    :return: None; the report is written to stdout
    """
    nan_mask = df.isna()

    for col in df.columns:
        nan_count = nan_mask[col].sum()
        if only_missing and nan_count == 0:
            continue
        print(f'{col}')
        print(f'NAN row count = {nan_count}')
        # The mean of the boolean mask is the share of missing rows.
        print(f'% NAN count = {nan_mask[col].mean()*100:.2f}%\n')

check_nans(df_ceos)

In [None]:
# Load market index prices 
df_index_prices = load_gzip_csv_file('index_prices')
df_index_prices = convert_datetime(df_index_prices)

In [None]:
# Load pre-processed data through the same gzip CSV reader used for the other inputs
df_pre_processed = load_gzip_csv_file('pre_processed_data')

In [None]:
# Load share price director link
df_lookup_table = load_csv_file('boardex_capiq_link_file')

In [None]:
# Load remuneration data
df_remuneration = load_gzip_csv_file('boardex_director_remuneration', encoding='latin-1')

In [None]:
# Load director data
df_directors = load_gzip_csv_file('boardex_director_profiles', encoding='latin-1')

In [None]:
# Load age data
df_ages = load_director_age()

### Step 2a: Clean up the share price dataframe

In [None]:
# Load share prices data
df_share_prices = load_gzip_csv_file('share_prices')
df_share_prices = convert_datetime(df_share_prices)

In [None]:
# Drop columns with more than 10% missing values
df_share_prices = df_share_prices.loc[:, df_share_prices.isnull().mean() < .1]

In [None]:
# Rename some column names to identifiable names
df_share_prices = df_share_prices.rename(columns={"prccm":"raw_price", "ajexm": "adjustment_factor", "conm": "company", "datadate": "date"})

In [None]:
# Drop if the row is missing a SIC (industry) code
df_share_prices = df_share_prices.dropna(subset=['sic', 'cusip'])

In [None]:
# Drop all companies with an adjustment factor of 0 (can't divide by 0)
print(f"Number of rows before adjustment: {len(df_share_prices)}")

print(f"Processing...")

moonshot_prices = df_share_prices[df_share_prices["adjustment_factor"] == 0]["gvkey"].unique()
print(f"Number of companies with desparate reverse splits: {len(moonshot_prices)}")

df_share_prices = df_share_prices[~df_share_prices.gvkey.isin(moonshot_prices)]

print(f"Number of rows after adjustment: {len(df_share_prices)}")

In [None]:
# Convert raw price into adjusted price
df_share_prices["price"] = df_share_prices["raw_price"]/df_share_prices["adjustment_factor"] 

In [None]:
# Calculate firm returns
df_share_prices = calculate_returns(df_share_prices, 'firm')

In [None]:
# Merge in the GVKEY to the directors dataframe 
# df_lookup_table = df_lookup_table.rename(columns={'companyid':'CompanyID'})
# df_directors = df_directors.merge(df_lookup_table, on='CompanyID', how='left')

# Find out which GVKEY in the performance dataset appears in the directors dataset
gvkey_list = df_share_prices.gvkey.isin(df_ceos["gvkey"])

# Select only the gvkeys in the performance dataset that also occur in the director dataframe
df_share_prices = df_share_prices[gvkey_list]

In [None]:
# Drop penny stocks with a share price less than USD 0.01 or monthly returns in excess of 10x 
penny_stock_gvkeys = df_share_prices[(df_share_prices["raw_price"] < 0.01) | (df_share_prices["firm_rtn_1m"]>10)]["gvkey"].unique()
df_share_prices = df_share_prices[~df_share_prices.gvkey.isin(penny_stock_gvkeys)]

In [None]:
# Drop more unused columns containing no information
df_share_prices = df_share_prices.drop(['iid', "cusip"], axis=1)

In [None]:
# Drop rows with nan in the price column
print(f"Number of rows before adjustment: {len(df_share_prices)}")
print(f"Processing...")
df_share_prices.dropna(subset=["price"], inplace=True)
print(f"Number of rows after dropping NaNs: {len(df_share_prices)}")

In [None]:
# Calculate sector returns
df_share_prices = calculate_sector_returns(df_share_prices)

In [None]:
# Save only the necessary columns
df_share_prices.drop(['ajpm','company','state', 'raw_price', 'adjustment_factor', 'trfm', 'city', 'naics', 'price'], axis=1, errors='raise', inplace=True)

### Step 2b: Clean up the market index file

In [None]:
# Select the Russell 3000 (broad market index)
df_index = df_index_prices[df_index_prices["conm"] == "Russell 3000"]

In [None]:
# Keep only the date and price columns of the index and rename them to "date" and "price"
df_index = df_index[["datadate", "prccm"]].rename(columns={"prccm":"price", "datadate":"date"})

In [None]:
# Calculate index returns
df_index = calculate_returns(df_index, 'index')

In [None]:
# Merge in index returns
df_share_prices = df_share_prices.merge(df_index, on="date")

### Step 2c: Clean up the remuneration data file

In [None]:
#Inspect the remuneration file
df_remuneration[:10]

In [None]:
# Convert date to pandas datetime format, remove the AnnualReportDate column
df_remuneration["date"] = pd.to_datetime(df_remuneration["AnnualReportDate"], format="%Y%m%d")
del df_remuneration["AnnualReportDate"]

In [None]:
# Map for renaming columns
remuneration_cols_map = {
    'BoardName':'company_name',
    'DirectorName':'director_name',
    'RoleName':'role_name',
    'Currency':'currency',
    'BoardID':'company_id',
    'DirectorID':'director_id',
    'Salary':'salary',
    'Bonus':'bonus', 
    'Other':'other', 
    'PenEmpCon':'pension',
    'TotalCompensation':'tot_comp', 
    'ValTotEqHeld':'equity_held',
    'TotRemPeriod':'tot_remuneration',
    'TotalDirectComp':'tot_direct_comp'
}
df_remuneration = df_remuneration.rename(columns = remuneration_cols_map)

In [None]:
# Select only rows in df_remuneration dataframe where company_id occurs in df_ceos dataframe 
df_remuneration_filter = df_remuneration['company_id'].isin(df_ceos["company_id"])
df_remuneration = df_remuneration[df_remuneration_filter]

In [None]:
# Select only rows in df_remuneration dataframe where director_id occurs in df_ceos dataframe 
df_remuneration_filter = df_remuneration['director_id'].isin(df_ceos["director_id"])
df_remuneration = df_remuneration[df_remuneration_filter]

In [None]:
# Keep rows that report at least one of tot_comp or salary
# (i.e. drop rows where both values are NaN)
# df_remuneration = df_remuneration.dropna(subset=['DirectorName', 'DirectorID'])
df_remuneration = df_remuneration[\
    ~(np.isnan(df_remuneration["tot_comp"]))|\
    ~(np.isnan(df_remuneration["salary"]))
    ]

In [None]:
# Create a merge key that is the report year plus one, so that remuneration reported in
# year t lines up with rows whose "year" is t+1 (e.g. a 2017 report is keyed as 2018)
df_remuneration["year"] = df_remuneration.date.dt.year + 1

In [None]:
# Create a 'year' variable for merging the dataframes
df_ceos["year"] = df_ceos.date.dt.year
# _df_remuneration = df_remuneration

In [None]:
# Make the dataframe smaller to include only the columns we need
df_remuneration = df_remuneration[["director_id", "company_id", "salary", "tot_remuneration", "bonus", "year"]]

In [None]:
# Merge the remuneration dataframe into the ceo dataframe.
# DataFrame.merge returns a new frame, so the result must be bound back to df_ceos;
# the left join keeps CEO rows that have no matching remuneration record.
# NOTE(review): if a director/company/year has several remuneration rows this duplicates
# the matching CEO rows — confirm whether df_remuneration should be de-duplicated first.
df_ceos = df_ceos.merge(df_remuneration, on=["director_id", "company_id", "year"], how="left")

### Step 3: Clean up director data

In [None]:
# Drop the unidentfiable names
df_directors = df_directors.iloc[140:]

In [None]:
# Merge in the nationality and age data
df_directors = df_directors.merge(df_ages, on="DirectorID")

In [None]:
# Drop the extra column from the dataframe
df_directors = df_directors.drop(['DirectorName_y'], axis=1)

In [None]:
# Rename some columns
df_directors = df_directors.rename(columns={
    'DirectorName_x':'director_name',\
    'CompanyName':'company_name',\
    'BrdPosition':'board_position',\
    'RoleName':'role_name',\
    'NED':'non_exec',\
    'DirectorID':'director_id',\
    'CompanyID':'company_id',\
    'DateStartRole':'start_date',\
    'DateEndRole':'end_date',\
    'HOCountryName':'country',\
    'Sector':'sector',\
    'OrgType':'org_type',
    'DOB':'date_of_birth',\
    'Gender':'gender',\
    'Nationality':'nationality',\
    'NetworkSize':'network_size'
})

In [None]:
def date_conversion(df):
    """
    Parse the free-text ``date_of_birth`` column into pandas datetimes.

    Values equal to 'NaN' or 'n.a.' are treated as missing and those rows are
    dropped. The remaining values may hold an optional two-digit day, an
    optional three-letter month and a four-digit year; a missing day becomes
    "01" and a missing month becomes "Jan". Values the pattern cannot match end
    up as NaT.

    :param df: dataframe with a ``date_of_birth`` column of strings
    :return: a new dataframe without the missing rows and with
        ``date_of_birth`` converted; the input frame is left untouched
    """
    print(f"Number of rows before processing: {df.shape[0]}")
    raw = df["date_of_birth"]
    cleaned = raw.mask(raw.isin(["NaN", "n.a."]))
    # assign() builds a new frame, so the caller's data is never written to.
    df = df.assign(date_of_birth=cleaned).dropna(subset=["date_of_birth"])

    regex = r"(?P<DAY>\d{2})?[/\s-]?(?P<MONTH>[a-zA-Z]{3})?[/\s-]?(?P<YEAR>\d{4})$"
    parts = df["date_of_birth"].str.extract(regex)

    # Plain fillna results instead of ``df[col].fillna(..., inplace=True)``: filling a
    # column selection in place is not guaranteed to write back to the frame (and never
    # does under copy-on-write), which would leave year-only dates unparsed.
    day = parts["DAY"].fillna("01")
    month = parts["MONTH"].fillna("Jan")

    parsed = pd.to_datetime(parts["YEAR"] + "-" + month + "-" + day, format="%Y-%b-%d")
    df = df.assign(date_of_birth=parsed)
    print(f"Number of rows after dropping NAs: {df.shape[0]}")

    return df

df_directors = date_conversion(df_directors)

In [None]:
# Drop the entries that are missing StartDate or EndDate
df_directors.drop(df_directors[df_directors["start_date"] == "N"].index, inplace=True)
df_directors.drop(df_directors[df_directors["end_date"] == "N"].index, inplace=True)

In [None]:
# Create a Role ID to be used later
df_directors['role_id'] = df_directors.index.values.tolist()

In [None]:
# Create a column to indicate directors who remain in their roles
df_directors["current_role"] = False
df_directors.loc[df_directors["end_date"] == "C","current_role"] = True

In [None]:
# Convert "C" (current role) end dates to the maximum end date in the dataset.
# The maximum is taken over the non-"C" values explicitly rather than assuming that "C"
# is present and sorts last among the unique values.
is_current = df_directors["end_date"] == "C"
MaxDate = df_directors.loc[~is_current, "end_date"].max()
df_directors.loc[is_current, "end_date"] = MaxDate

In [None]:
# Convert start and end date to datetime format.
# pd.to_datetime is used because casting to the unit-less np.datetime64 dtype is
# rejected by recent pandas versions.
df_directors["start_date"] = pd.to_datetime(df_directors["start_date"])
df_directors["end_date"] = pd.to_datetime(df_directors["end_date"])

In [None]:
# Calculate the role duration. Note that we cannot use this directly, because that is what we're trying to predict.
# But this is a known and usable value once a role has ended. If there is a trend, it may continue.
df_directors['role_duration'] = df_directors['end_date']-df_directors['start_date']

In [None]:
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600#Feature-Engineering

In [None]:
# YEAR/MONTH/DAY are helper columns of the date-of-birth parsing, which already removes
# them; ignore missing labels so re-running this clean-up cannot raise KeyError.
df_directors = df_directors.drop(["YEAR", "MONTH", "DAY"], axis=1, errors="ignore")

In [None]:
# Drop the duplicate roles (same director_id, role name, same dates, same company), keep the first occurence
df_directors.drop_duplicates(subset=["director_id", "role_name", "company_id", "start_date", "end_date"], keep="first", inplace=True)

In [None]:
def melt_rows(df):
    """
    Reshape the role table to one row per role and date.

    Every column that is not one of the identifier columns below (expected to be
    start_date and end_date) is unpivoted into a ``date_type`` / ``date`` pair.
    Rows that share both ``role_id`` and ``date`` are all removed, which discards
    roles whose start and end fall on the same date.
    """
    identifier_cols = [
        'director_name', 'company_name', 'board_position', 'role_name',
        'non_exec', 'director_id', 'company_id', 'country', 'sector',
        'org_type', 'ISIN', 'date_of_birth', 'gender',
        'nationality', 'network_size', 'role_id', 'current_role',
        'role_duration',
    ]

    long_df = pd.melt(df, id_vars=identifier_cols, var_name='date_type', value_name='date')

    return long_df.drop_duplicates(subset=['role_id', 'date'], keep=False)

In [None]:
df_directors = melt_rows(df_directors)

In [None]:
# Shift the role duration so that it is only at the end of the role that we start counting completed roles
df_directors['past_role_duration'] = df_directors.groupby(['role_id'])['role_duration'].shift(1)
df_directors.loc[pd.isnull(df_directors['past_role_duration']), 'past_role_duration'] = np.timedelta64(0, "D")

In [None]:
#  Sum previous role duration to derive career length and company tenure
#  Sort end_date < start_date to ensure we capture tenure for subsequent roles where start_date matches previous role end_date
df_directors.sort_values(['director_id', 'date', 'date_type'], ascending=[True,True,True], inplace = True)
df_directors['past_roles_tenure'] = df_directors.groupby('director_id')['past_role_duration'].transform(pd.Series.cumsum)
df_directors['company_roles_tenure'] = df_directors.groupby(['director_id', 'company_id'])['past_role_duration'].transform(pd.Series.cumsum)

In [None]:
# Drop the roles that last <1 day
df_directors = df_directors.drop_duplicates(subset=['role_id', 'date'], keep=False)

In [None]:
# We create a PastRolesIncrementer as well as an ActiveRolesIncrementer to be used to tally the sum of past and active roles
df_directors['past_roles_incrementer'] = [0 if DateType == 'start_date' else 1 for DateType in df_directors['date_type']]
df_directors['active_roles_incrementer'] = [1 if val == 'start_date' else -1 for val in df_directors['date_type']]

In [None]:
# Calculate the number of previous roles career-wide
df_directors.sort_values(['director_id', 'date', 'date_type'], ascending=[True,True,True], inplace = True)
df_directors['past_roles_count'] = df_directors.groupby(['director_id'])['past_roles_incrementer'].transform(pd.Series.cumsum)

In [None]:
# Calculate the number of previous roles at the company
# df_directors.sort_values(['director_id', 'date', 'date_type'], ascending=[True,True,True], inplace = True)
df_directors['company_roles_count'] = df_directors.groupby(['director_id', 'company_id'])['past_roles_incrementer'].transform(pd.Series.cumsum)

In [None]:
# Calculate the average tenure of past roles
df_directors["past_roles_tenure_avg"] = df_directors["past_roles_tenure"]/df_directors["past_roles_count"]

In [None]:
# Calculate the average tenure of past roles at the company
df_directors["company_roles_tenure_avg"] = df_directors["company_roles_tenure"]/df_directors["company_roles_count"]

In [None]:
# Join roles where the same director, keeps the same role, same role_name, at the same company
df_directors.drop_duplicates(subset=["director_id", "role_name", "company_id", "date"], keep=False, inplace=True)

In [None]:
# In each month, take the sum of all active roles increments to calculate net role changes
df_directors['active_roles_incrementer'] = df_directors.groupby(['director_id', 'date'])['active_roles_incrementer'].transform('sum')

In [None]:
# We are going to identify some duplicate months for each director (without dropping them)
df_directors['duplicata'] = df_directors.duplicated(subset=['director_id', 'date'], keep='first')

In [None]:
# We convert the duplicate tag into a value of 0 if it's true, so that we can calculate active roles
df_directors['duplicata'] = [0 if val else 1 for val in df_directors['duplicata']]


In [None]:
# Calculate active_roles_incrementer without duplicates
df_directors['active_roles_inc_no_dup'] = df_directors['active_roles_incrementer']*df_directors['duplicata'] 
df_directors['active_roles_count'] = df_directors.groupby('director_id')['active_roles_inc_no_dup'].transform('cumsum')

In [None]:
# Calculate the maximum number of active roles on a rolling basis
df_directors['active_roles_count_max'] = df_directors.groupby('director_id')['active_roles_count'].cummax()

### Step 4: Create a CEO-only dataset with expanded rows

In [None]:
# Add an indicator variable for CEOs, Chairmen and CEOs who also hold the title of Chair.
# na=False maps a missing role_name to False, so the flags stay strictly boolean and can
# be used directly as row masks.
df_directors["ceo"] = df_directors["role_name"].str.contains('ceo', case=False, regex=False, na=False)
df_directors["chair"] = df_directors["role_name"].str.contains('chairman', case=False, regex=False, na=False)
df_directors["chair_ceo"] = (df_directors["chair"]) & (df_directors["ceo"])

In [None]:
# Create a dataframe filtered to CEOs
df_ceos = df_directors[df_directors["ceo"]]

In [None]:
# Drop Regional, Division, Acting, Interim and Deputy CEOs from dataframe.
# Raw string: "\s" is a regex token, not a valid Python string escape.
df_ceos = df_ceos[~df_ceos["role_name"].str.contains(r'Division|Interim|Acting|Regional|Deputy\sCEO', regex=True)]

In [None]:
# Calculate the tenure of the previous CEO
# Order rows by company, then date, then date_type; the shifts below depend on this order.
df_ceos.sort_values(['company_id', 'date', 'date_type'], ascending=[True,True,True], inplace = True)
# Within each role, take role_duration from the row above: the first row of a role gets NaN,
# any later row of that role receives the preceding row's role_duration.
df_ceos["previous_role_duration"] = df_ceos.groupby(['role_id'])['role_duration'].shift(1)
# Carry the most recent non-missing duration forward within each company ...
df_ceos['previous_ceo_tenure'] = df_ceos.groupby(['company_id'])['previous_role_duration'].ffill()
# ... and move it down one row within the company, so a row only sees values from rows above it.
df_ceos['previous_ceo_tenure'] = df_ceos.groupby(['company_id'])['previous_ceo_tenure'].shift(1)

In [None]:
# Identify role extensions, i.e. where a CEO continues in his/her role
df_ceos["role_extension"] = df_ceos.duplicated(subset=["director_id", "company_id", "date"], keep=False)

In [None]:
# Drop rows where our estimated date is wrong (role_duration is a negative number).
# The comparison uses a zero pd.Timedelta: year-unit np.timedelta64 values have no fixed
# length and recent pandas versions refuse to convert them.
drop_this = (df_ceos["role_duration"] < pd.Timedelta(0))
df_ceos = df_ceos[~drop_this]

In [None]:
# Merge in the gvkey to the directors dataframe 
df_lookup_table = df_lookup_table.rename(columns={'companyid':'company_id'})
df_ceos = df_ceos.merge(df_lookup_table, on='company_id', how='left')
df_ceos = df_ceos.rename(columns={'GVKEY':'gvkey'})

In [None]:
# Find out which gvkey in the directors dataset appears in the performance dataset
# Select only rows in df_ceos dataframe that co-occur in share prices dataframe (i.e. publicly-traded companies)
gvkey_list = df_ceos.gvkey.isin(df_share_prices["gvkey"])
df_ceos = df_ceos[gvkey_list]

In [None]:
# Optional step: downsample some of the people appearing in the dataset very often
# (directors with more than 50 rows are reduced to a random 20% of their rows).
# NOTE(review): the result is not assigned, so df_ceos is left unchanged and this cell only
# displays the sampled frame. The sample is also unseeded, so it differs between runs —
# confirm whether it should be persisted and seeded.
df_ceos.groupby('director_id').apply(lambda x: x.sample(frac=0.2) if (len(x)>50) else x).reset_index(drop=True)

In [None]:
# Expand the rows of the dataset so that one row corresponds to one month
from multiprocessing import Pool

def parallelize_dataframe(df, func, n_cores=8):
    """
    Apply ``func`` to consecutive row chunks of ``df`` in worker processes.

    :param df: dataframe to split into ``n_cores`` row chunks
    :param func: callable taking a dataframe chunk and returning a dataframe;
        it is sent to the worker processes, so it has to be picklable
    :param n_cores: number of chunks and of worker processes
    :return: concatenation of the per-chunk results, in chunk order
    """
    # Split positional row numbers rather than the frame itself: np.array_split on a
    # DataFrame relies on DataFrame.swapaxes, which recent pandas versions deprecate.
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]
    # The context manager shuts the pool down even when ``func`` raises in a worker.
    with Pool(n_cores) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)
    
def expand_rows(df):
    """
    Expand every role to one row per month end.

    For each ``role_id`` the rows are indexed by ``date``, re-sampled to
    month-end frequency and forward filled, so each month between the first and
    last date of the role repeats the latest row at or before that month end.

    :param df: dataframe with ``role_id`` and a datetime ``date`` column; dates
        must be unique within a role
    :return: new dataframe with ``date`` as a regular column and a fresh index
    """
    # An offset object instead of the 'M' alias, which newer pandas versions deprecate.
    month_end = pd.offsets.MonthEnd()
    # Iterating the groups (instead of groupby.apply) keeps role_id as a column on
    # every pandas version, and ffill() replaces the removed Resampler.pad().
    monthly = [
        role_rows.set_index('date').resample(month_end).ffill()
        for _, role_rows in df.groupby('role_id')
    ]
    if not monthly:
        # Nothing to expand (e.g. an empty chunk handed to a worker).
        return df.copy()
    df = pd.concat(monthly)
    df.reset_index(inplace=True)
    return df

df_ceos = parallelize_dataframe(df_ceos, expand_rows)

In [None]:
# Add back end_date and start_date columns 
df_ceos["end_date"] = df_ceos.groupby('role_id')['date'].transform('max')
df_ceos["start_date"] = df_ceos.groupby('role_id')['date'].transform('min')

In [None]:
# Add role_tenure
df_ceos["role_tenure"] = df_ceos["date"] - df_ceos["start_date"]

In [None]:
# Create a response variable set to TRUE if a turnover event takes place within the next 12 months.
# One year is written as an explicit pd.Timedelta (365.2425 days, the mean Gregorian year):
# year-unit np.timedelta64 values have no fixed length and recent pandas versions refuse
# to convert them.
df_ceos["time_left"] = df_ceos["end_date"] - df_ceos["date"]
df_ceos["turnover"] = (df_ceos['time_left'] < pd.Timedelta(days=365.2425)) & (df_ceos['role_extension']==False)

In [None]:
# Drop roles from the dataframe that started before 1990 (more than 30 years ago)
expired_data = (df_ceos["start_date"].dt.year < 1990)
df_ceos = df_ceos[~expired_data]

In [None]:
# Drop rows where there is a turnover event but it's a current role
df_ceos = df_ceos[~((df_ceos.turnover==True) & (df_ceos.current_role==True))]

In [None]:
# Select only necessary columns.
# gvkey is kept because the share-price merge in Step 5 joins on ['date', 'gvkey'].
df_ceos = df_ceos[[
    'date', 'director_name',
    'company_name', 'role_name', 'director_id',
    'company_id', 'gvkey', 'date_of_birth', 'gender',
    'nationality', 'network_size', 'role_id',
    'current_role', 'role_duration', 'past_roles_count',
    'company_roles_count', 'past_roles_tenure_avg',
    'company_roles_tenure_avg', 'ceo', 'chair',
    'chair_ceo', 'past_role_duration', 'past_roles_tenure',
    'company_roles_tenure', 'active_roles_count',
    'active_roles_count_max', 'previous_ceo_tenure',
    'role_extension', 'end_date', 'start_date', 'role_tenure', 'turnover',
]]

In [None]:
# Add CEO age in years.
# The divisor is an explicit pd.Timedelta of 365.2425 days (mean Gregorian year) because
# recent pandas versions reject year-unit np.timedelta64 values.
df_ceos['age'] = (df_ceos['date'] - df_ceos['date_of_birth']) / pd.Timedelta(days=365.2425)

### Step 5: Merge the company and share price datasets

In [None]:
# Merge in performance dataframe into the ceo dataframe
from multiprocessing import Pool

def parallel_merge(df, func, n_cores=8):
    """
    Run ``func`` on row chunks of ``df`` in worker processes and concatenate the results.

    :param df: dataframe to split into ``n_cores`` chunks
    :param func: callable applied to each chunk; must return a dataframe
    :param n_cores: number of chunks and of worker processes
    :return: concatenation of the per-chunk results
    """
    # Identical split / map / concat routine to parallelize_dataframe (Step 4);
    # delegate to it instead of maintaining a second copy.
    return parallelize_dataframe(df, func, n_cores=n_cores)
    
def merge_dataframes(df):
    """
    Join the share-price columns onto a chunk of the CEO dataframe.

    Rows are matched on ``date`` and ``gvkey``. ``df_share_prices`` is read from
    the notebook's global scope; ``validate='many_to_one'`` makes pandas raise
    if it holds more than one row for a (date, gvkey) pair.

    :param df: chunk of the CEO dataframe; needs ``date`` and ``gvkey`` columns
    :return: the merged dataframe
    """
    # DataFrame.merge returns a new frame; returning ``df`` itself would discard the join.
    return df.merge(df_share_prices, on=['date', 'gvkey'], validate='many_to_one')

df_ceos = parallel_merge(df_ceos, merge_dataframes)

### Step 6: Data preparation: dealing with NAs, encoding and normalization

In [None]:
# Get names of columns with missing values
def identify_missing(df):
    """Return the names of the columns of ``df`` that hold at least one null value."""
    missing_cols = []
    for col in df.columns:
        if df[col].isnull().any():
            missing_cols.append(col)
    return missing_cols

cols_with_missing = identify_missing(df_ceos)

In [22]:
# Drop the first 36 months of the dataset
df_ceos = df_ceos[df_ceos["date"] > "01-01-1993"]

In [26]:
# Indicate rows that have columns with missing data
for col in cols_with_missing:
    df_ceos[col + '_was_missing'] = df_ceos[col].isnull()

In [27]:
# Convert dataset to CSV file
df_ceos.to_csv("./data/csv_files/ceos.csv.gz")