In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

In [2]:
#Reading the two raw CSV inputs into DataFrames
age_data, tenure_data = (
    pd.read_csv(csv_name) for csv_name in ("age_data.csv", "tenure_data.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


First we're going to clean the age data

In [3]:
#Creating NaN consistency: every missing value is represented by the string "NaN"
age_data.fillna("NaN", inplace=True)
#Assign the result back instead of calling replace(inplace=True) on the column
#selection: with pandas Copy-on-Write the chained in-place call never reaches age_data
age_data["DOB"] = age_data["DOB"].replace("n.a.", "NaN")

In [4]:
#Creating a function that will get the "DOB" column in the format we want (YYYYMMDD).
#"DOB" column has 5 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

def correct_dob(date):
    """Normalise one raw DOB entry to a "YYYYMMDD" string.

    Parameters
    ----------
    date : str or int
        One of the five input shapes listed above. Integer years may be plain
        Python ints or NumPy integers.

    Returns
    -------
    str
        The date as "YYYYMMDD" (a missing month/day defaults to 01), the
        string "NaN" unchanged for missing values, or "ERROR" when the entry
        does not have one, two or three whitespace-separated parts.
    """
    #Integer years are checked first so they are never compared with a string
    #or asked for .split(); np.integer covers values coming from NumPy arrays
    if isinstance(date, (int, np.integer)):
        return pd.to_datetime(str(int(date)), format="%Y").strftime("%Y%m%d")
    if date == "NaN":
        return date

    #split() without an argument tolerates repeated/leading/trailing blanks
    parts = date.split()
    formats_by_length = {1: "%Y", 2: "%b %Y", 3: "%d %b %Y"}
    date_format = formats_by_length.get(len(parts))
    if date_format is None:
        return "ERROR"
    return pd.to_datetime(" ".join(parts), format=date_format).strftime("%Y%m%d")

dob_vector = np.vectorize(correct_dob)

In [5]:
#Updating the DOB column: run the vectorized cleaner over every entry,
#then write the cleaned values back over the raw ones
cleaned_dob = dob_vector(age_data["DOB"])
age_data["DOB"] = cleaned_dob

In [6]:
#Age NANs are being left to be dealt with later

Now we're cleaning the tenure data

In [7]:
#Dropping the entries that have an "N" in either the StartDate or the EndDate.
#drop() works on index labels, so the labels of the flagged rows are taken from
#the index itself rather than from np.where positions (which only coincide
#with the labels on a default 0..n-1 index)
has_n = (tenure_data["DateEndRole"] == "N") | (tenure_data["DateStartRole"] == "N")
n_list = list(tenure_data.index[has_n])
tenure_data.drop(n_list, inplace=True)

In [8]:
#Helper function to replace the "C" entry with the -1 sentinel
def update_end(date):
    """Return -1 when ``date`` is the marker "C", otherwise return ``date`` unchanged."""
    if date == "C":
        return -1
    else:
        return date

#otypes=[object] keeps each result exactly as returned. Without it np.vectorize
#picks the output dtype from the first element, so the -1 sentinel could be
#turned into the string "-1" (or date strings into ints) depending on row order
end_vector = np.vectorize(update_end, otypes=[object])

In [9]:
#Replacing the "C" entry with the -1 sentinel and converting start and end dates to datetime format
tenure_data["DateEndRole"] = end_vector(tenure_data["DateEndRole"])
#The sentinel is not a %Y%m%d date, so it is blanked out before parsing; rows that
#carried it end up with NaT in date_end_role. The comparison is done on text because
#end_vector may hand the sentinel back as -1 or as "-1"
end_role_text = tenure_data["DateEndRole"].astype(str)
tenure_data["date_end_role"] = pd.to_datetime(end_role_text.where(end_role_text != "-1"), format="%Y%m%d")
tenure_data["date_start_role"] = pd.to_datetime(tenure_data["DateStartRole"], format="%Y%m%d")

In [10]:
#Only keeping rows whose RoleName contains "CEO"
is_ceo_role = tenure_data["RoleName"].str.contains("CEO")
tenure_data = tenure_data.loc[is_ceo_role]

Now we can join the two datasets

In [11]:
#Executing the inner join; the keys are whichever column names the two frames share
joined_data = tenure_data.merge(age_data, how="inner")

Defining functions that will help us create the panel dataset

In [48]:
def years_column(date_range):
    """Return the four-digit year of every entry in ``date_range`` as a list of strings."""
    return date_range.strftime("%Y").tolist()

In [49]:
def months_column(date_range):
    """Return the zero-padded month number of every entry in ``date_range`` as a list of strings."""
    return date_range.strftime("%m").tolist()

In [86]:
def id_column(_id_, n):
    """Return a list that holds ``_id_`` repeated ``n`` times."""
    return [_id_ for _ in range(n)]

In [19]:
#Creating a function that will give us "age" as a fractional number of years
def age_calculator(DOB, date):
    """Return the age at ``date`` in years, counted in whole calendar months.

    Both arguments are truncated to month precision, so the result is
    (number of months between them) / 12, e.g. 20.5 for 20 years 6 months.
    """
    months = np.datetime64(date, "M") - np.datetime64(DOB, "M")
    age = months / np.timedelta64(12,'M')
    return age

#Vectorizing. The builtin float is used because the np.float alias was removed
#from NumPy and raises AttributeError on current releases
age_vector = np.vectorize(age_calculator, otypes=[float])

In [141]:
def age_column(DOB, date_range):
    """Return one age per entry of ``date_range`` for a person born on ``DOB``.

    When ``DOB`` is the string "NaN" the result is a list of "NaN" strings of
    the same length; otherwise the ages come from ``age_vector``.
    """
    #handling "NaN" value
    if DOB == "NaN":
        return ["NaN"] * len(date_range)

    #Reduce the birth date to a "YYYY-MM" string, the same shape as the reference months
    birth_period = pd.period_range(start=DOB, periods=1 , freq='M')
    dob = birth_period.strftime("%Y-%m").tolist()[0]
    reference_dates = date_range.strftime("%Y-%m").tolist()
    return list(age_vector(dob, reference_dates))

In [189]:
def make_panel(idx, row):
    """Expand one role row into one row per month of that role.

    Parameters
    ----------
    idx : hashable
        Index label of ``row``; accepted for the caller's convenience, not used.
    row : pandas.Series
        Must provide "date_start_role", "date_end_role", "DirectorID" and "DOB".

    Returns
    -------
    pandas.DataFrame
        The values of ``row`` repeated for every month between the start and
        end of the role, with added "date", "age" and "director_id" columns.
    """
    #original df
    original_df = pd.DataFrame(row).transpose()

    #Declaring key variables
    start = row["date_start_role"]
    end_raw = row["date_end_role"]
    #A role with the -1 sentinel or with no end date at all is treated as
    #still running, so its panel extends to today
    if pd.isna(end_raw) or end_raw == -1:
        end = pd.to_datetime('today')
    else:
        end = end_raw
    date_range = pd.period_range(start=start, end=end, freq='M')
    _id_ = row["DirectorID"]
    dob = row["DOB"]

    #Creating the columns
    ages = age_column(dob, date_range)
    ids = id_column(_id_, len(date_range))

    #Creating the df to be appended
    new_df = pd.DataFrame([date_range, ages, ids]).transpose()
    new_df.columns = ["date", "age", "director_id"]

    #Executing the join. Shared column names are used as keys when there are any;
    #otherwise an implicit merge would raise, so the director id is the explicit key
    shared_columns = [col for col in new_df.columns if col in original_df.columns]
    if shared_columns:
        joined_df = pd.merge(left=original_df, right=new_df)
    else:
        joined_df = pd.merge(left=original_df, right=new_df,
                             left_on="DirectorID", right_on="director_id")

    return joined_df

Now I need to find a way to efficiently apply the "make_panel" function to every row in the df

In [190]:
#19 minutes to execute
#One panel frame per row of joined_data, stacked into a single frame at the end
dfs = [
    make_panel(idx, row)
    for idx, row in tqdm(joined_data.iterrows(), total=joined_data.shape[0])
]

panel_data = pd.concat(dfs)

In [3]:
#Sorting values 
# panel_data = panel_data.sort_values(["CompanyName", "director_id", "date"])

NameError: name 'master' is not defined

In [None]:
def _month_ordinal(value):
    '''Return the month number of ``value`` as a pandas monthly Period ordinal (NaN if missing).'''
    if isinstance(value, pd.Period):
        return value.asfreq("M").ordinal
    if pd.isna(value):
        return np.nan
    return pd.Period(value, freq="M").ordinal


def add_role_tenure(df):

    '''
    Calculate role_tenure by simply taking the current date minus the date_start_role

    Both dates are reduced to month precision and the difference is expressed
    in years (months / 12). The value is computed row by row; the column
    "role_tenure" is added to ``df`` in place and ``df`` is returned.
    '''

    #One difference per row, then a single column assignment. Assigning inside
    #the loop would overwrite the whole column on every iteration
    months_in_role = [
        _month_ordinal(current_date) - _month_ordinal(start_date)
        for start_date, current_date in zip(df["date_start_role"], df["date"])
    ]

    df["role_tenure"] = np.asarray(months_in_role, dtype=float) / 12

    return df

In [None]:
def add_company_tenure(df):
    '''
    Calculate company_tenure by walking the rows in their current order.

    The counter restarts at 0 whenever "director_id" or "company_id" differs
    from the previous row, goes up by 1 when only "date" differs, and is
    carried over unchanged when the date repeats. With one row per month this
    is the tenure in months. The first row always starts at 0.

    The column "company_tenure" is added to ``df`` in place and ``df`` is
    returned. Rows are compared by position, so ``df`` should already be
    sorted by director, company and date.
    '''

    #Work on positionally indexed copies so that a non-unique or non-default
    #index on df cannot misalign the row-to-previous-row comparisons
    director = df["director_id"].reset_index(drop=True)
    company = df["company_id"].reset_index(drop=True)
    date = df["date"].reset_index(drop=True)

    #A new spell starts where the director or the company changes
    new_spell = director.ne(director.shift()) | company.ne(company.shift())
    #Inside a spell the counter advances only where the date changes
    date_advanced = date.ne(date.shift()) & ~new_spell

    tenure = date_advanced.astype(int).groupby(new_spell.cumsum()).cumsum()
    df["company_tenure"] = tenure.to_numpy()

    return df

In [None]:
#Adding the role_tenure column to the panel
panel_data = panel_data.pipe(add_role_tenure)

In [202]:
#Resetting the index and discarding the old one in a single step; drop=True never
#creates the temporary "index" column, so a real column of that name cannot be removed
panel_data = panel_data.reset_index(drop=True)

In [206]:
#Writing the finished panel to disk (the row index is written as the first column)
panel_data.to_csv(path_or_buf="panel_data.csv")