In [69]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

In [2]:
#Reading the two raw inputs: the age file and the role-tenure file
AGE_CSV, TENURE_CSV = "age_data.csv", "tenure_data.csv"
age_data = pd.read_csv(AGE_CSV)
tenure_data = pd.read_csv(TENURE_CSV)

First we're going to clean the age data

In [3]:
#Creating NaN consistency: every missing value becomes the string "NaN"
age_data = age_data.fillna("NaN")
#The result is assigned back instead of calling replace(..., inplace=True) on
#age_data["DOB"]: that in-place call acts on a temporary Series and, under pandas
#Copy-on-Write, never reaches age_data
age_data["DOB"] = age_data["DOB"].replace("n.a.", "NaN")

In [4]:
#Creating a function that will get the "DOB" column in the format we want. "DOB" column has
#5 different types of inputs

#Type 1: 01 Apr 1918
#Type 2: Sep 1945
#Type 3: "1945"
#Type 4: 1945
#Type 5: "NaN"

def correct_dob(date):
    """Normalise one raw DOB value to a "YYYYMMDD" string.

    Parameters
    ----------
    date : str, int or float
        One of the five input types listed above. Numeric years may be Python
        or numpy numbers (1945, np.int64(1945), 1945.0).

    Returns
    -------
    str
        "YYYYMMDD" (missing day/month default to 01), "NaN" for a missing
        value, or "ERROR" when the value matches none of the known types.
    """
    #Type 5: the missing-value marker passes through untouched
    if isinstance(date, str) and date == "NaN":
        return date

    #Type 4: a bare year stored as a number. bool is excluded because it is a subclass of int
    if isinstance(date, (int, float, np.integer, np.floating)) and not isinstance(date, bool):
        if date != date:
            #a real float NaN is treated like the "NaN" marker
            return "NaN"
        if float(date).is_integer():
            return pd.to_datetime(str(int(date)), format="%Y").strftime("%Y%m%d")
        return "ERROR"

    if not isinstance(date, str):
        return "ERROR"

    #Types 1-3: the number of space-separated parts tells us which format applies
    formats_by_parts = {1: "%Y", 2: "%b %Y", 3: "%d %b %Y"}
    date_format = formats_by_parts.get(len(date.split(" ")))
    if date_format is None:
        return "ERROR"
    return pd.to_datetime(date, format=date_format).strftime("%Y%m%d")

#otypes is set so the vectorized function also works on an empty column
dob_vector = np.vectorize(correct_dob, otypes=[str])

In [5]:
#Replacing the raw DOB column with its normalised version
age_data = age_data.assign(DOB=dob_vector(age_data["DOB"]))

In [6]:
#Age NaNs are left as they are for now; they are dealt with later

Now we're cleaning the tenure data

In [7]:
#Dropping the entries that have an "N" in either the StartDate or the EndDate
has_n_date = (tenure_data["DateEndRole"] == "N") | (tenure_data["DateStartRole"] == "N")
#Collecting index labels rather than np.where positions: DataFrame.drop is label-based,
#so positions are only correct while the index is the default 0..n-1 range
n_list = list(tenure_data.index[has_n_date.to_numpy()])
tenure_data.drop(n_list, inplace=True)

In [9]:
#Helper function to update "C" entry to a value of "-1"
def update_end(date):
    """Convert one raw DateEndRole value.

    Parameters
    ----------
    date : str
        Either "C" or a date written as YYYYMMDD.

    Returns
    -------
    int or pandas.Timestamp
        -1 for "C", otherwise the parsed date.
    """
    if date == "C":
        return -1
    return pd.to_datetime(date, format="%Y%m%d")

#otypes=[object] because the function returns a mix of ints and Timestamps. Without it
#np.vectorize picks the output dtype from the first element alone, so a column that
#starts with "C" would be forced into an integer array
end_vector = np.vectorize(update_end, otypes=[object])

In [10]:
#Adding the converted role dates: end dates go through end_vector (which handles the "C"
#entries), start dates are parsed as YYYYMMDD datetimes
tenure_data = tenure_data.assign(
    date_end_role=end_vector(tenure_data["DateEndRole"]),
    date_start_role=pd.to_datetime(tenure_data["DateStartRole"], format="%Y%m%d"),
)

In [11]:
#Only keeping CEOs, i.e. the rows whose RoleName contains "CEO"
tenure_data = tenure_data.loc[lambda roles: roles["RoleName"].str.contains("CEO")]

Now we can join the two datasets

In [12]:
#Executing the inner join (no keys are given, so pandas joins on the columns both frames share)
joined_data = tenure_data.merge(age_data, how="inner")

Defining functions that will help us create the panel dataset

In [14]:
def years_column(date_range):
    """Return the four-digit year of every entry of `date_range` as a list of strings."""
    return date_range.strftime("%Y").tolist()

In [15]:
def months_column(date_range):
    """Return the zero-padded month number of every entry of `date_range` as a list of strings."""
    return date_range.strftime("%m").tolist()

In [16]:
def id_column(_id_, n):
    """Return a list holding `_id_` repeated `n` times."""
    return [_id_] * n

In [17]:
#Creating a function that will give us "age" as a fractional number of years
#(whole months elapsed / 12, so 50 years and 6 months is 50.5)
def age_calculator(DOB, date):
    """Return the age at `date` for someone born at `DOB`, in years.

    Both arguments are truncated to month precision before subtracting, so the
    day of the month never affects the result.

    Parameters
    ----------
    DOB, date : anything np.datetime64 accepts (e.g. "1950-01")

    Returns
    -------
    float
        Number of whole months between the two dates, divided by 12.
    """
    months = np.datetime64(date, "M") - np.datetime64(DOB, "M")
    return months / np.timedelta64(12, "M")

#Vectorizing. otypes uses the builtin float: the np.float alias was removed in NumPy 1.24
age_vector = np.vectorize(age_calculator, otypes=[float])

In [18]:
def age_column(DOB, date_range):
    """Return one age per entry of `date_range` for a person born at `DOB`.

    A DOB of "NaN" gives a list of "NaN" strings of the same length. Otherwise
    both the DOB and the reference dates are reduced to "YYYY-MM" strings and
    passed to age_vector.
    """
    #Unknown birth date: nothing to compute
    if DOB == "NaN":
        return ["NaN"] * len(date_range)

    birth_month = pd.period_range(start=DOB, periods=1, freq="M").strftime("%Y-%m")[0]
    month_labels = date_range.strftime("%Y-%m").tolist()
    return list(age_vector(birth_month, month_labels))

In [41]:
def make_panel(idx, row):
    '''
    Expand one row of the joined data into one row per month of the role.

    The months run from date_start_role to date_end_role. A date_end_role of -1
    (the value update_end writes for a "C" end date) is replaced by today's date.
    Each month gets the director's age (via age_column) and the DirectorID, and
    the result is merged back onto the original row.

    idx is accepted but not used.
    '''
    #End of the monthly range: today when the end date is the -1 marker
    if row["date_end_role"] == -1:
        end = pd.to_datetime('today')
    else:
        end = row["date_end_role"]
    date_range = pd.period_range(start=row["date_start_role"], end=end, freq='M')

    #The per-month columns
    monthly_df = pd.DataFrame([
        date_range,
        age_column(row["DOB"], date_range),
        id_column(row["DirectorID"], len(date_range)),
    ]).transpose()
    monthly_df.columns = ["date", "age", "DirectorID"]

    #The original row as a one-row frame, repeated for every month by the merge
    role_df = pd.DataFrame(row).transpose()

    return pd.merge(left=role_df, right=monthly_df)

In [42]:
#30 minutes to execute
#Building one monthly panel per row of joined_data, then stacking them into a single frame
dfs = [
    make_panel(idx, row)
    for idx, row in tqdm(joined_data.iterrows(), total=joined_data.shape[0])
]

panel_data = pd.concat(dfs)

100%|██████████| 91049/91049 [34:45<00:00, 43.67it/s]


In [66]:
def add_role_tenure(df):
    '''
    Add a "role_tenure" column to df (in place) and return df.

    For every row, role_tenure is the number of whole months between
    date_start_role and date, divided by 12.
    '''
    #Placeholder values, overwritten once every row has been processed
    df["role_tenure"] = -1

    row_pairs = zip(df["date_start_role"], df["date"])
    months_in_role = [
        np.datetime64(current_date, "M") - np.datetime64(start_date, "M")
        for start_date, current_date in tqdm(row_pairs, total=df.shape[0])
    ]

    df["role_tenure"] = months_in_role / np.timedelta64(12,'M')
    return df

#Creating the role tenure for our panel_data
panel_data = add_role_tenure(panel_data)

100%|██████████| 7047297/7047297 [03:29<00:00, 33637.38it/s]


In [76]:
#Checking which columns the panel ended up with
panel_data.columns

Index(['CompanyID', 'DirectorID', 'DirectorName', 'CompanyName', 'RoleName',
       'DateStartRole', 'DateEndRole', 'Seniority', 'date_end_role',
       'date_start_role', 'DOB', 'Gender', 'Nationality', 'date', 'age',
       'role_tenure'],
      dtype='object')

In [77]:
#Ordering the panel by company name, then director name, then date
panel_data = panel_data.sort_values(["CompanyName", "DirectorName", "date"])

In [125]:
def add_company_tenure(df):
    '''
    Add a "company_tenure" column to df (in place) and return df.

    Rows are walked in their current order, so df should already be sorted.
    The column starts out as a copy of "role_tenure". A row whose DirectorID and
    CompanyID both match the previous row then takes the previous row's
    company_tenure, plus 1 if its date differs from the previous row's date.
    A row that starts a new director or a new company keeps its role_tenure.

    NOTE(review): role_tenure is computed in this notebook as months / 12,
    while each step here adds 1 per new date - confirm which unit
    company_tenure is meant to be in.
    '''
    df["company_tenure"] = df["role_tenure"]

    #Working on plain arrays: df.iloc[idx][col] = value writes to a temporary copy of the
    #row and never updates df, and row-by-row iloc reads are very slow on a large panel
    director_ids = df["DirectorID"].to_numpy()
    company_ids = df["CompanyID"].to_numpy()
    dates = df["date"].to_numpy()
    tenure = df["company_tenure"].to_numpy(dtype=float, copy=True)

    for idx in range(1, len(tenure)):
        same_director = director_ids[idx] == director_ids[idx - 1]
        same_company = company_ids[idx] == company_ids[idx - 1]
        if not (same_director and same_company):
            #new director or new company: keep the role_tenure seed
            continue
        step = 1 if dates[idx] != dates[idx - 1] else 0
        tenure[idx] = tenure[idx - 1] + step

    df["company_tenure"] = tenure
    return df

In [93]:
#Taking a 40-row sample (positions 190-229) of the rows whose CompanyName contains "Costco"
is_costco = panel_data["CompanyName"].str.contains("Costco")
costco_data = panel_data[is_costco].iloc[190:230]

In [94]:
#Resetting to a fresh 0..n-1 index; the previous index is kept as a column
costco_data = costco_data.reset_index()

In [None]:
#Trying add_company_tenure on the small Costco sample
add_company_tenure(costco_data)