In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
#Reading the age and tenure CSV files into dataframes
age_data, tenure_data = (
    pd.read_csv(csv_path) for csv_path in ("age_data.csv", "tenure_data.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


First we're going to clean the age data

In [3]:
#Dropping the rows whose "DOB" is the "n.a." placeholder.
#The rows are selected by index label: np.where yields positions, and those only
#coincide with the labels DataFrame.drop expects while the frame still has its
#default RangeIndex.
age_data.drop(age_data.index[age_data["DOB"] == "n.a."], inplace=True)

#Dropping every row that has a NaN value in any column of the dataframe
age_data.dropna(inplace=True)

In [4]:
#Creating a function that will get the "DOB" column in the format we want
def correct_date_generator(date):
    """Normalise a date of birth to a ``YYYYMMDD`` string.

    Parameters
    ----------
    date : int or str
        Either a year given as an integer (Python or NumPy), or a string made
        of one, two or three whitespace-separated parts, parsed with the
        formats ``%Y``, ``%b %Y`` and ``%d %b %Y`` respectively.

    Returns
    -------
    str
        The parsed date formatted with ``%Y%m%d``, or the sentinel ``"ERROR"``
        when the string has no parts or more than three.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when the text does not match the
        format selected by its number of parts.
    """
    #NumPy integers (e.g. values taken from an int64 column) are years as well
    if isinstance(date, (int, np.integer)):
        date = str(int(date))

    #split() without an argument ignores leading/trailing and repeated
    #whitespace, so the number of parts is not inflated by stray spaces
    parts = date.split()
    formats_by_part_count = {1: "%Y", 2: "%b %Y", 3: "%d %b %Y"}
    date_format = formats_by_part_count.get(len(parts))
    if date_format is None:
        return "ERROR"

    #Re-join with single spaces so the text lines up with the chosen format
    return pd.to_datetime(" ".join(parts), format=date_format).strftime("%Y%m%d")

In [5]:
#Updating the DOB column
#Series.map applies the function element by element and, unlike an np.vectorize
#wrapper created without otypes, also works when the column is empty.
age_data["DOB"] = age_data["DOB"].map(correct_date_generator)

Now we're cleaning the tenure data

In [6]:
#Removing the rows whose "DateStartRole" holds the "N" (null) marker
start_is_null = tenure_data["DateStartRole"] == 'N'
tenure_data.drop(tenure_data.index[start_is_null], inplace=True)

In [7]:
#Removing the rows whose "DateEndRole" holds the "N" (null) marker
end_is_null = tenure_data["DateEndRole"] == 'N'
tenure_data.drop(tenure_data.index[end_is_null], inplace=True)

In [8]:
#Helper that resolves the "C" marker, which means the director is still in
#the role, to a concrete date
def update_date(date):
    """Return today's date as ``YYYYMMDD`` for the marker ``"C"``.

    Any other value is handed back unchanged.
    """
    still_in_role = date == "C"
    return datetime.today().strftime('%Y%m%d') if still_in_role else date

In [9]:
#Updating the "DateEndRole" column
#np.vectorize without otypes picks its output dtype from the first element and
#raises on an empty column, so the function is mapped element by element and
#the result is cast to str explicitly instead.
tenure_data["DateEndRole"] = tenure_data["DateEndRole"].map(update_date).astype(str)

In [10]:
#Only keeping CEOs
#na=False counts a missing RoleName as "not a CEO"; without it the mask holds
#NaN for those rows and the boolean indexing below raises.
is_ceo = tenure_data["RoleName"].str.contains("CEO", na=False)
tenure_data = tenure_data[is_ceo]

Now we can join the two datasets

In [11]:
#Inner join of the tenure and age records on the columns both frames share
joined_data = tenure_data.merge(age_data, how="inner")

In [115]:
#Creating a function that will give us "age" as a fractional number of years
def age_calculator(DOB, date):
    """Return the age in years at ``date`` for someone born at ``DOB``.

    Both arguments are converted with ``np.datetime64(value, "M")``, so only
    the year and month are taken into account. The result is the number of
    whole months between the two divided by 12 (e.g. 40 years 6 months -> 40.5).
    """
    months = np.datetime64(date, "M") - np.datetime64(DOB, "M")
    age = months / np.timedelta64(12,'M')
    return age

#Vectorizing
#The builtin float replaces the np.float alias, which was removed in NumPy 1.24
age_vector = np.vectorize(age_calculator, otypes=[float])

In [116]:
def make_panel(data, idx):
    """Expand one tenure record into a frame with one row per month in role.

    Parameters
    ----------
    data : DataFrame
        Frame holding the record; the columns "DateStartRole", "DateEndRole",
        "DirectorID" and "DOB" are read from it.
    idx
        Key used to pick the record's values via ``data[column][idx]``.

    Returns
    -------
    DataFrame
        Columns "year", "month", "age" and "DirectorID" for every month between
        the start and end of the role, merged with ``data`` on the columns the
        two frames have in common.
    """
    #Every month between the start and the end of the role
    tenure_months = pd.period_range(
        start=data["DateStartRole"][idx],
        end=data["DateEndRole"][idx],
        freq='M',
    )

    #Year, month and "YYYY-MM" labels for each of those months
    year_col = tenure_months.strftime("%Y").tolist()
    month_col = tenure_months.strftime("%m").tolist()
    month_stamps = tenure_months.strftime("%Y-%m").tolist()

    #The DirectorID repeated once per month
    director_col = [data["DirectorID"][idx]] * len(year_col)

    #The month of birth in the same "YYYY-MM" form
    birth_month = pd.period_range(start=data["DOB"][idx], periods=1, freq='M').strftime("%Y-%m")[0]

    #The age at each month, as computed by age_vector
    age_col = list(age_vector(birth_month, month_stamps))

    #Assembling the monthly rows and attaching the record's other columns
    panel = pd.DataFrame([year_col, month_col, age_col, director_col]).transpose()
    panel.columns = ["year", "month", "age", "DirectorID"]
    return panel.merge(data)

In [157]:
#This took 3 hours to run when every panel was appended to a growing dataframe.
#The per-row panels are now collected in a list and concatenated once:
#DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
#on every call.

n = joined_data.shape[0]
panels = []

for i in range(n):
    curr_df = pd.DataFrame(joined_data.iloc[i]).transpose()
    #The one-row frame keeps the row's original index label; passing that label
    #instead of the position keeps make_panel's lookups valid on any index
    panels.append(make_panel(curr_df, curr_df.index[0]))

#pd.concat raises on an empty list, so fall back to an empty dataframe
panel_data = pd.concat(panels, sort=False) if panels else pd.DataFrame()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [167]:
#Ordering the rows by company, director and then chronologically
sort_keys = ["CompanyName", "DirectorName", "year", "month"]
panel_data = panel_data.sort_values(by=sort_keys)

In [168]:
#Resetting the index; the previous index is kept as an "index" column
panel_data = panel_data.reset_index()

In [171]:
#Removing the "index" column that holds the old index
panel_data = panel_data.drop(columns="index")

In [173]:
#Writing the panel to disk; the dataframe index is written as the first column
panel_data.to_csv(path_or_buf="panel_data.csv", index=True)