# Feature Generation pt.2

This module will determine the characteristics for each executive whom we found in pt.1.  

## Features

This module will extract features from the Bloomberg executive biographies using basic if/then logic:
1. Gender
2. Highest degree attained
3. Awards received?
4. Years in career?
5. Top university education?

In [48]:
# to work with data
import pandas as pd

# to work with regex
import re

# to download
import requests

# to work with HTML tags
from bs4 import BeautifulSoup

# to time functions
import datetime

# to use NaN
import numpy as np

# to pause
import time
import random

# to work with local files
import os

import random

In [3]:
try:
    Company_Exec
except NameError:
    Company_Exec = pd.read_csv('Company_CEO_CFO.csv', index_col=0)

In [4]:
Company_Exec[Company_Exec['name'] == 'ADVANCED MICRO DEVICES INC'].sort_values('fy', ascending=False)

Unnamed: 0,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar
7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar
13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar
22674,0001193125-15-054362,2488,ADVANCED MICRO DEVICES INC,CA,20141231,2014,FY,20150219,Lisa T. Su,Devinder Kumar
29734,0001193125-14-057240,2488,ADVANCED MICRO DEVICES INC,CA,20131231,2013,FY,20140218,Rory P. Read,Devinder Kumar
36966,0001193125-13-069422,2488,ADVANCED MICRO DEVICES INC,CA,20121231,2012,FY,20130221,Rory P. Read,Devinder Kumar
43784,0001193125-12-075837,2488,ADVANCED MICRO DEVICES INC,CA,20111231,2011,FY,20120224,Rory P. Read,Thomas J. Seifert
47351,0001193125-11-040392,2488,ADVANCED MICRO DEVICES INC,CA,20101231,2010,FY,20110218,,Thomas J. Seifert


Pair down the list so that:
1. the cik has at least 5 filings associated with it
2. the cik has at least 5 entries for CEO and CFO

In [5]:
##### get the counts of adsh (filings), CEOs and CFOs.
adsh_ceo_cfo_count_by_cik = Company_Exec.groupby('cik', as_index=True).count()[['adsh','CEO','CFO']]

# filter them
min_entry_count = 5

cik_to_include = adsh_ceo_cfo_count_by_cik[
                            (adsh_ceo_cfo_count_by_cik['adsh'] > min_entry_count) & #check condition 1
                            (adsh_ceo_cfo_count_by_cik['CEO'] > min_entry_count) & #check cond 2
                            (adsh_ceo_cfo_count_by_cik['CFO'] > min_entry_count)]\
                            .index.values #access the index values (CIK)

# include only filings that meet the filter
Company_Exec_filtered = Company_Exec[
    Company_Exec['cik'].isin(cik_to_include)].sort_values(['cik','fy'], ascending=[True, False]).reset_index()
Company_Exec_filtered.head(3)

Unnamed: 0,index,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
0,577,0001144204-18-051414,2034,ACETO CORP,NY,20180630,2018,FY,20180928,William C. Kennally III,Rebecca Roof
1,6491,0001144204-17-045100,2034,ACETO CORP,NY,20170630,2017,FY,20170825,Salvatore Guccione,Douglas Roth
2,12927,0001571049-16-017785,2034,ACETO CORP,NY,20160630,2016,FY,20160826,Salvatore Guccione,Douglas Roth


Further pair down this list so that:
1. For each company there have been executives (including the current one) that stayed with the company for 3 years




In [6]:
Unique_CIK_filtered = Company_Exec_filtered['cik'].unique().copy()

Company_Exec_filt_Exec_3yr = []

for CIK in Unique_CIK_filtered:
    df = Company_Exec_filtered[Company_Exec_filtered['cik'] == CIK].copy()
    df['CEO_3yrs_ago'] = df['CEO'].shift(-3)
    df['CFO_3yrs_ago'] = df['CFO'].shift(-3)
    CEO_been_with_company_3yr = df['CEO'].iloc[0] == df['CEO_3yrs_ago'].iloc[0]
    CEO_been_with_company_3yr = df['CFO'].iloc[0] == df['CFO_3yrs_ago'].iloc[0]
    if CEO_been_with_company_3yr and CEO_been_with_company_3yr:
        Company_Exec_filt_Exec_3yr.append(df)

Company_Exec_filt_Exec_3yr_df = pd.concat(Company_Exec_filt_Exec_3yr)

Company_Exec_filt_Exec_3yr_df.head(3)

Unnamed: 0,index,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO,CEO_3yrs_ago,CFO_3yrs_ago
14,1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar,Lisa T. Su,Devinder Kumar
15,7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar,Rory P. Read,Devinder Kumar
16,13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar,Rory P. Read,Devinder Kumar


In [7]:
# Acquire unique names (CFO and CEO) from executives.
Unique_CEOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CEO'].unique())
Unique_CEOs.columns = ['name']
Unique_CEOs['title'] = 'CEO'

Unique_CFOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CFO'].unique())
Unique_CFOs.columns = ['name']
Unique_CFOs['title'] = 'CFO'

Names_to_research = pd.concat([Unique_CEOs,Unique_CFOs])
Names_to_research = Names_to_research[Names_to_research['name'].notnull()]

Names_to_research.reset_index(drop=True, inplace=True)

Names_to_research.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3657 entries, 0 to 3656
Data columns (total 2 columns):
name     3657 non-null object
title    3657 non-null object
dtypes: object(2)
memory usage: 57.2+ KB


In [8]:
def find_bio_urls(row):
    # put together the web url for a random search engine
    query = 'bloomberg executive profiles '+row['name']+' '+row['title']
    query = query.replace(' ', '+')
    
    # polite web scraping: pause and use random search engine
    time.sleep(random.randint(1,3)/2)
    engine_y = 'http://search.yahoo.com/search?p='
    engine_b = 'https://www.bing.com/search?q='
    
    def search_with(engine):
        query_url = engine + query
        print('using',query_url)
        r = requests.get(query_url, timeout=3)
        soup = BeautifulSoup(r.text)    
        url = ''
        # Check all returned links
        for a in soup.find_all('a', href=True):
            valid_url = 'bloomberg.com' in a['href'] \
            and 'Executive' in a.text
            if valid_url:
                # select first hit
                if not url:
                    url = a['href']
                    return url
                
    url = search_with(engine_y)
    if not url:
        url = search_with(engine_b)
    if not url:
        return np.nan
    if url:
        print('found ', url)
        return url

if not 'bio_urls_0_10.csv' in os.listdir('./bio_urls/'):     
    name_count = len(Names_to_research)

    a = 0
    b = 10
    while a < 2010:
        df = Names_to_research[a:b].copy()
        print('copied row count', len(df))
        df['bio_urls'] = df.apply(find_bio_urls, axis=1)
        print('processed',a,'to ',b)
        df.to_csv('./bio_urls/bio_urls_'+str(a)+'_'+str(b)+'.csv')
        print('saved',a,'to ',b)
        a = b
        b += 100
else: 
    print('Bio URLS already downloaded')

Bio URLS already downloaded


In [55]:
# Bloomberg blocks requests.get.
# Use Selenium with Firefox to access the site.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait

def scrape_bio(row):
    url = row['bio_urls']
    # to scrape politely
    if not pd.isnull(url):
        driver.get(url)
        sleep_secs = random.randint(6,8)
        time.sleep(sleep_secs)
    else:
        return np.nan
    try: 
        read_more_button = driver.find_element_by_xpath('//a[@onclick="show_more(this);"]')
        read_more_button.click()
        sleep_secs2 = random.randint(1,2)
        #print('sleeping for ', sleep_secs2)
        time.sleep(sleep_secs2)        
    except NoSuchElementException as exception:
        pass
    try: 
        bio_text = driver.find_element_by_xpath('//*[@itemprop="description"]').text
        bio_text = bio_text.replace("'", " ")
        bio_text = bio_text.replace('"', " ")
    except NoSuchElementException as exception:
        bio_text = np.nan
    try:
        age = driver.find_element_by_xpath('//td[@class="largeDetail"][1]').text
    except NoSuchElementException as exception:
        age = np.nan
    if type(age) is float:
        return (bio_text, np.nan)
    if not age.isdigit():
        age = np.nan
    return_result = (bio_text, age)
    print(return_result)
    if not len(return_result) == 2:
        return (np.nan, np.nan)
    else:
        return return_result

# Open a Firefox instance and start scraping
driver = webdriver.Firefox()
driver.get('https://addons.mozilla.org/en-US/firefox/addon/ublock-origin/')
_ = input('Activated ad blocker? [enter]')

bio_url_files = os.listdir('./bio_urls/')
bio_url_files = random.sample(bio_url_files, len(bio_url_files))
for datafile in bio_url_files:
    if datafile.replace('bio_urls', 'bios') not in os.listdir('./bios/') and datafile != '.DS_Store':
        df = pd.read_csv('./bio_urls/'+datafile, index_col=0, header=0)
        df['bios'] = np.nan
        df['age'] = np.nan
        print('starting ', datafile)
        try:
            df[['bios', 'age']] = df.apply(scrape_bio, axis=1, result_type='expand')
        # to avoid the program stopping abruptly
        except TypeError:
            print('Encountered TypeError')
            driver.quit()
        except KeyboardInterrupt:
            driver.quit()
        # Save the results
        df.to_csv('./bios/'+str(datafile).replace('bio_urls', 'bios'))
        # Keep track of what's been processed
        print('processed', datafile)
print('all bios downloaded')
driver.quit()

Activated ad blocker? [enter]
starting  bio_urls_3610_3710.csv
('Mr. William T. Kansky, also known as Bill, has been Chief Financial Officer and Senior Vice President of Delta Tucker Holdings, Inc. since August 2010. Mr. Kansky has been the Chief Financial Officer and Senior Vice President of DynCorp International LLC & DynCorp International Inc. since joining in August 2010. He served as Principal Accounting Officer and Treasurer at Delta Tucker Holdings, Inc. He served as Vice President and Chief Financial Officer of ITT Defense and Information Solutions at ITT Corporation since joining in April 2006. He joined ITT in 1996 as Supervisor of Headquarters Accounting and also served as its Director of Corporate Accounting and Assistant Corporate Controller. Prior to joining ITT, he served in the finance organizations of Westinghouse Broadcasting Company and Group W Information Services. He holds a Bachelor of Science Degree in Finance from Central Connecticut State University.\nCollapse 

('Mr. Sean S. Sullivan has been Executive Vice President of AMC Networks Inc. since September 2010 and serves as its Chief Financial Officer since June 6, 2011. Mr. Sullivan served as the Chief Corporate Officer of Rainbow Media Holdings, Inc., a subsidiary of AMC Networks Inc. since September 2010 until 2011. He joined AMC Networks in September 2010. He served as the Chief Financial Officer of HIT Entertainment Limited from 2009 to 2010. Mr. Sullivan was responsible for HIT s global financial, legal, human resources and information technology operations. He served as Senior Vice President of Corporate Development and Executive Officer of Moore Wallace Inc. Mr. Sullivan served as the President of Commercial Print and Packaging Division at Cenveo Inc. from July 9, 2007 to June 1, 2008. He was responsible for the day-to-day operations of Cenveo s Commercial and Packaging businesses. He served as the Chief Financial Officer of Cenveo Inc., from September 12, 2005 to July 9, 2007 and serve

('Mr. Steve Wang is the Owner and Chief Executive Officer of MAT Holdings, Inc.', nan)
('Mr. Martin Mobarak serves as Chief Executive Officer and President of MAP Universal LLC. Mr. Mobarak served as the Chairman, Chief Executive Officer and President at Apex 2 Inc., since October 28, 2011 and served as its Secretary, Treasurer and Chief Financial Officer until April 10, 2015. Mr. Mobarak began his professional experience as a Bering Sea fisherman in 1989. Since 1992, Mr. Mobarak owned a multi-vessel commercial and charter fishing fleet in Alaska and In Alaska, he founded Servcom Inc. From 1997 to 1999, Mr. Mobarak entered the hospitality industry by building a Hotel & Bed and Breakfast in Mexico. In 2001, Mr. Mobarak purchased a hangar at a regional airport in South Florida and started several aviation enterprises under the name Mobarak Aircraft LLC, which included; Interior design, furniture, and fixtures. In addition, he launched a jet fuel service, a food service under the name Fra

('Mr. Curtis L. Buser, also known as Curt, CPA has been the Chief Financial Officer of Carlyle Group Management L.L.C.- General Partner of The Carlyle Group LP, since December 18, 2014 and was its Director since November 2018 until 2018. Mr. Buser is responsible for corporate and partnership financial management and reporting, treasury functions and fund management and is an integral part of Carlyle s public company investor reporting and information technology group. He serves as the Managing Director and Chief Accounting Officer of TC Group, LLC. Mr. Buser joined Carlyle in September 2004 as a Managing Director and served as its Chief Accounting Officer. From May 9, 2014 to December 2014, he served as Carlyle s Interim Chief Financial Officer. Previously he was a Director of The Carlyle Group LP. He was the Interim Chief Financial Officer at Carlyle Group Management L.L.C. from May 2014 to December 2014 and served as its Chief Accounting Officer from September 2004 to June 2, 2014. A

('Mr. Richard C. Madigan is the Chief Investment Officer and Head of Investment Strategy at JPMorgan Private Bank. He is a Portfolio Manager at OFFIT Investment Group. He is also a Portfolio Manager at OFFIT Variable Insurance Fund - Emerging Markets Bond Fund.', nan)
('Mr. Matthew D. Mullet, also known as Matt, has been the Chief Financial Officer of FS Bancorp, Inc since September 2011 and serves as its Secretary and Treasurer. Mr. Mullet has been the Chief Financial Officer of 1st Security Bank of Washington, a subsidiary of FS Bancorp, Inc since September 2011 and serves as its Chief Operating Officer. Mr. Mullet served as the Chief Financial Officer and Vice President of Golf Savings Bank. He joined 1st Security Bank of Washington in July 2011. He started his banking career in June 2000 as a Financial Examiner with the Washington Department of Financial Institutions, Division of Banks, where he worked until October 2004. From October 2004 to August 2010, Mr. Mullet had been with G

('Ms. Catherine R. Smith, also known as Cathy, has been the Chief Financial Officer and Executive Vice President at Target Corporation since September 1, 2015. Ms. Smith’s responsibilities include Treasury and Tax; Internal and External Financial Reporting and Operations; Financial Planning and Analysis; Internal Audit; Investor Relations; and Target’s Financial and Retail Services Business. She served as Chief Financial Officer and Executive Vice President at Express Scripts, Inc. from February 2014 to December 2014. Ms. Smith served as an Executive of Express Scripts Holding Company since January 2, 2015 until March 1, 2015 and served as its Chief Financial Officer and Executive Vice President from February 10, 2014 to January 2, 2015. Ms. Smith served as Executive Vice President and Chief Financial Officer at Walmart International (Walmart) from 2010 to 2014. She served as the Chief Financial Officer at Wal-Mart International since February 2010 and its Executive Vice President of s

('Ms. Rachel Boulds, CPA, PLLC, serves as Chief Financial Officer at The OLB Group, Inc. Ms. Boulds has been Chief Financial Officer at Starco Brands, Inc. (formerly Insynergy Products, Inc.) since March 06, 2015. She is Chief Financial Officer of Vemanti Group, Inc. Ms. Boulds has been the Chief Financial Officer and Secretary of US Nuclear Corp since October 31, 2014. She served as Chief Financial Officer and Secretary at US Nuclear Corp since October 31, 2014 until January 31, 2017. Ms. Boulds served as the Chief Financial Officer of Avalanche International, Corp. from June 6, 2014 to June 25, 2016. Ms. Boulds served as the Chief Financial Officer of Independent Film Development Corporation from January 27, 2012 to July 21, 2015 and served as its Principal Accounting Officer. Since August 2009, Ms. Boulds has been engaged in her sole accounting practice, providing all aspects of consulting and accounting services to clients, including the preparation of full disclosure financial sta

('Mr. Joseph D. Gangemi has been the Chief Financial Officer, Senior Vice President and Secretary at Malvern Bank, National Association (“Bank”) since May 26, 2015. Mr. Gangemi has been the Chief Financial Officer, Senior Vice President and Secretary of Malvern Bancorp, Inc. He joined Bank in September 2014 until May 26, 2015 as Treasurer/Investment Officer and Corporate Secretary. Mr. Gangemi served as the Chief of Staff and Senior Vice President of ConnectOne Bancorp, Inc. since February 2013. He served as the Senior Vice President of Investor Relations at Union Center National Bank, Inc. Mr. Gangemi served as the Chief of Staff and Senior Vice President of Union Center National Bank since February 2013. Mr. Gangemi served as an Executive Assistant to Chief Executive Officer and Corporate Secretary at ConnectOne Bancorp, Inc. since June 2008 and served as its Vice President of Investor Relations. Mr. Gangemi served as Vice President and Assistant Portfolio Manager of Union Center Nat

('Mr. Jeffrey M. Keebler, also known as Jeff, is President and Chief Executive Officer of Madison Gas and Electric Company (“MGE”) and also serves as its Chairman of the Board since October 1, 2018. He serves as President, CEO & Chairman of the Board at MGE Energy, Inc. since October 1, 2018. He serves as Director of American Transmission Company LLC since June 27, 2018. He has served as President and Chief Executive Officer, Director at MGE Energy, Inc. since March 1, 2017 until October 1, 2018. Mr. Keebler has 22 years of experience. He was Assistant Vice President - Energy Supply and Customer Service of MGE since January 2012. He served as Senior Vice President - Energy Supply and Planning of Madison Gas and Electric Company since July 2015. Mr. Keebler has been employed at Madison Gas and Electric Company since 1995. Mr. Keebler is a director of the University of Wisconsin Research Park and United Way of Dane County. He was Senior Director - Energy Supply Procurement of MGE since M

('Mr. David D. Nelson, also known as Dave, has been the Chief Executive Officer and President of West Bancorp., Inc. since April 1, 2010 and serves as its Director. Mr. Nelson has been the Chairman of the Board and Chief Executive Officer at West Bank since April 1, 2010. He served as the President of Southeast Minnesota Business Banking and President of Wells Fargo Bank Rochester in Rochester, Minnesota. Mr. Nelson has more than 25 years experience in commercial banking. Mr. Nelson has strong backgrounds in customer relationship building, credit and leadership development. In addition to his extensive professional expertise, he has been an active participant in many community organizations. He is a past board member of the Rochester Area Chamber of Commerce, the Olmsted County United Way and the Rochester Community and Technical College Foundation. He has also served as Chair of the Olmsted Medical Center Board of Trustees. He has been a Director of West Bank since April 1, 2010. For 

('Mr. Timothy D. Hockey, also known as Tim, has been the President of TD Ameritrade Holding Corporation since January 2, 2016 and became its Chief Executive Officer since October 1, 2016. Mr. Hockey served as the Chief Executive Officer and President of TD Canada Trust at The Toronto-Dominion Bank since June 2008 until January 02, 2016, and was primarily responsible for the leadership of Canadian banking, which included Canadian personal banking, business banking, auto finance, global direct investing, advisory and Canadian asset management businesses. He served as Group Head, Canadian Banking and Wealth Management and Co-Chairman of TD Canada Trust at TD Bank US Holding Company. He served as Group Head of Canadian Banking and Wealth Management at The Toronto-Dominion Bank Group since May 1, 2015 and served as its Group Head of Canadian Banking, Auto Finance & Wealth Management from July 01, 2013 to May 1, 2015 and also its Group Head of Canadian Banking, Auto Finance & Credit Cards fr

('Mr. Syed B. Ali Co-Founded Cavium, Inc. in 2000 and served as its Chairman of the Board, Chief Executive Officer and President since 2000 until July 6, 2018. Mr. Ali has over 20 years of management and engineering experience in the semiconductor area. Mr. Ali also worked for 10 years at WSI/SGS-Thompson and Tandem, where he was involved with product line management and product design. Mr. Ali was a Founding Management Team Member of Malleable Technologies and served as its Vice President of Marketing and Sales. He served as the Vice President of Marketing at I-Cube. He held various positions at Wafer Scale Integration, a division of SGS-Thompson, Tandem Computer and American Microsystems. From 1994 to 1998, he served as an Executive Director of Samsung Electronics. He has been Director of QLogic Corp. since August 16, 2016 and Marvell Technology Group Ltd. since July 6, 2018. Mr. Ali obtained a MSEE from the University of Michigan, Ann Arbor in 1981 and BSEE from the Osmania Universi

('Mr. Thomas M. Coughlin, also known as Tom, is the Chief Executive Officer and President of BCB Community Bank and Corporate Secretary of BCB Bancorp, Inc. Mr. Coughlin has been in the banking industry for over 35 years. He was formerly Vice President of Chatham Savings Bank and prior to that, Controller and Corporate Secretary of First Savings Bank of New Jersey. Mr. Coughlin, who received his CPA designation in 1982, is a member of the Bayonne Rotary, Gift of Life 7490, International Aid Humanitarian Foundation, Friends of Family Readiness Group, Executive Board Member of the Hudson County Chamber of Commerce, and Friends of Special Children. In addition, Mr. Coughlin was a former Commissioner of the Bayonne Rent Control Board, the past President of the American Heart Associated and has served as Trustee of D.A.R.E. and the Bayonne P.A.L. Mr. Coughlin currently serves as the President of the Bayonne Chamber of Commerce. Mr. Coughlin attended Saint Vincent DePaul Grammar School, Bayo

('Mr. Martin Cohen, also known as Marty, is an Executive Chairman, Director and Portfolio Manager at Cohen & Steers Capital Management, Inc. Mr. Cohen has been the Chairman of Cohen & Steers, Inc., since 2014 and served as its Co-Chief Executive Officer and Co-Chairman until 2014. He co-founded Cohen & Steers, Inc. in 1986. Previously, Mr. Cohen was a Senior Vice President and Portfolio Manager at National Securities and Research Corporation from 1984 to 1986. Prior to that, he was a Vice President at Citibank from 1976 to 1981. He served as Co-Chairman of Cohen & Steers VIF Realty Fund, Inc. He has been a Director of Cohen & Steers, Inc. since August 2004. Mr. Cohen has also served as a Member of the Board of Governors of the National Association of Real Estate Investment Trusts. He has vast investment experience. In 2001, he was the recipient of the National Association of Real Estate Investment Trusts Industry Achievement Award. Mr. Cohen earned a B.S. from The City College of New Y

ValueError: Must have equal len keys and value when setting with an iterable

In [94]:
def features_from_bio(row):
    Bio = row['bios']
    Name = row['name']
    if not Bio or type(Bio) != str:
        return [np.nan]*5
    try:
        # Is it the correct Bio?
        if not Name.lower() in Bio.lower():
            return [np.nan]*5
        
        # Determine gender based on pronouns in text
        gender = np.nan
        if 'she' in Bio:
            gender = 'Female'
        elif 'he' in Bio:
            gender = 'Male'


        # Determine degree based on mentions in text
        bachelor = ['B.S.', 'Bachelor of Science', ' BS ', "bachelor’s degree"]
        master = ['M.S.', 'Masters of Science', ' MS ', 'Master', 'MBA', 'M.B.A.']
        doctor = ['Dr.', 'Doctor of']

        degree = np.nan
        if any(title.lower() in Bio.lower() for title in bachelor):
            degree = 'Bachelor'
        if any(title.lower() in Bio.lower() for title in master):
            degree = 'Master'    
        if any(title.lower() in Bio.lower() for title in doctor):
            degree = 'Doctorate'


        # Is there any mention of awards?
        awards = False
        accolades = [' award', ' fortune ', ' forbes ']
        for acc in accolades:
            if acc.lower() in Bio.lower():
                awards = True

        # Assume the earliest year mentioned is the start of their career.    
        years = [int(x) for x in re.findall(r'\d{4}', str(Bio))]
        if years:
            career_start = min(years)
        else:
            career_start = np.nan


        # Is there any mention of a top university 
        # https://www.topuniversities.com/university-rankings/world-university-rankings/2019
        Top25Universities = ['Massachusetts Institute of Technology', 'Stanford University', 'Harvard University',
                          'California Institute of Technology', 'University of Oxford', 'University of Cambridge', 
                          'Swiss Federal Institute of Technology', 'Imperial College London', 'University of Chicago',
                          'University College London', 'National University of Singapore', 'Nanyang Technological University',
                          'Princeton University', 'Cornell University', 'Yale University', 'Columbia University',
                          'Tsinghua University', 'University of Edinburgh', 'University of Pennsylvania', 'University of Michigan',
                          'Johns Hopkins University', 'école polytechnique fédérale de lausanne', 'University of Tokyo',
                             'Australian National University', 'University of Hong Kong'
                          ]

        TopUniversity = False
        for uni in Top25Universities:
            if uni.lower() in Bio.lower():
                TopUniversity = True
    except TypeError:
        return [np.nan]*5        
    
    return gender, degree, awards, career_start, TopUniversity

def verify_name(row):
    if row['name'].lower() in row['bios'].lower():
        return row['age']
    else:
        return np.nan
    

for datafile in random.sample(os.listdir('./bios/'), len(os.listdir('./bios/'))):
    if not 'processed_'+datafile in os.listdir('./bios_processed/') and datafile != '.DS_Store':
        df = pd.read_csv('./bios/'+datafile, header=0, index_col=0)
        df[['gender', 'degree', 'awards', 'career start', 'top university']] = \
        df.apply(features_from_bio, axis=1, result_type='expand')
        df['age'] = df.apply(verify_name, axis=1)
        df.to_csv('./bios_processed/'+'processed_'+datafile)

In [97]:
dfs = []
for datafile in os.listdir('./bios_processed/'):
    if datafile != '.DS_Store':
        df = pd.read_csv('./bios_processed/'+datafile, header=0, index_col=0)
        dfs.append(df)

exec_features = pd.concat(dfs)

exec_features[exec_features['bios'].notnull()].sample(10)

Unnamed: 0,name,title,bio_urls,bios,age,gender,degree,awards,career start,top university
581,Mark A. Klein,CEO,https://www.bloomberg.com/research/stocks/priv...,Mr. Mark A. Klein serves as the President and ...,63.0,Male,Master,False,1976.0,False
93,Keith S. Walters,CEO,https://www.bloomberg.com/research/stocks/peop...,Mr. Keith S. Walters has been the President at...,68.0,Male,,False,1997.0,False
2430,Brandon S. Pedersen,CFO,https://www.bloomberg.com/research/stocks/peop...,"Mr. Brandon S. Pedersen, CPA, served as Chief ...",51.0,Male,,False,2003.0,False
3621,John D. Kerr,CFO,https://www.bloomberg.com/research/stocks/peop...,Mr. John D. Kerr has been the President and Ch...,51.0,Male,,False,2010.0,False
3623,David M. Duckworth,CFO,https://www.bloomberg.com/research/stocks/peop...,Mr. David M. Duckworth has been the Chief Fina...,38.0,Male,,False,2002.0,False
43,Matthew K. Rose,CEO,https://www.bloomberg.com/research/stocks/priv...,"Mr. Matthew K. Rose, also known as Matt, has b...",58.0,Male,Bachelor,False,1981.0,False
670,Joel H. Reichman,CEO,https://www.bloomberg.com/research/stocks/priv...,"Mr. Joel H. Moser, also known as Joel, Esq. is...",,,,,,
674,Grant C. Bennett,CEO,https://www.bloomberg.com/research/stocks/peop...,"Mr. Grant C. Bennett has been the President, C...",63.0,Male,Master,False,1985.0,True
2237,Randall R. Harwood,CFO,https://www.bloomberg.com/research/stocks/priv...,"Mr. Robert C. Lyons, also known as Bob, has be...",62.0,,,,,
2594,Gregory J. Heinlein,CFO,https://www.bloomberg.com/research/stocks/priv...,"Mr. Gregory J. Heinlein, also known as Greg, h...",54.0,Female,Master,False,1987.0,False


In [96]:
exec_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 857 entries, 0 to 709
Data columns (total 10 columns):
name              857 non-null object
title             857 non-null object
bio_urls          745 non-null object
bios              630 non-null object
age               559 non-null float64
gender            348 non-null object
degree            221 non-null object
awards            353 non-null object
career start      346 non-null float64
top university    353 non-null object
dtypes: float64(2), object(8)
memory usage: 73.6+ KB
