# Feature Generation pt.2

This module will determine the characteristics for each executive whom we found in pt.1 by web scraping the biographies of these people from Bloomberg.  

## Features

This module will extract features from the Bloomberg executive biographies using basic if/then logic:
1. Gender: Categorical
2. Highest degree attained: Categorical
3. Awards received: Boolean
4. Career debut (year): Integer
5. Top 25 global university education: Boolean

In [2]:
# to work with data
import pandas as pd

# to work with regex
import re

# to download
import requests

# to work with HTML tags
from bs4 import BeautifulSoup

# to time functions
import datetime

# to use NaN
import numpy as np

# to pause
import time
import random

# to work with local files
import os

# to introduce an element of randomness
import random

In [3]:
# If the list of executives hasn't been loaded yet, load it.
try:
    Company_Exec
except NameError:
    Company_Exec = pd.read_csv('Company_CEO_CFO.csv', index_col=0)

In [4]:
# Let's take a look.
Company_Exec[Company_Exec['name'] == 'ADVANCED MICRO DEVICES INC'].sort_values('fy', ascending=False).head(3)

Unnamed: 0,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar
7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar
13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar


Pair down the list so that:
1. the cik has at least 5 filings associated with it
2. the cik has at least 5 entries for CEO and CFO

In [5]:
# How many records, in years, of executives do we have for each company? 
adsh_ceo_cfo_count_by_cik = Company_Exec.groupby('cik', as_index=True).count()[['adsh','CEO','CFO']]

# We need 5 or more to see growth.
min_entry_count = 5
cik_to_include = adsh_ceo_cfo_count_by_cik[
                            (adsh_ceo_cfo_count_by_cik['adsh'] > min_entry_count) &
                            (adsh_ceo_cfo_count_by_cik['CEO'] > min_entry_count) &
                            (adsh_ceo_cfo_count_by_cik['CFO'] > min_entry_count)]\
                            .index.values # Return the CIK (identifier) values of the matches.

# Include only filings that meet the filter.
Company_Exec_filtered = Company_Exec[
    Company_Exec['cik'].isin(cik_to_include)].sort_values(['cik','fy'], ascending=[True, False]).reset_index()

Further pair down this list so that:
1. For each company there have been executives (including the current one) that stayed with the company for 3 years




In [6]:
Unique_CIK_filtered = Company_Exec_filtered['cik'].unique().copy()

# This can also be done with: 'for heading, data in Unique_CIK_filtered.groupby('cik'):'
Company_Exec_filt_Exec_3yr = []
for CIK in Unique_CIK_filtered:
    # Analyze each CIK individually
    df = Company_Exec_filtered[Company_Exec_filtered['cik'] == CIK].copy()
    
    # Are the execs still the same as three years ago?
    if df['CEO'].iloc[0] == df['CEO'].shift(-3).iloc[0] \
    and df['CFO'].iloc[0] == df['CFO'].shift(-3).iloc[0]:
        Company_Exec_filt_Exec_3yr.append(df)

Company_Exec_filt_Exec_3yr_df = pd.concat(Company_Exec_filt_Exec_3yr)
Company_Exec_filt_Exec_3yr_df.head(3)

Unnamed: 0,index,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
14,1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar
15,7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar
16,13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar


In [7]:
# Acquire unique names (CFO and CEO) from executives.
Unique_CEOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CEO'].unique())
Unique_CEOs.columns = ['name']
Unique_CEOs['title'] = 'CEO'

Unique_CFOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CFO'].unique())
Unique_CFOs.columns = ['name']
Unique_CFOs['title'] = 'CFO'

Names_to_research = pd.concat([Unique_CEOs,Unique_CFOs])
Names_to_research = Names_to_research[Names_to_research['name'].notnull()]

Names_to_research.reset_index(drop=True, inplace=True)

Names_to_research.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2515 entries, 0 to 2514
Data columns (total 2 columns):
name     2515 non-null object
title    2515 non-null object
dtypes: object(2)
memory usage: 39.4+ KB


In [8]:
def find_bio_urls(row):
    # put together the web url for a random search engine
    query = 'bloomberg executive profiles '+row['name']+' '+row['title']
    query = query.replace(' ', '+')
    
    # polite web scraping: pause and use random search engine
    time.sleep(random.randint(1,3)/2)
    engine_y = 'http://search.yahoo.com/search?p='
    engine_b = 'https://www.bing.com/search?q='
    
    def search_with(engine):
        query_url = engine + query
        print('using',query_url)
        r = requests.get(query_url, timeout=3)
        soup = BeautifulSoup(r.text)
        
        url = ''
        # Check all returned links
        for a in soup.find_all('a', href=True):
            if 'bloomberg.com' in a['href'] and 'Executive' in a.text and not url: # Is this the first valid URL?
                return a['href']
                
    url = search_with(engine_y)
    if not url:
        url = search_with(engine_b)
    if not url:
        url = np.nan
    print('found ', url)
    return url

a, b = 0, 10
stepsize = 100

if not 'bio_urls_'+str(a)+'_'+str(b)+'.csv' in os.listdir('./bio_urls/'):     
    name_count = len(Names_to_research)

    while a < name_count:
        df = Names_to_research[a:b].copy()
        print('copied row count', len(df))
        df['bio_urls'] = df.apply(find_bio_urls, axis=1)
        print('processed',a,'to ',b)
        df.to_csv('./bio_urls/bio_urls_'+str(a)+'_'+str(b)+'.csv')
        print('saved',a,'to ',b)
        a = b
        b += stepsize
else: 
    print('Bio URLS already downloaded')

Bio URLS already downloaded


In [None]:
# Use Selenium with Firefox to access the site.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait

def scrape_bio(row):
    url = row['bio_urls']
    try:
        # Is there a url? If so, scrape politely.
        if not pd.isnull(url):
            driver.get(url)
            sleep_secs = random.randint(6,8)
            time.sleep(sleep_secs)
        else:
            return np.nan

        # Is there a show-more button?
        try: 
            read_more_button = driver.find_element_by_xpath('//a[@onclick="show_more(this);"]')
            read_more_button.click()
            sleep_secs2 = random.randint(1,2)
            time.sleep(sleep_secs2)        
        except NoSuchElementException as exception:
            pass

        # Is there a biography?
        try: 
            bio_text = driver.find_element_by_xpath('//*[@itemprop="description"]').text
            bio_text = bio_text.replace("'", " ")
            bio_text = bio_text.replace('"', " ")
        except NoSuchElementException as exception:
            bio_text = np.nan

        # Is there an age element?
        try:
            age = driver.find_element_by_xpath('//td[@class="largeDetail"][1]').text
        except NoSuchElementException as exception:
            age = np.nan

        # If the age does not make sense, return NaN.
        if type(age) is float or not age.isdigit():
            age = np.nan

        return_result = (bio_text, age)
        print(return_result)

        # If the result does not make sense, return nan for Biography and Age.
        if len(return_result) == 2:
            return return_result
        else:
            return (np.nan, np.nan)
    except TypeError:
        return (np.nan, np.nan)

# Open a Firefox instance and start scraping
driver = webdriver.Firefox()
# Every new instance of Firefox loses the ad blocker. No idea why.
# Activate it manually and press enter when ready.
driver.get('https://addons.mozilla.org/en-US/firefox/addon/ublock-origin/')
input('Activated ad blocker? [enter]')

bio_url_files = random.sample(os.listdir('./bio_urls/'), len(os.listdir('./bio_urls/')))
for datafile in bio_url_files:
    if datafile.replace('bio_urls', 'bios') not in os.listdir('./bios/') and datafile != '.DS_Store':
        df = pd.read_csv('./bio_urls/'+datafile, index_col=0, header=0)
        df['bios'] = np.nan
        df['age'] = np.nan
        print('starting ', datafile)
        try:
            df[['bios', 'age']] = df.apply(scrape_bio, axis=1, result_type='expand')
        # to avoid the program stopping abruptly
        except TypeError:
            print('Encountered TypeError')
            driver.quit()
        except KeyboardInterrupt:
            driver.quit()
        # Save the results
        df.to_csv('./bios/'+str(datafile).replace('bio_urls', 'bios'))
        # Keep track of what's been processed
        print('processed', datafile)
print('all bios downloaded')
driver.quit()

In [94]:
def features_from_bio(row):
    Bio = row['bios']
    Name = row['name']
    if not Bio or type(Bio) != str:
        return [np.nan]*5
    try:
        # Is it the correct Bio?
        if not Name.lower() in Bio.lower():
            return [np.nan]*5
        
        # Determine gender based on pronouns in text
        gender = np.nan
        if 'she' in Bio:
            gender = 'Female'
        elif 'he' in Bio:
            gender = 'Male'

        # Determine degree based on mentions in text
        bachelor = ['B.S.', 'Bachelor of Science', ' BS ', "bachelor’s degree"]
        master = ['M.S.', 'Masters of Science', ' MS ', 'Master', 'MBA', 'M.B.A.']
        doctor = ['Dr.', 'Doctor of']

        degree = np.nan
        if any(title.lower() in Bio.lower() for title in bachelor):
            degree = 'Bachelor'
        if any(title.lower() in Bio.lower() for title in master):
            degree = 'Master'    
        if any(title.lower() in Bio.lower() for title in doctor):
            degree = 'Doctorate'

        # Is there any mention of awards?
        awards = False
        accolades = [' award', ' fortune ', ' forbes ']
        for acc in accolades:
            if acc.lower() in Bio.lower():
                awards = True

        # Assume the earliest year mentioned is the start of their career.    
        years = [int(x) for x in re.findall(r'\d{4}', str(Bio))]
        if years:
            career_start = min(years)
        else:
            career_start = np.nan

        # Is there any mention of a top university 
        # https://www.topuniversities.com/university-rankings/world-university-rankings/2019
        Top25Universities = ['Massachusetts Institute of Technology', 'Stanford University', 'Harvard University',
                          'California Institute of Technology', 'University of Oxford', 'University of Cambridge', 
                          'Swiss Federal Institute of Technology', 'Imperial College London', 'University of Chicago',
                          'University College London', 'National University of Singapore', 'Nanyang Technological University',
                          'Princeton University', 'Cornell University', 'Yale University', 'Columbia University',
                          'Tsinghua University', 'University of Edinburgh', 'University of Pennsylvania', 'University of Michigan',
                          'Johns Hopkins University', 'école polytechnique fédérale de lausanne', 'University of Tokyo',
                             'Australian National University', 'University of Hong Kong'
                          ]

        TopUniversity = False
        for uni in Top25Universities:
            if uni.lower() in Bio.lower():
                TopUniversity = True
    except TypeError:
        return [np.nan]*5        
    
    return gender, degree, awards, career_start, TopUniversity

# if we found the wrong Bio for the name, erase the age.
# Other features are not generated in features_from_bio if
# names do not match.
def verify_name(row):
    if row['name'].lower() in row['bios'].lower():
        return row['age']
    else:
        return np.nan

for datafile in random.sample(os.listdir('./bios/'), len(os.listdir('./bios/'))):
    if not 'processed_'+datafile in os.listdir('./bios_processed/') and datafile != '.DS_Store':
        df = pd.read_csv('./bios/'+datafile, header=0, index_col=0)
        df[['gender', 'degree', 'awards', 'career start', 'top university']] = \
        df.apply(features_from_bio, axis=1, result_type='expand')
        df['age'] = df.apply(verify_name, axis=1)
        df.to_csv('./bios_processed/'+'processed_'+datafile)

In [3]:
dfs = []
for datafile in os.listdir('./bios_processed/'):
    if datafile != '.DS_Store':
        df = pd.read_csv('./bios_processed/'+datafile, header=0, index_col=0)
        dfs.append(df)

exec_features = pd.concat(dfs)

exec_features[exec_features['bios'].notnull()].sample(10)

Unnamed: 0,name,title,bio_urls,bios,age,gender,degree,awards,career start,top university
146,Patrick K. Decker,CEO,https://www.bloomberg.com/research/stocks/priv...,Mr. Patrick K. Decker has been the Chief Execu...,53.0,Male,Bachelor,False,2003.0,False
102,Mark E. Hood,CEO,https://www.bloomberg.com/research/stocks/peop...,Mr. Mark E. Tryniski has been the President an...,58.0,,,,,
2216,Jay W. Rembolt,CFO,https://www.bloomberg.com/research/stocks/peop...,Mr. Jay W. Rembolt has been the Chief Financia...,67.0,Male,Master,False,1991.0,False
581,Mark A. Klein,CEO,https://www.bloomberg.com/research/stocks/priv...,Mr. Mark A. Klein serves as the President and ...,63.0,Male,Master,False,1976.0,False
2264,David M. Johnson,CFO,https://www.bloomberg.com/research/stocks/peop...,Mr. David Johnson serves as the Chief Financia...,,,,,,
2304,William L. Prater,CFO,https://www.bloomberg.com/research/stocks/priv...,"Mr. William L. Trubeck, also known as Bill, se...",71.0,,,,,
629,James Q. Crowe,CEO,https://www.bloomberg.com/research/stocks/priv...,"Mr. James Q. Crowe, also known as Jim, serves ...",68.0,Male,Master,False,1993.0,False
149,John J. Greisch,CEO,https://www.bloomberg.com/research/stocks/priv...,Mr. John J. Greisch serves as an Independent D...,63.0,Male,Master,False,2006.0,False
3639,Jeff A. Zadoks,CFO,https://www.bloomberg.com/research/stocks/priv...,Mr. Jeff A. Zadoks has been Chief Financial Of...,53.0,Male,,False,1999.0,False
53,James Dimon,CEO,https://www.bloomberg.com/research/stocks/peop...,"Mr. James Dimon, also known as Jamie, has been...",62.0,Male,Master,False,1978.0,True


In [4]:
exec_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 857 entries, 0 to 709
Data columns (total 10 columns):
name              857 non-null object
title             857 non-null object
bio_urls          745 non-null object
bios              630 non-null object
age               559 non-null float64
gender            348 non-null object
degree            221 non-null object
awards            353 non-null object
career start      346 non-null float64
top university    353 non-null object
dtypes: float64(2), object(8)
memory usage: 73.6+ KB


In [5]:
exec_features.to_csv('./exec_features.csv')