# Feature Generation pt.2

This module will determine the characteristics for each executive whom we found in pt.1.  

## Features

This module will extract features from the Bloomberg executive biographies using basic if/then logic:
1. Gender
2. Highest degree attained
3. Awards received?
4. Years in career?
5. Top university education?

In [3]:
# to work with data
import pandas as pd

# to work with regex
import re

# to download
import requests

# to work with HTML tags
from bs4 import BeautifulSoup

# to time functions
import datetime

# to use NaN
import numpy as np

# to pause
import time
import random

# to work with local files
import os

In [4]:
try:
    Company_Exec
except NameError:
    Company_Exec = pd.read_csv('Company_CEO_CFO.csv', index_col=0)

In [5]:
Company_Exec[Company_Exec['name'] == 'ADVANCED MICRO DEVICES INC'].sort_values('fy', ascending=False)

Unnamed: 0,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar
7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar
13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar
22674,0001193125-15-054362,2488,ADVANCED MICRO DEVICES INC,CA,20141231,2014,FY,20150219,Lisa T. Su,Devinder Kumar
29734,0001193125-14-057240,2488,ADVANCED MICRO DEVICES INC,CA,20131231,2013,FY,20140218,Rory P. Read,Devinder Kumar
36966,0001193125-13-069422,2488,ADVANCED MICRO DEVICES INC,CA,20121231,2012,FY,20130221,Rory P. Read,Devinder Kumar
43784,0001193125-12-075837,2488,ADVANCED MICRO DEVICES INC,CA,20111231,2011,FY,20120224,Rory P. Read,Thomas J. Seifert
47351,0001193125-11-040392,2488,ADVANCED MICRO DEVICES INC,CA,20101231,2010,FY,20110218,,Thomas J. Seifert


Pair down the list so that:
1. the cik has at least 5 filings associated with it
2. the cik has at least 5 entries for CEO and CFO

In [7]:
##### get the counts of adsh (filings), CEOs and CFOs.
adsh_ceo_cfo_count_by_cik = Company_Exec.groupby('cik', as_index=True).count()[['adsh','CEO','CFO']]

# filter them
min_entry_count = 5

cik_to_include = adsh_ceo_cfo_count_by_cik[
                            (adsh_ceo_cfo_count_by_cik['adsh'] > min_entry_count) & #check condition 1
                            (adsh_ceo_cfo_count_by_cik['CEO'] > min_entry_count) & #check cond 2
                            (adsh_ceo_cfo_count_by_cik['CFO'] > min_entry_count)]\
                            .index.values #access the index values (CIK)

# include only filings that meet the filter
Company_Exec_filtered = Company_Exec[
    Company_Exec['cik'].isin(cik_to_include)].sort_values(['cik','fy'], ascending=[True, False]).reset_index()
Company_Exec_filtered

Unnamed: 0,index,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO
0,577,0001144204-18-051414,2034,ACETO CORP,NY,20180630,2018,FY,20180928,William C. Kennally III,Rebecca Roof
1,6491,0001144204-17-045100,2034,ACETO CORP,NY,20170630,2017,FY,20170825,Salvatore Guccione,Douglas Roth
2,12927,0001571049-16-017785,2034,ACETO CORP,NY,20160630,2016,FY,20160826,Salvatore Guccione,Douglas Roth
3,19393,0001571049-15-007509,2034,ACETO CORP,NY,20150630,2015,FY,20150911,Salvatore Guccione,Douglas Roth
4,26250,0001571049-14-004448,2034,ACETO CORP,NY,20140630,2014,FY,20140905,Salvatore Guccione,Douglas Roth
5,32907,0001188112-13-002611,2034,ACETO CORP,NY,20130630,2013,FY,20130829,Salvatore Guccione,Douglas Roth
6,39801,0001188112-12-002832,2034,ACETO CORP,NY,20120630,2012,FY,20120907,Albert L. Eilender,Douglas Roth
7,1932,0000002178-18-000009,2178,"ADAMS RESOURCES & ENERGY, INC.",TX,20171231,2017,FY,20180312,,Townes G. Pressler
8,7639,0000002178-17-000019,2178,"ADAMS RESOURCES & ENERGY, INC.",TX,20161231,2016,FY,20170331,Thomas S. Smith,Josh C. Anders
9,13848,0000002178-16-000064,2178,"ADAMS RESOURCES & ENERGY, INC.",TX,20151231,2015,FY,20160311,Thomas S. Smith,Richard B. Abshire


Further pair down this list so that:
1. For each company there have been executives (including the current one) that stayed with the company for 3 years




In [8]:
Unique_CIK_filtered = Company_Exec_filtered['cik'].unique().copy()

Company_Exec_filt_Exec_3yr = []

for CIK in Unique_CIK_filtered:
    df = Company_Exec_filtered[Company_Exec_filtered['cik'] == CIK].copy()
    df['CEO_3yrs_ago'] = df['CEO'].shift(-3)
    df['CFO_3yrs_ago'] = df['CFO'].shift(-3)
    CEO_been_with_company_3yr = df['CEO'].iloc[0] == df['CEO_3yrs_ago'].iloc[0]
    CEO_been_with_company_3yr = df['CFO'].iloc[0] == df['CFO_3yrs_ago'].iloc[0]
    if CEO_been_with_company_3yr and CEO_been_with_company_3yr:
        Company_Exec_filt_Exec_3yr.append(df)

Company_Exec_filt_Exec_3yr_df = pd.concat(Company_Exec_filt_Exec_3yr)

Company_Exec_filt_Exec_3yr_df.head(5)

Unnamed: 0,index,adsh,cik,name,stprba,period,fy,fp,filed,CEO,CFO,CEO_3yrs_ago,CFO_3yrs_ago
14,1933,0000002488-18-000042,2488,ADVANCED MICRO DEVICES INC,CA,20171231,2017,FY,20180227,Lisa T. Su,Devinder Kumar,Lisa T. Su,Devinder Kumar
15,7640,0000002488-17-000043,2488,ADVANCED MICRO DEVICES INC,CA,20161231,2016,FY,20170221,Lisa T. Su,Devinder Kumar,Rory P. Read,Devinder Kumar
16,13849,0000002488-16-000111,2488,ADVANCED MICRO DEVICES INC,CA,20151231,2015,FY,20160218,Lisa T. Su,Devinder Kumar,Rory P. Read,Devinder Kumar
17,22674,0001193125-15-054362,2488,ADVANCED MICRO DEVICES INC,CA,20141231,2014,FY,20150219,Lisa T. Su,Devinder Kumar,Rory P. Read,Thomas J. Seifert
18,29734,0001193125-14-057240,2488,ADVANCED MICRO DEVICES INC,CA,20131231,2013,FY,20140218,Rory P. Read,Devinder Kumar,,Thomas J. Seifert


In [9]:
# Acquire unique names (CFO and CEO) from executives.
Unique_CEOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CEO'].unique())
Unique_CEOs.columns = ['name']
Unique_CEOs['title'] = 'CEO'

Unique_CFOs = pd.DataFrame(Company_Exec_filt_Exec_3yr_df['CFO'].unique())
Unique_CFOs.columns = ['name']
Unique_CFOs['title'] = 'CFO'

Names_to_research = pd.concat([Unique_CEOs,Unique_CFOs])
Names_to_research = Names_to_research[Names_to_research['name'].notnull()]

Names_to_research.reset_index(drop=True, inplace=True)

Names_to_research.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3657 entries, 0 to 3656
Data columns (total 2 columns):
name     3657 non-null object
title    3657 non-null object
dtypes: object(2)
memory usage: 57.2+ KB


In [1]:
def find_bio_urls(row):
    # put together the web url for a random search engine
    query = 'bloomberg executive profiles '+row['name']+' '+row['title']
    query = query.replace(' ', '+')
    
    # polite web scraping: pause and use random search engine
    time.sleep(random.randint(1,3)/2)
    engine = 'http://search.yahoo.com/search?p='
    #engine = 'https://www.bing.com/search?q='
    print('using',engine)
    query_url = engine + query
    r = requests.get(query_url, timeout=3)
    soup = BeautifulSoup(r.text)    
    
    url = ''
    
    # Check all returned links
    for a in soup.find_all('a', href=True):
        valid_url = 'bloomberg.com' in a['href'] \
        and 'Executive Profile & Biography' in a.text \
        and a.text[:5] is not '/url?'
        
        if valid_url:
            # select first hit
            if not url:
                url = a['href']
    if url:
        return url
    else:
        return np.nan

if not 'bio_urls_2210_2310.csv' in os.listdir('./bio_urls/'):     
    name_count = len(Names_to_research)

    a = 2210
    b = 2310
    while a < name_count:
        df = Names_to_research[a:b].copy()
        print('copied row count', len(df))
        df['bio_urls'] = df.apply(find_bio_urls, axis=1)
        print('processed',a,'to ',b)
        df.to_csv('./bio_urls/bio_urls_'+str(a)+'_'+str(b)+'.csv')
        print('saved',a,'to ',b)
        a = b
        b += 100
else: 
    print('Bio URLS already downloaded')

NameError: name 'os' is not defined

In [90]:
# Bloomberg blocks requests.get.
# Use Selenium with Firefox to access the site.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

def scrape_bio(url):
    # polite web scraping
    time.sleep(random.randint(1,5)/2)
    if not pd.isnull(url):
        driver.get(url)
    else:
        return np.nan
    try: 
        read_more_button = driver.find_element_by_xpath('//a[@onclick="show_more(this);"]')
        read_more_button.click()
    except NoSuchElementException as exception:
        pass
    try: 
        bio_text = driver.find_element_by_xpath('//p[@itemprop="description"]').text
    except NoSuchElementException as exception:
        bio_text = np.nan
    return bio_text
        
try: names_and_bios
except NameError:
    list_of_df = [] 
    names_and_bios = np.nan

    for datafile in ['bio_urls_0_10.csv']:
#   for datafile in os.listdir('./bio_urls/'):
        df = pd.read_csv('./bio_urls/'+datafile, index_col=0)
        #Open a Firefox instance
        driver = webdriver.Firefox()
        df['bios'] = df['bio_urls'].apply(scrape_bio)
        list_of_df.append(df)
        print('processed', datafile)

    names_and_bios = pd.concat(list_of_df, sort=False)
names_and_bios.head(10)

NoSuchWindowException: Message: Browsing context has been discarded


In [87]:
reset_selective -f names_and_bios

In [None]:
Bio = """Dr. Lisa T. Su has been the Chief Executive Officer 
and President of Advanced Micro Devices, Inc. since August 10, 2014 and 
October 2014 respectively. Dr. Su served as Chief Operating Officer of 
Advanced Micro Devices, Inc. from July 1, 2014 to October 2014. She 
served as Senior Vice President and General Manager of Global Business 
Unit at Advanced Micro Devices, Inc. from January 3, 2012 to July 2014. 
She served as Senior Vice President of Freescale Semiconductor 
Holdings I Ltd. from June 18, 2007 to December 2011 and served as its 
General Manager of Networking and Multimedia from September 2008 to 
December 2011. She is a highly-respected technology executive and has 
an Outstanding Record of business execution. She has an impressive 
track record and solid understanding of the semiconductor industry. 
She served as Senior Vice President and General Manager of Networking 
and Multimedia of Freescale Semiconductor Inc. from 2007 to 2011. 
Dr. Su served as the Chief Technology Officer of Freescale 
Semiconductor Holdings I Ltd. from June 18, 2007 to August 2009. She 
served as the Chief Technology Officer of Freescale Semiconductor Inc. 
since June 18, 2007. Dr. Su is a respected industry leader with more 
than 13 years of experience in technology development, product 
management and strategic alliances. She served as Vice President of 
Semiconductor Research and Development Center at International Business 
Machines Corp. since October 2005. She joined Freescale from IBM, where 
she served in various capacities including Vice President of Semiconductor 
Research and Development, Vice President of Technology Development and 
Alliances in the IBM Systems and Technology Group. Dr. Su served as a 
Director of the PowerPCTMproduct line and Emerging Products from 1995 
to June 2007. She served in various technical and business positions at 
IBM including Vice President of Technology Development. She has been a 
Director of Analog Devices, Inc. since June 7, 2012 and Advanced Micro 
Devices, Inc. since October 8, 2014. She has authored or co-authored 
more than 40 technical publications and co-authored a book chapter on 
next-generation consumer electronics. She was named in MIT Technology 
Review's Top 100 Young Innovators in 2002 and received the YWCA 2003 
Outstanding Achievement Award for Business. Prior to IBM, she was a 
member of the technical staff at Texas Instruments Incorporated in the 
Semiconductor Process and Device Center from 1994 to 1995. She is a 
Fellow of the Institute of Electronics and Electrical Engineers (IEEE). 
Dr. Su received B.S., M.S., and Doctorate Degrees in Electrical 
Engineering from the Massachusetts Institute of Technology. She has 
published more than 40 technical articles and was named a Fellow of 
the Institute of Electronics and Electrical Engineers in 2009. Dr. Su 
was named “2014 Executive of the Year” at the EETimes and EDN 2014 ACE 
Awards and was honored in MIT Technology Review’s Top 100 Young 
Innovators in 2002.
"""

In [None]:
# First stab at feature extracting using rule-based logic
fields = ['Gender', 'Degree', 'Awards', 'Career Start', 'Ivy League']

df = pd.DataFrame()
df['name'] = ['Lisa T. Su']
df['Bio'] = Bio

def features_from_bio(Bio):
    # Determine gender based on pronouns in text
    gender = np.nan
    if 'she' in Bio:
        gender = 'Female'
    elif 'he' in Bio:
        gender = 'Male'
    
    
    # Determine degree based on mentions in text     
    degree = np.nan
    if 'B.S.' or 'Bachelor of Science' in Bio:
        degree = 'Bachelor'
    if 'M.S.' or 'Masters of Science' in Bio:
        degree = 'Master'    
    if 'Dr.' or 'Doctor of' in Bio:
        degree = 'Doctorate'

    
    # Is there any mention of awards?
    awards = False
    if 'Award' or 'award' in Bio:
        awards = True

    
    # Assume the earliest year mentioned is the start of their career.    
    years = [int(x) for x in re.findall(r'\d{4}', str(Bio))]
    career_start = min(years)


    # Is there any mention of a top university 
    # https://www.topuniversities.com/university-rankings/world-university-rankings/2019
    Top25Universities = ['Massachusetts Institute of Technology', 'Stanford University', 'Harvard University',
                      'California Institute of Technology', 'University of Oxford', 'University of Cambridge', 
                      'Swiss Federal Institute of Technology', 'Imperial College London', 'University of Chicago',
                      'University College London', 'National University of Singapore', 'Nanyang Technological University',
                      'Princeton University', 'Cornell University', 'Yale University', 'Columbia University',
                      'Tsinghua University', 'University of Edinburgh', 'University of Pennsylvania', 'University of Michigan',
                      'Johns Hopkins University', 'école polytechnique fédérale de lausanne', 'University of Tokyo',
                         'Australian National University', 'University of Hong Kong'
                      ]
    
    TopUniversity = False
    for uni in Top25Universities:
        if uni in Bio:
            TopUniversity = True

    return gender, degree, awards, career_start, TopUniversity
    
df['Gender'], df['Degree'], df['Awards'], df['Career Start'], df['Top University'] = \
zip(*df['Bio'].map(features_from_bio))

df.head()