In [101]:
# # Mount Google Drive (only needed when running on Google Colab)
# from google.colab import drive
# drive.mount('/content/drive')
# # Copy the labelled dataset from Drive into the local project data folder
# !mkdir -p /content/final_project/data/
# !cp -f /content/drive/MyDrive/ece1786/final_project/data/NLP_Labelled_Data_Company_v4.xlsx /content/final_project/data/

In [102]:
import pandas as pd
# This dataset was collected manually by selecting descriptions of each company's operations
# from the 2021 annual reports published on the companies' public websites.
# Primary labels are based on each company's GICS code, which is widely recognized by the industry.
# Additional labels were assigned manually while reading the descriptions, following the
# GICS standard: https://www.spglobal.com/marketintelligence/en/documents/112727-gics-mapbook_2018_v3_letter_digitalspreads.pdf

# Columns carried through the rest of the notebook: company identifiers,
# primary and additional industry-group labels, the description text and its word count.
columns = [
    "Ticker",
    "CompanyName",
    "IndustryGroupId",
    "IndustryGroup",
    "AdditionalLabel1",
    "AdditionalLabel2",
    "AdditionalLabel3",
    "Annual Report Description",
    "WordCount",
]

# Read the sheet at position 2 of the workbook, keep only rows marked Ready == "Y",
# and discard rows that have no description text.
data = (
    pd.read_excel("final_project/data/NLP_Labelled_Data_Company_v4.xlsx", sheet_name=2)
    .loc[lambda frame: frame["Ready"] == "Y"]
    .dropna(subset=['Annual Report Description'])
)

# Whitespace-delimited token count of each description.
data['WordCount'] = data['Annual Report Description'].map(lambda text: len(text.split()))

# Restrict to the columns of interest and show (rows, columns) as the cell output.
data = data[columns]
data.shape

(191, 9)

In [103]:
# Industry-group names accepted as labels, one per line in alphabetical order.
label_ls = [
    "Automobiles & Components",
    "Banks",
    "Capital Goods",
    "Commercial & Professional Services",
    "Consumer Durables & Apparel",
    "Consumer Services",
    "Diversified Financials",
    "Energy",
    "Food & Staples Retailing",
    "Food, Beverage & Tobacco",
    "Health Care Equipment & Services",
    "Household & Personal Products",
    "Insurance",
    "Materials",
    "Media & Entertainment",
    "Pharmaceuticals, Biotechnology & Life Sciences",
    "Real Estate",
    "Retailing",
    "Semiconductors & Semiconductor Equipment",
    "Software & Services",
    "Technology Hardware & Equipment",
    "Telecommunication Services",
    "Transportation",
    "Utilities",
]
# Cell output: number of distinct label names.
len(label_ls)

24

In [104]:
# Data-quality checks on the label columns. Each assertion reports the offending
# values so a failure can be traced back to the spreadsheet without extra digging.

# Every primary label must be one of the names in label_ls (a missing value counts as invalid).
invalid_primary = data.loc[~data["IndustryGroup"].isin(label_ls), "IndustryGroup"]
assert invalid_primary.empty, (
    f"Unknown IndustryGroup values: {sorted(invalid_primary.astype(str).unique())}"
)

# Additional labels are optional: NaN is allowed, but any value present must be in label_ls.
for label_col in ["AdditionalLabel1", "AdditionalLabel2", "AdditionalLabel3"]:
    col_values = data[label_col]
    invalid_extra = col_values[~(col_values.isin(label_ls) | col_values.isna())]
    assert invalid_extra.empty, (
        f"Unknown {label_col} values: {sorted(invalid_extra.astype(str).unique())}"
    )

# Make sure every label in label_ls appears at least once as a primary label.
missing_labels = set(label_ls) - set(data["IndustryGroup"])
assert not missing_labels, (
    f"Labels with no company in IndustryGroup: {sorted(missing_labels)}"
)

In [105]:
# Combine the primary label and the optional additional labels into a single
# list per row, dropping the missing entries from each row's list.
label_columns = ["IndustryGroup", "AdditionalLabel1", "AdditionalLabel2", "AdditionalLabel3"]

# pd.notna is used instead of the NaN self-comparison trick (NaN != NaN) because
# it states the intent directly and also treats None and pd.NA as missing.
industry_group_list = [
    [label for label in row if pd.notna(label)]
    for row in data[label_columns].values.tolist()
]
data["industry_group"] = industry_group_list

In [106]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer # for multi labels https://www.projectpro.io/recipes/one-hot-encoding-with-multiple-labels-in-python
from numpy import array

# Inputs for modelling: description text (X) and the per-row label lists (y).
X, y = data["Annual Report Description"], data["industry_group"]

# Encode each row's label list as a 0/1 indicator vector.
one_hot = MultiLabelBinarizer()
labels = one_hot.fit_transform(y)
print(labels)
# To see which label each indicator column stands for: print(one_hot.classes_)

# Store each row's indicator vector next to its text, then display the frame.
data['labels'] = [encoded_row for encoded_row in labels]
data


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,Ticker,CompanyName,IndustryGroupId,IndustryGroup,AdditionalLabel1,AdditionalLabel2,AdditionalLabel3,Annual Report Description,WordCount,industry_group,labels
9,ATVI,Activision Blizzard,5020,Media & Entertainment,Software & Services,,,We develop interactive entertainment content a...,233,"[Media & Entertainment, Software & Services]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
10,ATVI,Activision Blizzard,5020,Media & Entertainment,Software & Services,,,"Activision Blizzard, Inc. is a leading global ...",306,"[Media & Entertainment, Software & Services]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
14,AES,AES Corp,5510,Utilities,Real Estate,,,"Incorporated in 1981, AES is a global energy c...",355,"[Utilities, Real Estate]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15,AFL,AFLAC Inc,4030,Insurance,,,,Aflac Japan is the principal contributor to th...,261,[Insurance],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
16,AFL,AFLAC Inc,4030,Insurance,,,,Aflac Incorporated was incorporated in 1973 un...,272,[Insurance],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
586,VZ,Verizon Communications,5010,Telecommunication Services,,,,Verizon Communications Inc. (Verizon or the Co...,292,[Telecommunication Services],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
597,WMT,Walmart,3010,Food & Staples Retailing,,,,Our strategy is to make every day easier for b...,296,[Food & Staples Retailing],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
598,WMT,Walmart,3010,Food & Staples Retailing,,,,Sam's Club operates in 44 states in the U.S. a...,299,[Food & Staples Retailing],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
611,WHR,Whirlpool Corp.,2520,Consumer Durables & Apparel,,,,As a 110-year old company with a legacy of suc...,212,[Consumer Durables & Apparel],"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
