In [90]:
# mount it
from google.colab import drive
drive.mount('/content/drive')
# copy it there
!mkdir -p /content/final_project/data/
!cp -f /content/drive/MyDrive/ece1786/final_project/data/NLP_Labelled_Data_Company_v4.xlsx /content/final_project/data/
!cp -f /content/drive/MyDrive/ece1786/final_project/data/train_validation_data_with_embeddings.csv /content/final_project/data/
!cp -f /content/drive/MyDrive/ece1786/final_project/data/snp500_assisted_labelled.part2.csv /content/final_project/data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# This dataset is manually collected by selecting descriptions related to company's operations 
# from the 2021 annual reports found on company's public sites.  
# Primary labels are basic off the company's GICS code, which is widely reconigzed by the indsutry.
# Additional labels are created manually when reading the descriptions by considerating
# GICS standard: https://www.spglobal.com/marketintelligence/en/documents/112727-gics-mapbook_2018_v3_letter_digitalspreads.pdf

data = pd.read_excel("/content/final_project/data/NLP_Labelled_Data_Company_v4.xlsx", sheet_name=2)
data = data[data["Ready"] == "Y"].dropna(subset=['Annual Report Description'])
data['WordCount'] = data['Annual Report Description'].apply(lambda n: len(n.split()))
columns = ["Ticker", "CompanyName", "IndustryGroupId", "IndustryGroup", "AdditionalLabel1", "AdditionalLabel2", "AdditionalLabel3", "Annual Report Description", "WordCount"]
data = data[columns]
data.shape

(191, 9)

In [6]:
label_ls = ['Automobiles & Components','Banks','Capital Goods',
 'Commercial & Professional Services','Consumer Durables & Apparel',
 'Consumer Services','Diversified Financials','Energy',
 'Food & Staples Retailing','Food, Beverage & Tobacco',
 'Health Care Equipment & Services','Household & Personal Products',
 'Insurance','Materials','Media & Entertainment',
 'Pharmaceuticals, Biotechnology & Life Sciences','Real Estate',
 'Retailing','Semiconductors & Semiconductor Equipment',
 'Software & Services','Technology Hardware & Equipment',
 'Telecommunication Services','Transportation','Utilities']
len(label_ls)

24

In [None]:
# Some quality checks
# All labesl are done properly
assert(data[data["IndustryGroup"].isin(label_ls)].shape == data.shape)
assert(data[data["AdditionalLabel1"].isin(label_ls) | data["AdditionalLabel1"].isna()].shape == data.shape)
assert(data[data["AdditionalLabel2"].isin(label_ls) | data["AdditionalLabel2"].isna()].shape == data.shape)
assert(data[data["AdditionalLabel3"].isin(label_ls) | data["AdditionalLabel3"].isna()].shape == data.shape)

# Make sure we have all labels included
assert(len(data["IndustryGroup"].unique()) == len(label_ls))

In [None]:
# Combined all possible labels into a single column, and remove all nan from each sublist
industry_group_list = data[["IndustryGroup", "AdditionalLabel1", "AdditionalLabel2", "AdditionalLabel3"]].values.tolist()
industry_group_list = [[item for item in row if item == item] for row in industry_group_list]
data["industry_group"] = industry_group_list

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer # for multi labels https://www.projectpro.io/recipes/one-hot-encoding-with-multiple-labels-in-python
from numpy import array

# Create one-hot labels
X = data["Annual Report Description"]
y = data["industry_group"]

one_hot = MultiLabelBinarizer()
labels = one_hot.fit_transform(y)
print(labels)
print(one_hot.classes_)
# print(one_hot.classes_)
data['labels'] = list(labels)
data


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['Automobiles & Components' 'Banks' 'Capital Goods'
 'Commercial & Professional Services' 'Consumer Durables & Apparel'
 'Consumer Services' 'Diversified Financials' 'Energy'
 'Food & Staples Retailing' 'Food, Beverage & Tobacco'
 'Health Care Equipment & Services' 'Household & Personal Products'
 'Insurance' 'Materials' 'Media & Entertainment'
 'Pharmaceuticals, Biotechnology & Life Sciences' 'Real Estate'
 'Retailing' 'Semiconductors & Semiconductor Equipment'
 'Software & Services' 'Technology Hardware & Equipment'
 'Telecommunication Services' 'Transportation' 'Utilities']


Unnamed: 0,Ticker,CompanyName,IndustryGroupId,IndustryGroup,AdditionalLabel1,AdditionalLabel2,AdditionalLabel3,Annual Report Description,WordCount,industry_group,labels
9,ATVI,Activision Blizzard,5020,Media & Entertainment,Software & Services,,,We develop interactive entertainment content a...,233,"[Media & Entertainment, Software & Services]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
10,ATVI,Activision Blizzard,5020,Media & Entertainment,Software & Services,,,"Activision Blizzard, Inc. is a leading global ...",306,"[Media & Entertainment, Software & Services]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
14,AES,AES Corp,5510,Utilities,Real Estate,,,"Incorporated in 1981, AES is a global energy c...",355,"[Utilities, Real Estate]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15,AFL,AFLAC Inc,4030,Insurance,,,,Aflac Japan is the principal contributor to th...,261,[Insurance],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
16,AFL,AFLAC Inc,4030,Insurance,,,,Aflac Incorporated was incorporated in 1973 un...,272,[Insurance],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
586,VZ,Verizon Communications,5010,Telecommunication Services,,,,Verizon Communications Inc. (Verizon or the Co...,292,[Telecommunication Services],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
597,WMT,Walmart,3010,Food & Staples Retailing,,,,Our strategy is to make every day easier for b...,296,[Food & Staples Retailing],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
598,WMT,Walmart,3010,Food & Staples Retailing,,,,Sam's Club operates in 44 states in the U.S. a...,299,[Food & Staples Retailing],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
611,WHR,Whirlpool Corp.,2520,Consumer Durables & Apparel,,,,As a 110-year old company with a legacy of suc...,212,[Consumer Durables & Apparel],"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
data.groupby(by=["IndustryGroup"]).count()["CompanyName"]

IndustryGroup
Automobiles & Components                          15
Banks                                             12
Capital Goods                                     12
Commercial & Professional Services                 4
Consumer Durables & Apparel                        9
Consumer Services                                  6
Diversified Financials                             7
Energy                                             7
Food & Staples Retailing                           6
Food, Beverage & Tobacco                          10
Health Care Equipment & Services                   4
Household & Personal Products                      4
Insurance                                          8
Materials                                          7
Media & Entertainment                             14
Pharmaceuticals, Biotechnology & Life Sciences     5
Real Estate                                        6
Retailing                                         12
Semiconductors & Semiconductor E

In [None]:
print(f"Number of samples: {data.shape[0]}")
print(f"Number of samples with multiple classes: \
{len([sum([int(cls) for cls in row]) for row in data.labels if sum([int(cls) for cls in row]) > 1 ])}")

Number of samples: 191
Number of samples with multiple classes: 24


In [None]:
import pandas as pd
data = pd.read_csv("train_validation_data.csv")
data = data[data.text != "none"]
data

Unnamed: 0,ticker,text,length,industry_group,labels
0,A,"agilent technologies, inc. provides applicatio...",247,"Pharmaceuticals, Biotechnology & Life Sciences",[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
1,AA,"alcoa corporation, together with its subsidiar...",156,Materials,[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
2,AAL,"american airlines group inc., through its subs...",103,Transportation,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
3,AAN,"the aaron's company, inc. provides lease-to-ow...",88,Retailing,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
4,AAOI,"applied optoelectronics, inc. designs, manufac...",73,Technology Hardware & Equipment,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
...,...,...,...,...,...
4741,PSTV,"plus therapeutics, inc., a clinical-stage phar...",96,"Pharmaceuticals, Biotechnology & Life Sciences",[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
4742,BNTC,"benitec biopharma inc., a development-stage bi...",57,"Pharmaceuticals, Biotechnology & Life Sciences",[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
4743,NUWE,"nuwellis, inc., a medical device company, focu...",138,Health Care Equipment & Services,[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
4744,RADI,"radius global infrastructure, inc., together w...",73,Telecommunication Services,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


In [None]:
import pandas as pd
data = pd.read_csv("/content/final_project/data/train_validation_data_with_embeddings.csv")
# Split every k=1000 records
k=1000
!mkdir /content/final_project/data/train_validation_data_with_embeddings/
for i in range(0, data.shape[0], k):
  data[i:i+k].to_parquet(f"/content/final_project/data/train_validation_data_with_embeddings/file.part{int(i/k)}.parquet", index=False)

mkdir: cannot create directory ‘/content/final_project/data/train_validation_data_with_embeddings/’: File exists


In [None]:
!ls -lh /content/final_project/data/train_validation_data_with_embeddings/

total 75M
-rw-r--r-- 1 root root 22M Nov 24 03:48 file.part0.parquet
-rw-r--r-- 1 root root 22M Nov 24 03:48 file.part1.parquet
-rw-r--r-- 1 root root 22M Nov 24 03:48 file.part2.parquet
-rw-r--r-- 1 root root 11M Nov 24 03:48 file.part3.parquet


In [96]:
import pandas as pd
import ast

data_snp500_1 = pd.read_csv("DeCo/data/snp500_assisted_labelled.part1.csv")
data_snp500_2 = pd.read_csv("/content/final_project/data/snp500_assisted_labelled.part2.csv")
data_snp500 = pd.concat([data_snp500_1, data_snp500_2])
data_snp500.dropna(subset=['hand_labeled'], inplace=True)

# Quality control step. Make sure labels are properly done with valid format
data_snp500.hand_labeled = data_snp500.hand_labeled.apply(lambda labels : ast.literal_eval(labels))
data_snp500.original = data_snp500.original.apply(lambda original: ast.literal_eval(original)[0])
data_snp500.preditions = data_snp500.preditions.apply(lambda labels : ast.literal_eval(labels))
# If failed find malformatted string representation for labels of list 
# for labels in data_snp500.hand_labeled:
#   try:
#     ast.literal_eval(labels)
#   except:
#     print(labels)

# Check all labels in the list are valid labels in label_ls
assert (data_snp500[data_snp500.hand_labeled.apply(lambda labels: all(label in label_ls for label in labels)) == False].shape[0] == 0)

# Check distribution
# First fix the "original" column and turn it back to a string, from a string representation of a list
print(f">>>>>>>>>> Number of samples: {data_snp500.shape[0]} >>>>>>>>>>")
print(data_snp500.groupby(['original']).count()['ticker'])

# Check number of tickers with multi-labels
num_multi_labels = data_snp500[data_snp500.hand_labeled.apply(lambda labels:len(labels) > 1)].shape[0]
print(f">>>>>>>>>> Number of samples with multiple labels: {num_multi_labels} >>>>>>>>>>")
print(data_snp500[data_snp500.hand_labeled.apply(lambda labels:len(labels) > 1)].groupby(['original']).count()['ticker'])

>>>>>>>>>> Number of samples: 319 >>>>>>>>>>
original
Automobiles & Components                           5
Banks                                             17
Capital Goods                                     27
Commercial & Professional Services                 9
Consumer Durables & Apparel                       18
Consumer Services                                 14
Diversified Financials                            17
Energy                                            21
Food & Staples Retailing                           5
Food, Beverage & Tobacco                          20
Health Care Equipment & Services                  11
Household & Personal Products                      6
Insurance                                         15
Materials                                         17
Media & Entertainment                              8
Pharmaceuticals, Biotechnology & Life Sciences    12
Real Estate                                       22
Retailing                                    

In [97]:
data_snp500.to_parquet('/content/final_project/data/snp500_assisted_labelled.merged.parquet', index=False)

In [98]:
data_snp500_reload = pd.read_parquet('/content/final_project/data/snp500_assisted_labelled.merged.parquet')
# Note that using parquet we store the list as a list object in the file, and when loading back
# the list column is recovered as a list properly without further processing
for label in data_snp500_reload[5:6].hand_labeled.values[0] :
  print (label)

Banks
Diversified Financials
