# Data preparation

In [11]:
import numpy as np
import pandas as pd
import json

In [12]:
# Load the raw records, keep only the text and its label, and turn blank
# strings into NaN so they are counted as missing by info()/dropna().
df = (
    pd.read_json('data_raw.json')
    .loc[:, ['description', 'sector']]
    .replace(to_replace='', value=np.nan)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9856 entries, 0 to 9855
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  9856 non-null   object
 1   sector       9833 non-null   object
dtypes: object(2)
memory usage: 154.1+ KB


In [13]:
# Discard every row in which any column is missing.
df = df.dropna(axis='index', how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9833 entries, 0 to 9855
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  9833 non-null   object
 1   sector       9833 non-null   object
dtypes: object(2)
memory usage: 230.5+ KB


In [14]:
# Frequency of each sector label, most common first.
df['sector'].value_counts()

Financial Services        2503
Healthcare                1452
Industrials               1307
Technology                1139
Consumer Cyclical         1023
Basic Materials            594
Real Estate                489
Consumer Defensive         410
Communication Services     403
Energy                     338
Utilities                  164
Industrial Goods             4
Financial                    3
Services                     3
Consumer Goods               1
Name: sector, dtype: int64

In [15]:
# Consolidate sector labels: fold the rare labels into a main sector and
# merge Energy with Utilities into a single class.
# NOTE(review): 'Services' and 'Consumer Goods' are broad labels; confirm that
# mapping them to 'Financial Services' / 'Consumer Cyclical' is intended.
sector_aliases = {
    'Services': 'Financial Services',
    'Industrial Goods': 'Industrials',
    'Financial': 'Financial Services',
    'Consumer Goods': 'Consumer Cyclical',
    'Energy': 'Energy & Utility',
    'Utilities': 'Energy & Utility',
}

# Apply the mapping to the sector column only, in a single pass, so the
# description text is never scanned or rewritten by the label clean-up.
df = df.assign(sector=df['sector'].replace(sector_aliases))

In [16]:
# Re-check the label frequencies after consolidation.
df['sector'].value_counts()

Financial Services        2509
Healthcare                1452
Industrials               1311
Technology                1139
Consumer Cyclical         1024
Basic Materials            594
Energy & Utility           502
Real Estate                489
Consumer Defensive         410
Communication Services     403
Name: sector, dtype: int64

In [17]:
# Display the frame for a visual check; the rendered output elides the middle rows.
df

Unnamed: 0,description,sector
0,Adara Acquisition Corp. does not have signific...,Financial Services
1,"AIM ImmunoTech Inc., an immuno-pharma company,...",Healthcare
2,"Ashford, Inc is an asset management firm. The ...",Financial Services
3,"The Arena Group Holdings, Inc., together with ...",Communication Services
4,"AMCON Distributing Company, together with its ...",Consumer Defensive
...,...,...
9851,"Wheaton Precious Metals Corp., a streaming com...",Basic Materials
9852,Westport Fuel Systems Inc. engages in the engi...,Consumer Cyclical
9853,"Western Copper and Gold Corporation, an explor...",Basic Materials
9854,Xanadu Mines Limited engages in the exploratio...,Basic Materials


In [18]:
# Browse the descriptions belonging to a single sector.
df.loc[df['sector'].eq('Financial Services')]

Unnamed: 0,description,sector
0,Adara Acquisition Corp. does not have signific...,Financial Services
2,"Ashford, Inc is an asset management firm. The ...",Financial Services
5,"Aberdeen Emerging Markets Equity Income Fund, ...",Financial Services
7,"Aberdeen Australia Equity Fund, Inc. is a clos...",Financial Services
11,Allspring Multi-Sector Income Fund is a closed...,Financial Services
...,...,...
9798,Royal Bank of Canada operates as a diversified...,Financial Services
9803,Sprott Inc. is a publicly owned asset manageme...,Financial Services
9806,"Sun Life Financial Inc., a financial services ...",Financial Services
9823,"The Toronto-Dominion Bank, together with its s...",Financial Services


In [19]:
# Convert the frame to a list of {'description': ..., 'sector': ...} dicts
# (one per row; the index is not included).
data_dict = df.to_dict('records')

# Write the records as a JSON array. The explicit encoding keeps the file
# independent of the platform's default text encoding.
with open('data_prep.json', 'w', encoding='utf-8') as out_file:
    json.dump(data_dict, out_file)