# Predicting Sentiments

## Prepare `edgar` sample dataframe

In [2]:
from ekorpkit import eKonf

# Compose the blank pipeline config and point it at the EDGAR sample archive,
# which is referenced through the `cached_path` interpolation.
df_cfg = eKonf.compose(config_group="pipeline=blank")
df_cfg.name = "edgar_sample"
df_cfg.data_dir = "${cached_path:'https://github.com/entelecheia/ekorpkit-config/raw/main/data/edgar.zip',true,false}"
# The directory is read back and extended with the sub-folder in a second step,
# exactly as two separate assignments (read, append, write).
df_cfg.data_dir = df_cfg.data_dir + "/edgar"
df_cfg.data_file = "edgar.parquet"
df_cfg.columns_to_keep = ["id", "filename", "item", "cik", "company", "text"]

# Instantiate the config and preview the first rows of the result.
df = eKonf.instantiate(df_cfg)
df.head()

Unnamed: 0,id,filename,item,cik,company,text
1410,1534,1999/320193_10K_1999_0000912057-99-010244.json,item_1,320193,APPLE COMPUTER INC,"ITEM 1. \nBUSINESS GENERAL Apple Computer, Inc..."
1560,1697,1999/21344_10K_1999_0000021344-00-000009.json,item_1,21344,COCA COLA CO,ITEM 1. \nBUSINESS The Coca-Cola Company (toge...
2746,2977,1999/70858_10K_1999_0000950168-00-000621.json,item_1,70858,BANK OF AMERICA CORP /DE/,Item 1. \nBUSINESS General Bank of America Cor...
3762,4088,1999/80424_10K_1999_0000080424-99-000027.json,item_1,80424,PROCTER & GAMBLE CO,Item 1. \nBusiness. \n--------- General Develo...
4806,5211,1999/1018724_10K_1999_0000891020-00-000622.json,item_1,1018724,AMAZON COM INC,ITEM 1. \nBUSINESS This Annual Report on Form ...


## Prepare `financial_phrasebank` dataset

In [3]:
from ekorpkit import eKonf

# Dataset config for `financial_phrasebank`; the archive location is given
# through the `cached_path` interpolation.
ds_cfg = eKonf.compose(config_group="dataset")
ds_cfg.name = "financial_phrasebank"
ds_cfg.data_dir = "${cached_path:'gd://15TUjIfQstCYXU82s0pcaxh2QcSWf1zyG:financial_phrasebank.zip',true,false}"

# Keep both the config (`ds_cfg`) and the instantiated object (`ds`) around.
ds = eKonf.instantiate(ds_cfg)

## Compose a config for the LM sentiment analyser class

In [4]:
from ekorpkit import eKonf

# Select the `lm` option of the `model/sentiment` config group.
config_group = "model/sentiment=lm"
model_cfg = eKonf.compose(config_group=config_group)

# Switch on lemmatization in the preprocessor's NLTK tokenizer settings.
model_cfg.preprocessor.tokenizer.nltk.lemmatize = True

## Instantiating a sentiment analyser class and predicting sentiments of the `edgar` dataset

In [5]:
# Pipeline config: load the EDGAR parquet file and run a single `predict` step
# that uses the sentiment model config composed earlier (`model_cfg`).
cfg = eKonf.compose(config_group="pipeline")
cfg.verbose = True
cfg.name = "edgar_sentiments"

# Input data: archive location first, then the sub-folder appended in a second step.
cfg.data_dir = "${cached_path:'https://github.com/entelecheia/ekorpkit-config/raw/main/data/edgar.zip',true,false}"
cfg.data_dir = cfg.data_dir + "/edgar"
cfg.data_file = "edgar.parquet"
cfg.columns_to_keep = ["id", "filename", "item", "cik", "company", "text"]

# Prediction step: model, output location, and output file named after the pipeline.
cfg._pipeline_ = ["predict"]
cfg.predict.model = model_cfg
cfg.predict.output_dir = "./data/predict"
cfg.predict.output_file = "{}-lm.parquet".format(cfg.name)
cfg.num_workers = 100

# Run the pipeline and preview the first rows of the returned dataframe.
df = eKonf.instantiate(cfg)
df.head()

Loading 1 dataframes from ['/root/.ekorpkit/.cache/cached_path/bba50371f814f64ee7290d4bfcc1af428ca7105a08f463fcb32d5b8ba35abec9.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']


Predicting [text]: 100%|██████████| 100/100 [00:31<00:00,  3.18it/s]


 >> saved dataframe to ./data/predict/edgar_sentiments-lm.parquet


Unnamed: 0,id,filename,item,cik,company,text,positive,negative,num_tokens,polarity,subjectivity,polarity_label,uncertainty
1410,1534,1999/320193_10K_1999_0000912057-99-010244.json,item_1,320193,APPLE COMPUTER INC,"ITEM 1. \nBUSINESS GENERAL Apple Computer, Inc...",0.007679,0.009726,3907,-0.117647,0.017405,neutral,0.011262
1560,1697,1999/21344_10K_1999_0000021344-00-000009.json,item_1,21344,COCA COLA CO,ITEM 1. \nBUSINESS The Coca-Cola Company (toge...,0.004727,0.004137,6769,0.066667,0.008864,neutral,0.014773
2746,2977,1999/70858_10K_1999_0000950168-00-000621.json,item_1,70858,BANK OF AMERICA CORP /DE/,Item 1. \nBUSINESS General Bank of America Cor...,0.004575,0.007149,3497,-0.219512,0.011724,negative,0.008579
3762,4088,1999/80424_10K_1999_0000080424-99-000027.json,item_1,80424,PROCTER & GAMBLE CO,Item 1. \nBusiness. \n--------- General Develo...,0.004747,0.003956,1264,0.090909,0.008703,neutral,0.011867
4806,5211,1999/1018724_10K_1999_0000891020-00-000622.json,item_1,1018724,AMAZON COM INC,ITEM 1. \nBUSINESS This Annual Report on Form ...,0.013912,0.017229,12363,-0.106494,0.031141,neutral,0.019979


In [6]:
# Show where the prediction step writes its result: directory, then file name,
# one per line.
print(cfg.predict.output_dir, cfg.predict.output_file, sep="\n")

./data/predict
edgar_sentiments-lm.parquet


## Instantiating a transformer classification model with the `financial_phrasebank` dataset

In [5]:
from ekorpkit import eKonf

# Overrides adding the simple-classification transformer group and the
# `finbert` pretrained group to the composed config.
overrides = [
    "+model/transformer=simple_classification",
    "+model/transformer/pretrained=finbert",
]
model_cfg = eKonf.compose(
    config_group="model/transformer=simple_classification",
    overrides=overrides,
)

# Train on the financial_phrasebank dataset config defined earlier.
model_cfg.dataset = ds_cfg
model_cfg.verbose = False

# Training settings: 2 epochs, sequences capped at 256, batches of 32 for
# both training and evaluation.
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
model_cfg.config.train_batch_size = model_cfg.config.eval_batch_size = 32

model_cfg.labels = ["positive", "neutral", "negative"]
# `_call_` lists `train`; presumably instantiation invokes it — confirm against ekorpkit.
model_cfg._call_ = ["train"]
eKonf.instantiate(model_cfg)

<ekorpkit.models.transformer.simple.SimpleClassification at 0x7fd897d35250>

In [6]:
# Pipeline config: load the EDGAR parquet file and run a single `predict` step
# that uses the transformer classification config (`model_cfg`).
cfg = eKonf.compose(config_group="pipeline")
cfg.verbose = True
cfg.name = "edgar_sentiments"

# Input data: archive location first, then the sub-folder appended in a second step.
cfg.data_dir = "${cached_path:'https://github.com/entelecheia/ekorpkit-config/raw/main/data/edgar.zip',true,false}"
cfg.data_dir = cfg.data_dir + "/edgar"
cfg.data_file = "edgar.parquet"
cfg.columns_to_keep = ["id", "filename", "item", "cik", "company", "text"]

# Prediction step: model, output location, and output file named after the pipeline.
cfg._pipeline_ = ["predict"]
cfg.predict.model = model_cfg
cfg.predict.output_dir = "./data/predict"
cfg.predict.output_file = "{}-finbert.parquet".format(cfg.name)
cfg.num_workers = 1

# Run the pipeline and preview the first rows of the returned dataframe.
df = eKonf.instantiate(cfg)
df.head()

Loading 1 dataframes from ['/root/.ekorpkit/.cache/cached_path/bba50371f814f64ee7290d4bfcc1af428ca7105a08f463fcb32d5b8ba35abec9.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']


  0%|          | 0/1585 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4252 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/945 [00:00<?, ?it/s]

 >> saved dataframe to ./data/predict/edgar_sentiments-finbert.parquet


Unnamed: 0,id,filename,item,cik,company,text,pred_labels
1410,1534,1999/320193_10K_1999_0000912057-99-010244.json,item_1,320193,APPLE COMPUTER INC,"ITEM 1. \nBUSINESS GENERAL Apple Computer, Inc...",positive
1560,1697,1999/21344_10K_1999_0000021344-00-000009.json,item_1,21344,COCA COLA CO,ITEM 1. \nBUSINESS The Coca-Cola Company (toge...,neutral
2746,2977,1999/70858_10K_1999_0000950168-00-000621.json,item_1,70858,BANK OF AMERICA CORP /DE/,Item 1. \nBUSINESS General Bank of America Cor...,neutral
3762,4088,1999/80424_10K_1999_0000080424-99-000027.json,item_1,80424,PROCTER & GAMBLE CO,Item 1. \nBusiness. \n--------- General Develo...,neutral
4806,5211,1999/1018724_10K_1999_0000891020-00-000622.json,item_1,1018724,AMAZON COM INC,ITEM 1. \nBUSINESS This Annual Report on Form ...,positive


In [7]:
# Summarise the prediction dataframe: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1585 entries, 1410 to 1291201
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           1585 non-null   int64 
 1   filename     1585 non-null   object
 2   item         1585 non-null   object
 3   cik          1585 non-null   object
 4   company      1585 non-null   object
 5   text         1585 non-null   object
 6   pred_labels  1585 non-null   object
dtypes: int64(1), object(6)
memory usage: 99.1+ KB
