**Table of contents**<a id='toc0_'></a>    
- [Libraries](#toc1_)    
- [Loading 10 text filings per type](#toc2_)    
- [Preprocessing: a pipeline example](#toc3_)    
- [Dictionary approach: Loughran-McDonald, VADER](#toc4_)    
  - [Loughran-McDonald](#toc4_1_)    
  - [Vader](#toc4_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Libraries](#toc0_)

In [22]:
import os
import numpy as np
import time
os.chdir(os.environ.get('PROJECT_PATH'))
from secnlp.ml_logic import data as d
from secnlp.ml_logic import parsing as p
from secnlp.ml_logic import preprocessing as pre
import secnlp.ml_logic.parsing
from secnlp import utils as u
from secnlp.params import *
import pandas as pd
import importlib
import nltk
nltk.download('vader_lexicon')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Debora/nltk_data...


# <a id='toc2_'></a>[Loading 10 text filings per type](#toc0_)

In [2]:
# load whole filings table
df = u.read_data_from_bq(credentials = SERVICE_ACCOUNT, gcp_project = PROJECT, bq_dataset = DATASET_ID, table = FILINGS_10KQ_TABLE_ID)
df['date_filed'] = pd.to_datetime(df['date_filed'])

In [3]:
# sample 10 "Management & Discussion Analysis" sections of 10-K filings
importlib.reload(secnlp.ml_logic.parsing)
filing_sample_10k = df[(df['date_filed'].dt.year == 2023) & (df['form_type'] == '10-K')].sample(10)
filing_sample_10k['raw_filing'] = filing_sample_10k['file_name'].apply(lambda url: d.fetch_text_from_url(url, agent = AGENT))
filing_sample_10k['mda'] = filing_sample_10k['raw_filing'].apply(lambda x: p.parse_10k_filing_items(x, item = '7'))


Unable to locate Item 7
Unable to locate Item 7
Unable to locate Item 7


In [6]:
# sample 10 "Management & Discussion Analysis" sections of 10-Q filings
filing_sample_10q = df[(df['date_filed'].dt.year == 2023) & (df['form_type'] == '10-Q')].sample(10)
filing_sample_10q['raw_filing'] = filing_sample_10q['file_name'].apply(lambda url: d.fetch_text_from_url(url, agent = AGENT))
filing_sample_10q['mda'] = filing_sample_10q['raw_filing'].apply(lambda x: p.parse_10q_filing_items(x, item = '2'))

Unable to locate Item 2
Unable to locate Item 2
Unable to locate Item 2


# <a id='toc3_'></a>[Preprocessing: a pipeline example](#toc0_)

In [46]:
# Cleaning
text = p.cleaning(filing_sample_10k['mda'].iloc[1])
# Tokenizing
tokenized = word_tokenize(text)
# Lemmatizing
verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in tokenized]
noun_verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in verb_lemmatized]
# Vectorizing
vectorizer = CountVectorizer(stop_words=None,ngram_range = (2,2))
X_bow = vectorizer.fit_transform([" ".join(noun_verb_lemmatized)])

In [87]:
# Apply full preprocessing pipeline to the data
print(pre.pipeline_without_stop_words)
X_bow = pre.pipeline_without_stop_words.fit_transform([filing_sample_10k['mda'].iloc[1]])
display(pd.DataFrame(X_bow.toarray()))

Pipeline(steps=[('cleaning',
                 FunctionTransformer(func=<function cleaning at 0x142e6cb80>)),
                ('vectorizing',
                 CountVectorizer(ngram_range=(2, 2),
                                 tokenizer=<secnlp.ml_logic.preprocessing.LemmaTokenizer object at 0x147a8d690>))])




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,501,502,503,504,505,506,507,508,509,510
0,8,68,114,46,14,4,6,11,14,5,...,8,45,2,14,2,2,6,6,6,25


# <a id='toc4_'></a>[Dictionary approach: Loughran-McDonald, VADER](#toc0_)

## <a id='toc4_1_'></a>[Loughran-McDonald](#toc0_)

In [9]:
loughran_mcdonald = u.read_data_from_bq(credentials = SERVICE_ACCOUNT, gcp_project = PROJECT, bq_dataset = DATASET_ID, table = LOUGHRAN_MCDONALD_TABLE_ID)
display(loughran_mcdonald.head(10))

Unnamed: 0,Word,Seq_num,Word_Count,Word_Proportion,Average_Proportion,Std_Dev,Doc_Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Syllables,Source
0,WICKING,85138,613,2.684178e-08,1.801378e-08,1.877609e-06,306,0,0,0,0,0,0,0,0,10K_2014
1,MISALLOCATION,46609,1003,4.391894e-08,5.292175e-08,4.437696e-06,628,0,0,0,0,0,0,0,0,10K_2018
2,TIMESCALES,77593,1003,4.391894e-08,2.024676e-08,1.712593e-06,459,0,0,0,0,0,0,0,0,10K_2014
3,UNSEASONAL,81884,404,1.769018e-08,1.899141e-08,1.960269e-06,332,0,0,0,0,0,0,0,0,10K_2014
4,RANSOMWARE,60772,7271,3.183794e-07,1.577411e-07,4.265672e-06,5231,0,0,0,0,0,0,0,0,10K_2018
5,CARETAKING,10661,571,2.50027e-08,2.665711e-08,3.132363e-06,239,0,0,0,0,0,0,0,0,10K_2014
6,UNDERSUBSCRIPTION,80820,571,2.50027e-08,8.405338e-09,1.357159e-06,88,0,0,0,0,0,0,0,0,10K_2018
7,CRYPTOCURRENCY,17473,13673,5.987075e-07,5.765699e-07,3.263651e-05,1229,0,0,0,0,0,0,0,0,10K_2018
8,STRATEGIZING,73555,557,2.438968e-08,3.381807e-08,2.284727e-06,454,0,0,0,0,0,0,0,0,10K_2014
9,REPROPOSED,63732,348,1.523808e-08,6.361126e-09,6.235705e-07,209,0,0,0,0,0,0,0,0,10K_2014


In [26]:
loughran_mcdonald['Negative'].value_counts()

Negative
0        84176
2009      2305
2014        26
2011        13
-2020       10
2012         1
Name: count, dtype: Int64

In [27]:
pos_words = loughran_mcdonald[loughran_mcdonald['Positive'] > 0]['Word'].values
neg_words = loughran_mcdonald[loughran_mcdonald['Negative'] > 0]['Word'].values

In [28]:
# Cleaning
text = p.cleaning(filing_sample_10k['mda'].iloc[1])
# Tokenizing
tokenized = word_tokenize(text)
# Stop Words
stop_words = stopwords.words('english')
# Lemmatizing
verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in tokenized if word not in stop_words]
noun_verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in verb_lemmatized if word not in stop_words]
# Calculating sentiment score
num_pos = [i for i in noun_verb_lemmatized if i.upper() in pos_words]
num_neg = [i for i in noun_verb_lemmatized if i.upper() in neg_words]
sentiment_score_ld = round(len(num_pos) / (len(num_neg)+1), 2)
print(f"Positive Words Count: {len(num_pos)}, Negative Words Count: {len(num_neg)}, Sentiment Score:{sentiment_score_ld}")

Positive Words Count: 176, Negative Words Count: 643, Sentiment Score:0.3


## <a id='toc4_2_'></a>[Vader](#toc0_)

In [23]:
sent_analyzer = SentimentIntensityAnalyzer()
sentiment_score_vader = sent_analyzer.polarity_scores(filing_sample_10k['mda'].iloc[1])
print(sentiment_score_vader)


{'neg': 0.042, 'neu': 0.857, 'pos': 0.101, 'compound': 1.0}
