# IS4228 Project: NLP in Finance

In [12]:
%pip install spacy
!python3 -m spacy download en_core_web_sm
%pip install gensim
%pip install textblob
%pip install sumy

Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ...

## Use Case: Summarisation

### Named Entity Recognition (NER)

In [16]:
import pandas as pd
import spacy
# Loading the English model
nlp = spacy.load('en_core_web_sm')

text = "Alphabet, the parent company of Google, bounced back from an absolutely dreadful day for tech shares, as its stock surged Thursday after the closing bell. All it had to do was to hand out billions of dollars to investors. The tech giant announced its first quarterly cash dividend, saying it will pay $0.20 per share on June 17 to shareholders of record as of June 10, as well as a $70 billion share buyback. Buybacks and dividends help to boost stock prices by rewarding investors with cash just for holding the stock — but they’re widely criticized for artificially inflating the stock price without spending on employees or improvements to the underlying business."

# Processing the text
doc = nlp(text)

ner_data = []
for entity in doc.ents:
    if entity.label_ == 'ORG':
        label = 'ORG (Organisation)'
    elif entity.label_ == 'GPE':
        label = 'GPE (Geopolitical Entity)'
    else:
        label = entity.label_
    ner_data.append([entity.text, label])

ner_df = pd.DataFrame(ner_data, columns=['Entity', 'Category'])
ner_df

Unnamed: 0,Entity,Category
0,Alphabet,GPE (Geopolitical Entity)
1,Google,ORG (Organisation)
2,Thursday,DATE
3,billions of dollars,MONEY
4,first,ORDINAL
5,quarterly,DATE
6,0.20,MONEY
7,June 17,DATE
8,June 10,DATE
9,$70 billion,MONEY


In [17]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

### Relationship Extraction (RE)

In [18]:
from spacy.matcher import Matcher 
from spacy.tokens import Span 

def get_relation(doc):
    # Matcher class object 
    matcher = Matcher(nlp.vocab)

    #define the pattern 
    pattern1 = [{'POS':'VERB', 'OP':'+'}, {'POS':'ADV', 'OP':'?'}, {'POS':'ADP', 'OP':'?'}]

    matcher.add("match_1",[pattern1])  

    matches = matcher(doc)
    relation_spans = []
    for i in range(len(matches)):
        span = Span(doc, matches[i][1], matches[i][2], 'Relation')
        relation_spans.append(span)

    return relation_spans

doc.spans['sc'] = get_relation(doc)
displacy.render(doc, style='span', jupyter=True, options={'colors': {'Relation': 'lightgreen'}})

### Topic Labelling

In [19]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import string

cleaned = text.translate(str.maketrans(dict.fromkeys(string.punctuation)))
doc1 = nlp(cleaned)

texts = [[t.text for t in doc1]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
lda_model.show_topics()

[(0,
  '0.041*"to" + 0.035*"as" + 0.034*"of" + 0.033*"the" + 0.031*"stock" + 0.021*"for" + 0.018*"investors" + 0.018*"tech" + 0.018*"it" + 0.018*"cash"'),
 (1,
  '0.026*"stock" + 0.024*"as" + 0.023*"to" + 0.023*"the" + 0.020*"for" + 0.019*"of" + 0.014*"on" + 0.014*"it" + 0.014*"cash" + 0.014*"investors"'),
 (2,
  '0.045*"to" + 0.035*"the" + 0.030*"stock" + 0.029*"as" + 0.028*"of" + 0.020*"for" + 0.017*"tech" + 0.017*"its" + 0.015*"share" + 0.015*"it"'),
 (3,
  '0.012*"to" + 0.012*"stock" + 0.012*"as" + 0.012*"the" + 0.012*"for" + 0.012*"of" + 0.012*"tech" + 0.012*"its" + 0.012*"on" + 0.012*"it"'),
 (4,
  '0.012*"to" + 0.012*"stock" + 0.012*"the" + 0.012*"of" + 0.012*"as" + 0.012*"it" + 0.012*"June" + 0.012*"for" + 0.012*"its" + 0.012*"cash"'),
 (5,
  '0.012*"the" + 0.012*"to" + 0.012*"stock" + 0.012*"as" + 0.012*"of" + 0.012*"tech" + 0.012*"for" + 0.012*"share" + 0.012*"it" + 0.012*"its"'),
 (6,
  '0.022*"the" + 0.020*"to" + 0.018*"for" + 0.017*"stock" + 0.017*"of" + 0.016*"on" + 0.016

### Sentiment Analysis

In [20]:
from textblob import TextBlob

polarity = TextBlob(text).sentiment.polarity
subjectivity = TextBlob(text).sentiment.subjectivity

print('Polarity: ', polarity)
print('Subjectivity: ', subjectivity) 

Polarity:  -0.1357142857142857
Subjectivity:  0.6761904761904762


### Summarisation

In [21]:
# Load Packages              
from sumy.parsers.plaintext import PlaintextParser                   
from sumy.nlp.tokenizers import Tokenizer                      

# For Strings               
parser = PlaintextParser.from_string(text,Tokenizer("english"))         
from sumy.summarizers.text_rank import TextRankSummarizer                 

# Summarize using sumy TextRank                  
summarizer = TextRankSummarizer()                   
summary =summarizer(parser.document,2)                   
text_summary=""                  
for sentence in summary:                
    text_summary += str(sentence)   

In [22]:
doc2 = nlp(text_summary)

displacy.render(doc2, style='span', jupyter=True)


Available keys: []


## Use Case: De-identification

In [29]:
import re

def mask_entities(doc):
    # Mask entities using NER
    masked_text = text
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "ORG", "GPE"]:
            start, end = ent.start_char, ent.end_char
            masked_text = masked_text[:start] + "*"*len(ent.text) + masked_text[end:]
    
    standard_masked_text = re.sub(r"\*+", '***', masked_text)
    return standard_masked_text

deidentified_text = mask_entities(doc)
doc3 = nlp(deidentified_text)
displacy.render(doc3, style='span', jupyter=True)


Available keys: []
