In [1]:
#IGNORE THESE
!echo "Ignoring these. Uncomment if required"
!#pip install spacy
!#python -m spacy download en_core_web_lg

Ignoring these. Uncomment if required


In [2]:
import spacy
from spacy import displacy
import random

In [3]:
# Load the large English model; the alternative spacy.load('en') call is
# kept here for reference only.
nlp = spacy.load('en_core_web_lg')

# From this point on, suppress every Python warning in the session.
import warnings
warnings.simplefilter('ignore')

In [4]:
# Number of entries in the loaded model's vocabulary.
len(nlp.vocab)

1344233

In [5]:
# Parse a sample sentence and print one tab-separated row per token:
# text, character offset, lemma, punctuation flag, whitespace flag,
# shape, coarse part-of-speech and fine-grained tag.
doc = nlp("Let's trade an interest rate swap")
for token in doc:
    print(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
        sep="\t",
    )

Let	0	let	False	False	Xxx	VERB	VB
's	3	-PRON-	False	False	'x	PRON	PRP
trade	6	trade	False	False	xxxx	VERB	VB
an	12	an	False	False	xx	DET	DT
interest	15	interest	False	False	xxxx	NOUN	NN
rate	24	rate	False	False	xxxx	NOUN	NN
swap	29	swap	False	False	xxxx	NOUN	NN


In [6]:
from spacy import displacy 
# Run the stock pipeline on a sentence mixing quantities, a time, a
# percentage and an acronym, then render the recognised entities inline.
# NOTE: `doc` is a notebook-global; a later cell reuses it.
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ. WSJ means the Wall Street Journal')
displacy.render(doc, style='ent', jupyter=True)

In [7]:
# Domain-specific entity labels to add to the model's NER component.
custom_entities = [
    'FIN_PRODUCT',
    'SPREAD',
    'ASSET CLASS',
    'RFQ',
    'TRADE_EXECUTION',
    'CURRENCY_PAIR',
    'EQUITY_SYMBOL',
]
a = nlp.get_pipe('ner')

# Register every label that the component's config does not already list.
for ent in custom_entities:
    if ent not in a.cfg.get('extra_labels', ()):
        a.add_label(ent)

# Store the label list on the component config and echo it as the cell output.
a.cfg['extra_labels'] = custom_entities
a.cfg['extra_labels']

['FIN_PRODUCT',
 'SPREAD',
 'ASSET CLASS',
 'RFQ',
 'TRADE_EXECUTION',
 'CURRENCY_PAIR',
 'EQUITY_SYMBOL']

In [8]:
def find_label(text, labels=None, classification='Not-Financial'):
    """Build a spaCy-style NER training example from substring annotations.

    Every non-overlapping occurrence of each key of ``labels`` inside
    ``text`` becomes a ``(start_char, end_char, entity_label)`` triple.
    Annotating all occurrences (rather than only the first) matters because
    any text left out of the entity list is presented to the trainer as
    "not an entity".

    Matching is plain, case-sensitive substring search. No attempt is made
    to resolve overlaps between *different* keys; callers must supply keys
    that do not overlap each other in ``text``.

    Args:
        text: The raw sentence to annotate.
        labels: Mapping of substring -> entity label. ``None`` or an empty
            mapping yields an example with no entities.
        classification: Accepted for call compatibility; not used.

    Returns:
        A ``(text, {'entities': [...]})`` tuple. Triples are grouped by key in
        the mapping's iteration order and, within a key, sorted by position.
    """
    entities = []
    for label, entity_type in (labels or {}).items():
        if not label:
            # An empty substring "matches" at every offset and would only
            # produce zero-width spans, so it is reported and skipped.
            print(f"Skipping empty label in '{text}'")
            continue

        start = text.find(label)
        if start == -1:
            print(f"{label} not found in '{text}'")
            continue

        while start != -1:
            end = start + len(label)
            entities.append((start, end, entity_type))
            # Resume after this match so occurrences of one key never overlap.
            start = text.find(label, end)

    return (text, {'entities': entities})

In [9]:
# Hand-labelled seed examples. Each row is
#   [text, {substring: entity label}, document class].
# Only the first two fields are consumed when TRAIN_DATA is built; the
# document-class string is carried along but not used by that step.
GOLDEN_DATA = [
    ["We price the swap at 54 bips", {"We price": "RFQ", "swap":"FIN_PRODUCT", "54 bips":"CARDINAL"}, 'FINANCIAL'],
    ["TD offers 1.123 on 10 million of CADUSD", {"TD offers": "RFQ", "CADUSD":"CURRENCY_PAIR", "1.123":"CARDINAL", '10 million':"CARDINAL"}, 'FINANCIAL'],
    ["Names like AMZN trade frequently but tech like TWLO trades on lower volume", {'TWLO':"EQUITY_SYMBOL", 'AMZN':'EQUITY_SYMBOL'}, 'FINANCIAL'],
    ["TD bids 99.91 on US 10s. Done. Thanks for the trade. Confirm to follow", {'TD bids':"RFQ", 'Done.':'TRADE_EXECUTION', 'Thanks for the trade':'TRADE_EXECUTION', 'Confirm to follow':'TRADE_EXECUTION'}, 'FINANCIAL'],
    ["Amazon (AMZN) is trading 13% higher than 12 months ago but 22% lower than the peak", {'AMZN':"EQUITY_SYMBOL"}, 'FINANCIAL'],
    ["The river Amazon flows mostly through Brazil", {}, 'NON-FINANCIAL']
]

In [10]:
# Turn each hand-labelled row into a (text, {'entities': [...]}) training
# tuple; only the row's text and substring->label mapping are passed on.
TRAIN_DATA = [find_label(*row[:2]) for row in GOLDEN_DATA]
TRAIN_DATA

[('We price the swap at 54 bips',
  {'entities': [(0, 8, 'RFQ'),
    (13, 17, 'FIN_PRODUCT'),
    (21, 28, 'CARDINAL')]}),
 ('TD offers 1.123 on 10 million of CADUSD',
  {'entities': [(0, 9, 'RFQ'),
    (33, 39, 'CURRENCY_PAIR'),
    (10, 15, 'CARDINAL'),
    (19, 29, 'CARDINAL')]}),
 ('Names like AMZN trade frequently but tech like TWLO trades on lower volume',
  {'entities': [(47, 51, 'EQUITY_SYMBOL'), (11, 15, 'EQUITY_SYMBOL')]}),
 ('TD bids 99.91 on US 10s. Done. Thanks for the trade. Confirm to follow',
  {'entities': [(0, 7, 'RFQ'),
    (25, 30, 'TRADE_EXECUTION'),
    (31, 51, 'TRADE_EXECUTION'),
    (53, 70, 'TRADE_EXECUTION')]}),
 ('Amazon (AMZN) is trading 13% higher than 12 months ago but 22% lower than the peak',
  {'entities': [(8, 12, 'EQUITY_SYMBOL')]}),
 ('The river Amazon flows mostly through Brazil', {'entities': []})]

In [11]:
# Reuse the pipeline's entity recognizer when there is one, so labels can be
# added to it; otherwise build the built-in 'ner' component
# (nlp.create_pipe works for components registered with spaCy) and append it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

# create_optimizer() is used instead of nlp.begin_training() to avoid
# wiping the existing model state.
optimizer = nlp.entity.create_optimizer()

In [12]:
from spacy.util import minibatch

# Every pipe except 'ner' is switched off for the duration of training so
# that only the entity recognizer is updated.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    for epoch in range(20):
        # New example order and a fresh loss accumulator on every pass.
        random.shuffle(TRAIN_DATA)
        losses = {}
        # minibatch() runs with its default size; the
        # compounding(4., 32., 1.001) schedule is left disabled.
        for examples in minibatch(TRAIN_DATA):
            texts, annotations = zip(*examples)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        # Uncomment to inspect progress: print('Losses', losses)

In [13]:
# Render the entities found in a sentence that is not part of GOLDEN_DATA,
# as a quick visual check of the NER component after the training cell.
displacy.render(nlp("We price the USDCAD at 54 bips. Do you want to trade that?"), style='ent', jupyter=True)

In [14]:
# Example of overriding ents explicitly.
# This cell does not create its own document: it appends a hand-made "RFQ"
# span over tokens 8..12 to whatever `doc` an earlier cell left in the
# kernel, then re-renders that document.
# NOTE(review): because it mutates `doc.ents` in place, running the cell a
# second time on the same `doc` will presumably try to add an overlapping
# span - confirm, or rebuild `doc` first.
from spacy.tokens import Span
label_hash = doc.vocab.strings["RFQ"]
doc.ents = doc.ents + (Span(doc, 8, 13,label_hash),)
displacy.render(doc, style='ent', jupyter=True)

In [15]:
class CurrencyPairPipeline(object):
    """spaCy pipeline component that tags currency-pair tokens as CURRENCY_PAIR.

    A token qualifies when its *entire* text is six letters (matched
    case-insensitively) and either its first or its last three letters are
    one of the known currency codes, e.g. ``USDEUR``, ``EURUSD`` or
    ``chfjpy``. Matching the whole token keeps ordinary words that merely
    contain a code (such as "academic" or "neurology") from being tagged.

    spaCy allows a token to belong to only one entity, so any existing
    entity that covers a qualifying token (for example a wrong ORG/GPE guess)
    is dropped and replaced by the single-token currency-pair span.
    """

    DEFAULT_CURRENCIES = ('USD', 'EUR', 'GBP', 'JPY', 'CAD')

    def __init__(self, nlp, currencies=DEFAULT_CURRENCIES):
        """Prepare the label ID and the token pattern.

        Args:
            nlp: The spaCy language object; only ``nlp.vocab.strings`` is read.
            currencies: Iterable of three-letter currency codes, at least one
                of which must appear on either side of a pair.

        Raises:
            ValueError: If ``currencies`` is empty.
        """
        import re  # local import: the notebook has no top-level `import re`

        currencies = tuple(currencies)
        if not currencies:
            raise ValueError("currencies must contain at least one code")

        self.label_hash = nlp.vocab.strings["CURRENCY_PAIR"]
        codes = "|".join(re.escape(code) for code in currencies)
        # Compiled once here instead of on every document.
        self._pair_pattern = re.compile(
            r"(?:{0})[A-Z]{{3}}|[A-Z]{{3}}(?:{0})".format(codes),
            re.IGNORECASE,
        )

    def _is_currency_pair(self, text):
        """Return True when ``text`` as a whole looks like a currency pair."""
        return self._pair_pattern.fullmatch(text) is not None

    def __call__(self, doc):
        """Relabel currency-pair tokens in ``doc`` and return the same ``doc``."""
        from spacy.tokens import Span

        entities = tuple(doc.ents)
        for idx, token in enumerate(doc):
            if not self._is_currency_pair(token.text):
                continue
            span = Span(doc, idx, idx + 1, label=self.label_hash)
            # Drop every entity that covers this token, not only those that
            # start on it; otherwise a longer entity containing the token
            # would still overlap the new span when doc.ents is assigned.
            entities = tuple(
                ent for ent in entities if not (ent.start <= idx < ent.end)
            ) + (span,)
        doc.ents = entities
        return doc

In [16]:
class EquitySymbolPipeline(object):
    """spaCy pipeline component that tags ticker tokens as EQUITY_SYMBOL.

    Ticker symbols and company names are read from a CSV file when the
    component is built. On each document, every token whose text is exactly
    one of the symbols gets a single-token EQUITY_SYMBOL entity, unless the
    token already belongs to another entity. In that case the existing
    entity is kept and the collision is reported on stdout.

    NOTE(review): matching is exact and case-sensitive on the token text, so
    short tickers that coincide with ordinary upper-case words would also be
    tagged - confirm against the CSV contents.
    """

    def __init__(self, nlp, csv_path="sp500.csv", symbol_col="Symbol",
                 name_col="Name", **read_csv_kwargs):
        """Load the symbol table and register the Doc extension.

        Args:
            nlp: The spaCy language object; only ``nlp.vocab.strings`` is read.
            csv_path: CSV file holding the symbol table.
            symbol_col: Column containing ticker symbols.
            name_col: Column containing company names.
            **read_csv_kwargs: Passed through to ``pandas.read_csv``. For the
                larger header-less ticker file this component was also
                written against, call
                ``EquitySymbolPipeline(nlp, "equity_symbols.csv",
                symbol_col="SYMBOL", name_col="NAME",
                names=["SYMBOL", "?", "NAME"])``.
        """
        import pandas as pd
        # Imported here so the class does not depend on a later notebook cell
        # having imported Doc first.
        from spacy.tokens import Doc

        self.label_hash = nlp.vocab.strings["EQUITY_SYMBOL"]
        df = pd.read_csv(csv_path, **read_csv_kwargs)
        self.symbols = set(df[symbol_col].values)
        self.names = set(df[name_col].values)

        # TODO - Move this into another pipeline
        # Registered once at construction instead of on every document.
        Doc.set_extension('is_equity_context', default=False, force=True)

    @staticmethod
    def _is_covered(entities, idx):
        """Return True when token index ``idx`` lies inside any of ``entities``."""
        return any(ent.start <= idx < ent.end for ent in entities)

    def __call__(self, doc):
        """Add EQUITY_SYMBOL entities to ``doc`` and return the same ``doc``."""
        from spacy.tokens import Span

        # Placeholder until a real context detector exists: every document is
        # currently flagged as equity context.
        doc._.is_equity_context = True
        if not doc._.is_equity_context:
            return doc

        for idx, token in enumerate(doc):
            if token.text not in self.symbols:
                continue
            span = Span(doc, idx, idx + 1, label=self.label_hash)
            # spaCy allows one entity per token. An explicit overlap check
            # replaces the former catch-all `except`, so unrelated errors are
            # no longer swallowed.
            if self._is_covered(doc.ents, idx):
                print(f"Collision in doc.ents {doc.ents} vs. equity ones {span}")
            else:
                doc.ents = doc.ents + (span,)
        return doc

Before using the custom pipeline:

In [17]:
from spacy.tokens import Doc
# Intended as the "before" snapshot: parse a sentence containing the
# currency pair USDEUR and render the entities the pipeline assigns to it.
doc = nlp("Paris is the awesome capital of France. They use the euro. The current USDEUR rate is 1.112 to exchange currencies")

displacy.render(doc, style='ent', jupyter=True)

In [24]:
from spacy.tokens import Token
 
def _register_pipe(component, name):
    """Append ``component`` to ``nlp`` under ``name``, replacing an earlier one.

    Keeps this cell safe to re-run: a component registered under the same
    name by a previous execution is removed first. Membership is checked
    explicitly rather than wrapping ``remove_pipe`` in a catch-all
    ``except``, so genuine errors are not hidden.
    """
    if name in nlp.pipe_names:
        nlp.remove_pipe(name=name)
    else:
        print("Couldn't remove pipe")
    nlp.add_pipe(component, name=name, last=True)


ccy_pipeline = CurrencyPairPipeline(nlp)
_register_pipe(ccy_pipeline, 'ccy_pipeline')

equity_pipeline = EquitySymbolPipeline(nlp)
_register_pipe(equity_pipeline, 'equity_pipeline')

print(nlp.pipe_names)

Couldn't remove pipe
['tagger', 'parser', 'ner', 'ccy_pipeline', 'equity_pipeline']


In [19]:
from spacy.tokens import Doc
# Parse the USDEUR sample sentence and render its entities. Once the custom
# components are registered on `nlp`, USDEUR is expected to be shown as
# CURRENCY_PAIR.
doc = nlp("Paris is the awesome capital of France. They use the euro. The current USDEUR "\
          "rate is 1.112 to exchange currencies")
displacy.render(doc, style='ent', jupyter=True)


After using the custom pipeline:

In [25]:
# Sample with two currency pairs and one ticker, rendered with whatever
# components are currently on `nlp`.
# nlp.disable_pipes('ccy_pipeline')
doc = nlp("EURUSD is more stable than USDGBP at the moment. Should I buy AMZN as it earns more in euros than in sterling?")
displacy.render(doc, style='ent', jupyter=True)

In [26]:
# Probe how the pipeline labels a bare decimal number with no context.
doc = nlp("1.123")
displacy.render(doc, style='ent', jupyter=True)

In [27]:
# Sample mixing a currency pair, a quoted rate and plain-language currency
# names, rendered to inspect the resulting entities.
doc = nlp("We price EURUSD at 1.124. Do you want to trade it? That's 1.124 US dollars per 1 euro")
displacy.render(doc, style='ent', jupyter=True)

In [29]:
# Sample pairing a non-financial "TD bids ..." phrase with a ticker mention.
# NOTE: `doc` stays in the kernel and is read by the similarity cell below.
doc = nlp("TD bids 2 gold coins for your camels. Companies like AAPL are trending sideways")
displacy.render(doc, style='ent', jupyter=True)

Collision in doc.ents (TD bids, 2, AAPL) vs. equity ones AAPL


In [30]:
# Similarity of the current `doc` to three reference phrases, one per line.
for phrase in (
    "Foreign Exchange currency",
    "Bears Shit in the woods",
    "Interest rate swaps trade",
):
    print(doc.similarity(nlp(phrase)))

# For each token, print the lemma text and then the lemma's integer ID.
for token in doc:
    print(nlp.vocab.strings[token.lemma], token.lemma, sep="\n")

0.5571611378546174
0.6443713627739217
0.6003355630409812
td
9477787128463159791
bid
15851398737023972976
2
15180167692696242062
gold
16992302602439090065
coin
7394989984657272301
for
16037325823156266367
-PRON-
561228191312463089
camel
13923385298802530728
.
12646065887601541794
company
6905553075311563409
like
18194338103975822726
aapl
2865372538102694494
be
10382539506755952630
trend
14868696856168579472
sideways
13456308254581018767
