# Obtaining Word Embedding

import packages

In [1]:
import warnings
from collections import Counter
import itertools

warnings.filterwarnings(action='ignore')

In [11]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.parsing.preprocessing import strip_short,strip_punctuation,\
                                         strip_numeric, strip_multiple_whitespaces
from gensim.test.utils import get_tmpfile
import glob
from nltk import tokenize
from nltk.corpus import stopwords
import inflect
from spherecluster import  VonMisesFisherMixture

In [3]:
articles = [] 
for i in glob.glob('./extracted_papers/*.txt'):
    paper = open(i, encoding='utf-8')
    articles.append(paper.read())

Clear out newline characters and non-unicode characters

In [4]:
a = ''.join([chr(n) for n in range(256)])
unwanted = '[' + re.escape(''.join([n for n in a if ord(n) < 32 or ord(n) > 126])) + ']'
cleaned_articles = list(map(lambda x: re.sub(unwanted, ' ', x), articles))
cleaned_articles = list(map(lambda x:x.lower(), cleaned_articles))

Strip out characters that are less than 1

In [5]:
def preprocess_text(s):
    """Remove unwanted text formats with numeric, whitespace, punctuation, short words stripped 
       Input: text string
       Output: post processed string
    """
    s = strip_numeric(s)
    s = strip_multiple_whitespaces(s)
    s = strip_punctuation(s)
    s = strip_short(s, minsize = 2)
    
    return s

In [6]:
cleaned_articles = list(map(preprocess_text, cleaned_articles))

Remove all non-alphabetical charaters

In [7]:
cleaned_sentences = []
for i in cleaned_articles:
    cleaned_sentences += list(map(lambda x: x, tokenize.sent_tokenize(i)))

In [8]:
stop_words = set(stopwords.words('english') + ['’','“', '‘', 'within', 
                                               'however','”','\uf8f6\uf8f7\uf8f7\uf8f7\uf8f8']) 

Strip stopwords, tokenize sentence to words

In [9]:
# cleaned_sentences_w = list(map(lambda sentence: tokenize.word_tokenize(sentence), cleaned_sentences))
cleaned_sentences_w = list(map(lambda sentence: [w for w in tokenize.word_tokenize(sentence) if not w in stop_words], 
                               cleaned_sentences))

turn all words to singular

In [10]:
def singularize(sentence):
    p = inflect.engine()
    for i,word in enumerate(sentence):
        if p.singular_noun(word):
            sentence[i] = p.singular_noun(word)
    return sentence
        
cleaned_sentences_w = list(map(singularize, cleaned_sentences_w))

bigram transform

In [11]:
bigram_transformer = Phrases(cleaned_sentences_w)

Train Word2Vec using gensim

In [12]:
model = Word2Vec(list(bigram_transformer[cleaned_sentences_w]), window= 5, min_count= 3, size= 200)
model.train(cleaned_sentences_w,total_examples=len(cleaned_sentences_w),epochs=50)

(290372552, 328155500)

In [56]:
model.wv.most_similar ('derivative', topn = 20)

[('payoff', 0.5001509189605713),
 ('option', 0.44860589504241943),
 ('integral', 0.4365573227405548),
 ('superhedging_price', 0.3820866048336029),
 ('differentiating', 0.3772635757923126),
 ('expectation', 0.375134140253067),
 ('marginal', 0.357364684343338),
 ('derivate', 0.3567274212837219),
 ('delta', 0.3548198640346527),
 ('payoff_function', 0.34959232807159424),
 ('∂', 0.342562735080719),
 ('moment', 0.3422113060951233),
 ('differential', 0.3385683000087738),
 ('schole', 0.33236658573150635),
 ('partial_derivative', 0.3323553800582886),
 ('arbitrage', 0.3317824602127075),
 ('option_payoff', 0.32606762647628784),
 ('vanilla', 0.32598644495010376),
 ('operator', 0.3222822844982147),
 ('unprotected_portfolio', 0.32208162546157837)]

Get word embedding

In [15]:
#word_vectors = get_tmpfile("word vectors.kv")
model.wv.save("word vectors.kv")

Build vocabulary inventory

In [16]:
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [17]:
word_counts = Counter(itertools.chain(*list(bigram_transformer[cleaned_sentences_w])))
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)} # rank words from importance

Load saved word vectors

In [3]:
filename = "word vectors.kv"
model = KeyedVectors.load(filename, mmap='r')

In [4]:
word_embedding = model.wv.vectors 
vocab = list(model.wv.vocab)

Normalize word embedding weights

In [5]:
linfnorm = np.linalg.norm(word_embedding, axis=1, ord=2)
word_embedding_normalized = word_embedding / linfnorm[:,None]

In [6]:
words_df = pd.DataFrame(word_embedding_normalized.T, columns=vocab)

In [7]:
words_df[['equity','stock', 'fixed_income', 'bond','real_estate','derivative', 'cds', 'swap', 'mortgage']]

Unnamed: 0,equity,stock,fixed_income,bond,real_estate,derivative,cds,swap,mortgage
0,-0.077254,-0.035838,-0.026334,-0.033983,0.032944,0.088212,0.010290,-0.046953,0.177935
1,0.039362,0.031508,-0.049707,0.056490,-0.085481,0.003974,-0.088718,0.036078,-0.046988
2,-0.020444,0.005016,0.001798,0.056177,-0.062651,0.002750,0.005555,0.067241,0.113706
3,-0.017430,-0.088992,0.047215,0.051243,-0.080043,-0.037357,-0.157706,-0.018426,-0.027340
4,0.027510,0.064764,0.059560,0.064073,0.092872,-0.040440,-0.039156,-0.005243,-0.045293
5,0.039634,0.130758,0.033730,-0.043517,-0.064624,-0.002343,-0.062548,-0.157512,0.043281
6,0.107616,-0.018249,0.008828,0.003732,-0.011873,0.169280,0.068275,-0.063931,-0.015851
7,0.064094,0.127592,-0.055271,-0.008005,0.018048,-0.061576,-0.076956,0.140811,0.041247
8,0.139534,0.023347,-0.110026,0.085256,-0.016133,-0.123957,-0.054268,0.013236,-0.016501
9,-0.145650,0.008573,0.056150,0.080369,0.020935,0.045687,0.057878,0.034731,0.078502


load classes and keywords

In [8]:
class_keywords_str  = open('class keywords.txt', encoding='utf-8').read()
class_keywords = {i.split(': ')[0]: i.split(': ')[1].split(', ') for i in class_keywords_str.split('\n')}

Obtain word vectors for each keyword from every class

In [9]:
class_keywords_supplied = {class_label: [np.array(words_df[word]) for word in words] 
                           for class_label, words in class_keywords.items()}

fit a vMF distribution for every class, obtain cluster centers and dispersion parameters

In [12]:
mu = []
kappa = []
for i in class_keywords_supplied.keys():
    keyword_mtx = np.vstack(class_keywords_supplied[i])
    vmF = VonMisesFisherMixture(n_clusters=1, n_jobs=10)
    vmF.fit(keyword_mtx)
    mu.append(vmF.cluster_centers_[0])
    kappa.append(vmF.concentrations_[0])