# Obtaining Word Embedding

import packages

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [31]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.parsing.preprocessing import strip_short,strip_punctuation,\
                                         strip_numeric, strip_multiple_whitespaces
from gensim.test.utils import get_tmpfile
import glob
from nltk import tokenize

In [3]:
articles = [] 
for i in glob.glob('./extracted_papers/*.txt'):
    paper = open(i, encoding='utf-8')
    articles.append(paper.read())

Clear out newline characters and non-unicode characters

In [4]:
a = ''.join([chr(n) for n in range(256)])
unwanted = '[' + re.escape(''.join([n for n in a if ord(n) < 32 or ord(n) > 126])) + ']'
cleaned_articles = list(map(lambda x: re.sub(unwanted, ' ', x), articles))
cleaned_articles = list(map(lambda x:x.lower(), cleaned_articles))

Strip out characters that are less than 1

In [5]:
def preprocess_text(s):
    """Remove unwanted text formats with numeric, whitespace, punctuation, short words stripped 
       Input: text string
       Output: post processed string
    """
    s = strip_numeric(s)
    s = strip_multiple_whitespaces(s)
    s = strip_punctuation(s)
    s = strip_short(s, minsize = 3)
    
    return s

In [6]:
cleaned_articles = list(map(preprocess_text, cleaned_articles))

Remove all non-alphabetical charaters

In [7]:
cleaned_sentences = []
for i in cleaned_articles:
#     try:
#         abstract = i.split('abstract')[1].split('introduction')[0]
#     except:
#         abstract = i.split('introduction')[0]
#     cleaned_sentences += list(map(lambda x: x.lstrip(), tokenize.sent_tokenize(abstract)))
   cleaned_sentences += list(map(lambda x: x, tokenize.sent_tokenize(i)))

In [8]:
cleaned_sentences_w = list(map(lambda sentence: tokenize.word_tokenize(sentence), cleaned_sentences))

Train Word2Vec using gensim

In [9]:
bigram_transformer = Phrases(cleaned_sentences_w)
model = Word2Vec(list(bigram_transformer[cleaned_sentences_w]), window= 5, min_count= 3, size=50)
model.train(cleaned_sentences_w,total_examples=len(cleaned_sentences_w),epochs=20)

(75059504, 99120240)

In [40]:
model.wv.most_similar ('mortgage', topn = 15)

[('executive', 0.827147364616394),
 ('bailout', 0.8269398212432861),
 ('issuing', 0.8104901909828186),
 ('sovereign', 0.7969787120819092),
 ('debts', 0.7920305728912354),
 ('repayment', 0.7895249128341675),
 ('securitized', 0.7762141823768616),
 ('credits', 0.7728871703147888),
 ('central_bank', 0.7728233337402344),
 ('pension', 0.771578311920166),
 ('bailouts', 0.7702019214630127),
 ('devaluation', 0.7576361894607544),
 ('transactional', 0.7549120187759399),
 ('sme', 0.7494511604309082),
 ('secured', 0.7465015649795532)]

Get word embedding

In [11]:
word_embedding = model.wv.vectors 
vocab = list(model.wv.vocab)

In [13]:
word_embedding.shape

(67391, 50)

Build word embedding df for visualization

In [14]:
words_df = pd.DataFrame(word_embedding.T, columns=vocab)

In [30]:
words_df[['equity','stock', 'fixed_income', 'bond','real_estate','derivative', 'cds', 'swap', 'mortgage']]

Unnamed: 0,equity,stock,fixed_income,bond,real_estate,derivative,cds,swap,mortgage
0,0.016996,0.251323,0.119479,2.25126,0.146917,-0.463715,-0.426453,-0.35599,-0.36782
1,0.080411,2.697509,-0.032227,-3.430276,-0.099143,1.314498,0.972398,0.06628,1.050617
2,-0.505519,-0.559139,-0.236895,0.952695,-1.173605,0.719237,0.29455,-0.242641,-0.622741
3,-0.282616,0.872275,-0.303795,5.946072,-1.203236,-2.174049,-0.785108,0.18327,0.38421
4,-0.668458,3.450494,-0.163884,2.225072,-1.442413,3.400651,1.325211,-0.425354,-1.554919
5,-0.120788,0.38549,0.399068,-2.370301,0.716786,0.772074,0.586287,-0.008625,0.43699
6,-0.082196,-0.660383,-0.014724,-3.368577,-3.212409,-0.039484,-2.149263,-0.218504,-0.520202
7,-0.152134,-0.62191,0.182486,0.727318,-1.032617,-1.276923,0.399639,0.282809,1.133843
8,-0.036317,-0.134365,0.336155,-1.81019,-0.040297,0.952837,-0.146818,-0.198556,0.267689
9,-0.116603,0.79898,0.143185,-2.705616,1.773896,0.306864,0.311492,-0.113836,-0.94466


In [39]:
#word_vectors = get_tmpfile("word vectors.kv")
model.wv.save("word vectors.kv")