In [23]:
import string
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import operator
from nltk import ngrams
import gc


def load_doc(file):
    """Read a text file and return its entire contents as a single string.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    str
        The full text of the file.
    """
    # The context manager guarantees the handle is closed even if read() raises.
    with open(file, 'r') as handle:
        return handle.read()

def get_sentences(file):
    """Load the text stored at ``file`` and split it into sentences.

    The file is read with ``load_doc`` and the resulting string is passed to
    NLTK's ``sent_tokenize``; the tokenizer's result is returned unchanged.
    """
    return sent_tokenize(load_doc(file))

def clean_doc(file):
    """Return the sentences of ``file`` as lists of lower-cased alphabetic tokens.

    Each sentence from ``get_sentences`` has ``--`` replaced by a space and is
    split on whitespace. Punctuation characters are stripped from every piece,
    pieces that are not purely alphabetic afterwards are dropped, and the
    survivors are lower-cased. One token list is produced per sentence, in
    sentence order (a sentence with no surviving tokens yields an empty list).
    """
    # The translation table does not depend on the sentence, so build it once.
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for sentence in get_sentences(file):
        pieces = (
            piece.translate(strip_punct)
            for piece in sentence.replace('--', ' ').split()
        )
        cleaned.append([piece.lower() for piece in pieces if piece.isalpha()])
    return cleaned

def get_phrase_vec(model,tok_sent):
    """Return the mean word vector of a tokenised phrase.

    Parameters
    ----------
    model : mapping-like
        Object indexable by word (``model[word]``) that raises ``KeyError`` for
        unknown words. If it exposes a ``vector_size`` attribute that value is
        used as the embedding dimension; otherwise 100 is assumed.
    tok_sent : sequence of str
        Tokens of the phrase (a list or tuple).

    Returns
    -------
    numpy.ndarray
        1-D array holding the sum of the token vectors divided by the number
        of tokens. Out-of-vocabulary tokens contribute a zero vector but still
        count in the divisor. An empty phrase yields an all-zero vector instead
        of the NaN values a division by zero would produce.
    """
    dim = getattr(model, 'vector_size', 100)
    total = np.zeros(dim)
    if len(tok_sent) == 0:
        return total
    for word in tok_sent:
        try:
            vec = model[word]
        except KeyError:
            # Unknown word: adds nothing to the sum.
            continue
        total += vec
    return total / len(tok_sent)
        
def avg_sentence(model, file):
    """Compute one averaged word vector per cleaned sentence of ``file``.

    Parameters
    ----------
    model : mapping-like
        Object indexable by word (``model[word]``) that raises ``KeyError`` for
        unknown words. If it exposes a ``vector_size`` attribute that value is
        used as the embedding dimension; otherwise 100 is assumed.
    file : str or path-like
        Path handed to ``clean_doc`` to obtain the tokenised sentences.

    Returns
    -------
    list of (list of str, numpy.ndarray)
        One ``(tokens, mean_vector)`` pair per sentence, in sentence order.
        Out-of-vocabulary tokens contribute a zero vector but still count in
        the divisor. A sentence with no tokens gets an all-zero vector rather
        than the NaN vector a division by zero would produce.
    """
    dim = getattr(model, 'vector_size', 100)
    sent_vec = []
    for doc in clean_doc(file):
        total = np.zeros(dim)
        for word in doc:
            try:
                vec = model[word]
            except KeyError:
                # Unknown word: adds nothing to the sum.
                continue
            total += vec
        # Guard the division: clean_doc can return an empty token list.
        mean = total / len(doc) if doc else total
        sent_vec.append((doc, mean))
    return sent_vec

def search_word(model, input_word,textFile,Phrase=True):
    """Find the sentence of ``textFile`` closest to a query and score its n-grams.

    Every cleaned sentence of the file is compared with the query vector by
    cosine similarity. The highest-scoring sentence is printed, and all of its
    uni-, bi- and tri-grams are scored against the query in the same way.

    Parameters
    ----------
    model : mapping-like
        Word-vector lookup passed to ``avg_sentence`` and ``get_phrase_vec``;
        it is also indexed directly when ``Phrase`` is false.
    input_word : str
        The query. With ``Phrase=True`` it is split on whitespace and averaged;
        otherwise it is looked up as a single vocabulary entry.
    textFile : str or path-like
        Path of the document to search.
    Phrase : bool, default True
        Whether ``input_word`` is treated as a multi-word phrase.

    Returns
    -------
    list of (str, float)
        The five best ``(ngram, similarity)`` pairs of the main sentence (fewer
        if it has fewer n-grams), ordered from lowest to highest similarity.

    Raises
    ------
    ValueError
        If the file yields no sentence with at least one token, if the phrase
        contains no tokens, or if ``Phrase`` is false and ``input_word`` is not
        in the model's vocabulary.
    """
    # Sentences without tokens carry no signal and cannot be scored meaningfully.
    vector_sentences = [
        (tokens, vec) for tokens, vec in avg_sentence(model,textFile) if tokens
    ]
    if not vector_sentences:
        raise ValueError("No usable sentences found in the input file")

    if Phrase:
        # split() without an argument ignores repeated/leading/trailing spaces.
        phrase_tokens = input_word.split()
        if not phrase_tokens:
            raise ValueError("Search phrase is empty")
        word_vec = get_phrase_vec(model, phrase_tokens)
    else:
        try:
            word_vec = model[input_word]
        except KeyError as e:
            raise ValueError("Word not present in the vocabulary") from e

    query = word_vec.reshape(1, -1)
    ans = [
        (' '.join(tokens), cosine_similarity(query, vec.reshape(1, -1))[0][0])
        for tokens, vec in vector_sentences
    ]
    # Ascending sort, so the last entry is the best match.
    sent_with_highest_signal = sorted(ans, key = lambda x: x[1])[-1][0]
    uni_to_trigrams = [
        (' '.join(c), cosine_similarity(get_phrase_vec(model,c).reshape(1,-1), query)[0][0])
        for i in range(1,4)
        for c in ngrams(sent_with_highest_signal.split(),i)
    ]
    print('-----------------------------THE MAIN SENTENCE------------------------------------------------------')
    print(sent_with_highest_signal)
    print('-----------------------------THE NGRAMS ALONGSIDE THEIR RESPECTIVE SCORES---------------------------')
    return sorted(uni_to_trigrams, key = lambda x: x[1])[-5:]

In [24]:
from gensim.models import KeyedVectors
# NOTE(review): this absolute, machine-specific path is not referenced anywhere
# else in the visible notebook; the vectors actually loaded come from 'model.txt'.
google_negative_news =  '/Users/abdulrazzaq/gensim-data/GoogleNews-vectors-negative300.bin'
# 'model.txt' is the file written by the final cell of this notebook
# (save_word2vec_format), so that cell must have been run at least once before
# this one can succeed on a fresh kernel.
model=KeyedVectors.load_word2vec_format('model.txt')

In [25]:
# Force a garbage-collection pass; the number shown below the cell is the
# value returned by gc.collect().
gc.collect()

8

In [28]:
# Search article.txt for the sentence closest to the phrase 'soaring revenue';
# search_word prints that sentence and returns its scored n-grams.
ans = search_word(model,'soaring revenue','article.txt',Phrase=True)
print(ans)

-----------------------------THE MAIN SENTENCE------------------------------------------------------
expect some combination of domestically developed payments schemes andor countries allowing global payments operators ie
-----------------------------THE NGRAMS ALONGSIDE THEIR RESPECTIVE SCORES---------------------------
[('developed payments schemes', 0.2819372226792563), ('developed payments', 0.282412453539377), ('allowing global payments', 0.2996403382999625), ('domestically developed payments', 0.306298603285112), ('global payments', 0.3299203234273732)]


## FINANCIAL NEWS CUSTOM WORD EMBEDDINGS

In [2]:
import pandas as pd

In [12]:
# Load the two article tables; get_doc_sent reads the `Text` column of each.
df1 = pd.read_csv('finance_articles.csv')
df2 = pd.read_csv('financial_articles.csv')

In [16]:
def get_doc_sent(df1,df2,filename):
    """Write the ``Text`` columns of two data frames to one text file.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames exposing a ``Text`` column of strings. The entries of ``df1``
        are written first, followed by those of ``df2``.
    filename : str or path-like
        Destination path. The file is opened in write mode, so existing
        content is replaced.

    Returns
    -------
    None
        The entries are joined with newline characters and written to disk.
    """
    total = df1.Text.tolist() + df2.Text.tolist()
    data = '\n'.join(total)
    # The context manager closes (and flushes) the file even if write() raises.
    with open(filename,'w') as file:
        file.write(data)

In [17]:
# Dump the text of both article tables into a single corpus file.
get_doc_sent(df1,df2,'financial_news.txt')

In [4]:
# Read the corpus file back and split it into sentences with sent_tokenize.
sentences = get_sentences('financial_news.txt')

In [5]:
def clean_financial_news(sentences):
    """Turn raw sentences into lists of lower-cased alphabetic tokens.

    For every sentence, newlines and ``--`` are replaced by spaces and the text
    is split on whitespace. Punctuation characters are stripped from each
    piece, pieces that are not purely alphabetic afterwards are discarded, and
    the remaining ones are lower-cased. The result holds one token list per
    input sentence, in the same order (possibly empty lists).
    """
    # The translation table does not depend on the sentence, so build it once.
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for sentence in sentences:
        flattened = sentence.replace('\n', ' ').replace('--', ' ')
        pieces = (piece.translate(strip_punct) for piece in flattened.split())
        cleaned.append([piece.lower() for piece in pieces if piece.isalpha()])
    return cleaned

In [6]:
# Tokenise every sentence; the result is a list of token lists.
cleaned_news = clean_financial_news(sentences)

In [7]:
# Preview the first four tokenised sentences.
print(cleaned_news[:4])

[['zacks', 'analysts', 'anticipate', 'ultragenyx', 'pharmaceutical', 'inc', 'rare', 'will', 'announce', 'quarterly', 'sales', 'of', 'million', 'posted', 'by', 'maurice', 'goldstein', 'on', 'may', 'equities', 'research', 'analysts', 'forecast', 'that', 'ultragenyx', 'pharmaceutical', 'inc', 'nasdaqrare', 'will', 'post', 'sales', 'of', 'million', 'for', 'the', 'current', 'fiscal', 'quarter', 'according', 'to', 'zacks'], ['seven', 'analysts', 'have', 'issued', 'estimates', 'for', 'ultragenyx', 'earnings', 'with', 'the', 'lowest', 'sales', 'estimate', 'coming', 'in', 'at', 'million', 'and', 'the', 'highest', 'estimate', 'coming', 'in', 'at', 'million'], ['ultragenyx', 'pharmaceutical', 'reported', 'sales', 'of', 'million', 'in', 'the', 'same', 'quarter', 'last', 'year', 'which', 'indicates', 'a', 'positive', 'yearoveryear', 'growth', 'rate', 'of'], ['the', 'business', 'is', 'scheduled', 'to', 'issue', 'its', 'next', 'earnings', 'report', 'on', 'thursday', 'august']]


In [8]:
from gensim.models import Word2Vec
import multiprocessing
# Embedding dimension of the trained vectors. The search helpers earlier in
# this notebook assume 100-dimensional vectors by default, so keep them in sync.
EMB_DIM = 100

# Train on the tokenised sentences using one worker per CPU core.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names; newer
# gensim releases are believed to call them `vector_size` and `epochs` — confirm
# against the installed version before upgrading.
# NOTE(review): no `seed` is passed, so results may differ between runs.
w2v = Word2Vec(cleaned_news, size=EMB_DIM, window=5,min_count=5,negative=15,iter=10,workers=multiprocessing.cpu_count())

In [9]:
# Keep a handle on the trained vectors and list the nearest neighbours of 'million'.
word_vectors = w2v.wv
word_vectors.similar_by_word('million')

[('billion', 0.8768537640571594),
 ('percent', 0.5705723166465759),
 ('total', 0.5587888956069946),
 ('trillion', 0.51272052526474),
 ('crores', 0.49851346015930176),
 ('millions', 0.4977300465106964),
 ('net', 0.488506019115448),
 ('eur', 0.4871695637702942),
 ('sek', 0.48228564858436584),
 ('year', 0.47676876187324524)]

In [10]:
# Nearest neighbours of 'debt' in the trained embedding space.
word_vectors.similar_by_word('debt')

[('debts', 0.6732707023620605),
 ('borrowings', 0.598007321357727),
 ('loan', 0.5769705772399902),
 ('loans', 0.5721713304519653),
 ('liquidity', 0.5491695404052734),
 ('interest', 0.5271838307380676),
 ('borrowing', 0.5213406682014465),
 ('bonds', 0.5212751626968384),
 ('cash', 0.5018055438995361),
 ('bond', 0.499487966299057)]

In [11]:
# Nearest neighbours of 'sales' in the trained embedding space.
word_vectors.similar_by_word('sales')

[('revenues', 0.6219348907470703),
 ('revenue', 0.6071063280105591),
 ('arpu', 0.5076402425765991),
 ('yearonyear', 0.4826663136482239),
 ('inventories', 0.47095710039138794),
 ('consumption', 0.46385905146598816),
 ('liquor', 0.4626953899860382),
 ('profit', 0.45940881967544556),
 ('exports', 0.4573214054107666),
 ('apac', 0.4533824920654297)]

In [14]:
from warnings import filterwarnings
# NOTE(review): this silences every warning for the rest of the session.
filterwarnings('ignore')
# Look the words up through the model's KeyedVectors (`.wv`); indexing the
# Word2Vec object itself is a deprecated shortcut.
cosine_similarity(w2v.wv['bank'].reshape(1, -1), w2v.wv['revenue'].reshape(1, -1))

array([[0.1723237]], dtype=float32)

In [16]:
# Look the words up through the model's KeyedVectors (`.wv`); indexing the
# Word2Vec object itself is a deprecated shortcut.
cosine_similarity(w2v.wv['debt'].reshape(1, -1), w2v.wv['loan'].reshape(1, -1))

array([[0.5769705]], dtype=float32)

In [17]:
# Export the vectors in text (non-binary) word2vec format. 'model.txt' is the
# file the search section loads with KeyedVectors.load_word2vec_format.
word_vectors.save_word2vec_format('model.txt', binary=False)