In [10]:
import string
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import operator
from nltk import ngrams
import gc


def load_doc(file):
    """Return the full contents of the text file at ``file`` as one string.

    The file is opened in text mode with the platform default encoding.
    A context manager guarantees the handle is closed before returning,
    even if reading raises.
    """
    with open(file, 'r') as handle:
        return handle.read()

def get_sentences(file):
    """Read the text file at ``file`` and return it split into sentences.

    The raw text comes from ``load_doc`` and is segmented with NLTK's
    ``sent_tokenize``; the tokenizer's result is returned unchanged.
    """
    return sent_tokenize(load_doc(file))

def clean_doc(file):
    """Turn the file at ``file`` into a list of token lists, one per sentence.

    For every sentence returned by ``get_sentences``:
      * ``--`` is replaced by a space,
      * the sentence is split on whitespace,
      * ASCII punctuation is deleted from each piece,
      * pieces that are not purely alphabetic are dropped,
      * the survivors are lower-cased.

    Returns:
        list[list[str]]: one (possibly empty) token list per sentence.
    """
    # Deletes every character of string.punctuation; built once, reused for all sentences.
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for sentence in get_sentences(file):
        pieces = (piece.translate(strip_punct)
                  for piece in sentence.replace('--', ' ').split())
        cleaned.append([piece.lower() for piece in pieces if piece.isalpha()])
    return cleaned

def get_phrase_vec(model,tok_sent):
    """Return the mean word vector of the tokens in ``tok_sent``.

    Args:
        model: mapping-like embedding lookup; ``model[word]`` must return a
            1-D vector and raise ``KeyError`` for unknown words. If the
            object exposes ``vector_size`` it is used as the dimension,
            otherwise 300 is assumed.
        tok_sent: iterable of token strings (list, tuple, n-gram, ...).

    Returns:
        numpy.ndarray: the sum of the token vectors divided by the number of
        tokens. Out-of-vocabulary tokens contribute a zero vector but still
        count in the denominator. An empty ``tok_sent`` yields an all-zero
        vector rather than NaN from 0/0, so the result is always safe to
        pass to ``cosine_similarity``.
    """
    dim = getattr(model, 'vector_size', 300)
    total = np.zeros(dim)
    tokens = list(tok_sent)
    if not tokens:
        return total
    for word in tokens:
        try:
            total += model[word]
        except KeyError:
            # Unknown word: same effect as adding a zero vector.
            continue
    return total / len(tokens)
        
def avg_sentence(model, file):
    """Compute one averaged word vector per cleaned sentence of ``file``.

    Args:
        model: embedding lookup; ``model[word]`` returns a vector and raises
            ``KeyError`` for unknown words. ``model.vector_size`` is used as
            the dimension when present, otherwise 300 is assumed.
        file: path of the text file, tokenised via ``clean_doc``.

    Returns:
        list[tuple[list[str], numpy.ndarray]]: ``(tokens, mean_vector)`` for
        every sentence, in file order. Out-of-vocabulary words contribute a
        zero vector but still count in the denominator. A sentence with no
        tokens keeps its entry and gets an all-zero vector instead of the
        NaN that dividing by ``len(doc) == 0`` would produce.
    """
    dim = getattr(model, 'vector_size', 300)
    sent_vec = []
    for doc in clean_doc(file):
        total = np.zeros(dim)
        for word in doc:
            try:
                total += model[word]
            except KeyError:
                # Unknown word: same effect as adding a zero vector.
                continue
        # Guard the division: clean_doc can return an empty token list.
        mean = total / len(doc) if doc else total
        sent_vec.append((doc, mean))
    return sent_vec

def search_word(model, input_word,textFile,Phrase=True):
    """Find the sentence of ``textFile`` closest to a query and rank its n-grams.

    Args:
        model: embedding lookup passed through to ``avg_sentence`` and
            ``get_phrase_vec``; also indexed directly when ``Phrase`` is false.
        input_word: the query; split on single spaces when ``Phrase`` is true,
            otherwise looked up as a single vocabulary entry.
        textFile: path of the text file to search.
        Phrase: choose phrase averaging (default) or single-word lookup.

    Returns:
        list[tuple[str, float]]: the five highest-scoring 1- to 3-grams of the
        best sentence, as ``(ngram_text, cosine_similarity)`` in ascending
        score order.

    Raises:
        ValueError: if ``Phrase`` is false and ``input_word`` is not in ``model``.

    Side effects:
        Prints the best-matching sentence between two banner lines.
    """
    sentence_vectors = avg_sentence(model, textFile)

    if not Phrase:
        try:
            query_vec = model[input_word]
        except KeyError as e:
            raise ValueError("Word not present in the vocabulary")
    else:
        query_vec = get_phrase_vec(model, input_word.split(' '))
    query_row = query_vec.reshape(1, -1)

    # Score each sentence against the query and keep the best one.
    scored_sentences = [
        (' '.join(tokens), cosine_similarity(query_row, vector.reshape(1, -1))[0][0])
        for tokens, vector in sentence_vectors
    ]
    ranked_sentences = sorted(scored_sentences, key=operator.itemgetter(1))
    sent_with_highest_signal = ranked_sentences[-1][0]

    # Score every unigram, bigram and trigram of the winning sentence.
    best_tokens = sent_with_highest_signal.split()
    scored_ngrams = []
    for size in range(1, 4):
        for gram in ngrams(best_tokens, size):
            gram_row = get_phrase_vec(model, gram).reshape(1, -1)
            scored_ngrams.append((' '.join(gram), cosine_similarity(gram_row, query_row)[0][0]))

    print('-----------------------------THE MAIN SENTENCE------------------------------------------------------')
    print(sent_with_highest_signal)
    print('-----------------------------THE NGRAMS ALONGSIDE THEIR RESPECTIVE SCORES---------------------------')
    scored_ngrams.sort(key=operator.itemgetter(1))
    return scored_ngrams[-5:]

In [1]:
from pathlib import Path

from gensim.models import KeyedVectors

# Location of the pretrained GoogleNews word2vec binary, resolved relative to the
# current user's home directory instead of a hard-coded /Users/<name> path so the
# notebook also runs on other machines.
WORD2VEC_PATH = Path.home() / "gensim-data" / "GoogleNews-vectors-negative300.bin"

# Load the vectors in word2vec binary format (binary=True).
model = KeyedVectors.load_word2vec_format(str(WORD2VEC_PATH), binary=True)

In [39]:
# Force a full garbage-collection pass; the number shown as the cell output is
# gc.collect()'s return value.
gc.collect()

95

In [40]:
# Search article.txt for the sentence closest to a multi-word query
# (Phrase=True averages the query's word vectors via get_phrase_vec).
# NOTE(review): the n-grams below score almost identically. get_phrase_vec adds a
# zero vector for out-of-vocabulary words, which only rescales the average and
# leaves cosine similarity unchanged; presumably 'a', 'and' and 'of' are missing
# from the model, so every n-gram reduces to the vector of 'beta' -- verify.
ans = search_word(model,'uber about to ipo','article.txt',Phrase=True)
print(ans)

-----------------------------THE MAIN SENTENCE------------------------------------------------------
the firm has a market cap of billion a pe ratio of and a beta of
-----------------------------THE NGRAMS ALONGSIDE THEIR RESPECTIVE SCORES---------------------------
[('and a beta', 0.31662712170703106), ('a beta of', 0.31662712170703106), ('beta', 0.31662712170703117), ('a beta', 0.31662712170703117), ('beta of', 0.31662712170703117)]


## FINANCIAL NEWS CUSTOM WORD EMBEDDINGS

In [2]:
import pandas as pd

In [12]:
# Load the two article dumps. Both frames must expose a `Text` column, which
# get_doc_sent reads when it builds the combined corpus file.
df1 = pd.read_csv('finance_articles.csv')
df2 = pd.read_csv('financial_articles.csv')

In [16]:
def get_doc_sent(df1,df2,filename):
    """Write the ``Text`` column of both frames to ``filename``, one entry per line.

    Args:
        df1, df2: pandas DataFrames that each have a ``Text`` column of strings.
        filename: path of the output text file; overwritten if it exists.

    Missing values (NaN/None) in ``Text`` are skipped, because joining a
    non-string entry would raise ``TypeError``. The file is written inside a
    context manager so the handle is closed even if the write fails.
    """
    total = df1.Text.dropna().tolist() + df2.Text.dropna().tolist()
    with open(filename, 'w') as file:
        file.write('\n'.join(total))

In [17]:
# Dump the Text column of both frames into a single newline-separated corpus file.
get_doc_sent(df1,df2,'financial_news.txt')

In [18]:
# Read the corpus file back and split it into sentences with NLTK's sent_tokenize.
sentences = get_sentences('financial_news.txt')

In [23]:
def clean_financial_news(sentences):
    """Tokenise raw sentences into lists of lowercase alphabetic words.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        list[list[str]]: one token list per input sentence, in order. For each
        sentence, newlines and ``--`` become spaces, the text is split on
        whitespace, ASCII punctuation is deleted from every piece, pieces that
        are not purely alphabetic are dropped, and the rest are lower-cased.
        Because punctuation is deleted rather than replaced, hyphenated words
        are fused (``year-over-year`` becomes ``yearoveryear``).
    """
    # Deletes every character of string.punctuation; built once for all sentences.
    punctuation_stripper = str.maketrans('', '', string.punctuation)

    def _tokenise(sentence):
        flattened = sentence.replace('\n', ' ').replace('--', ' ')
        stripped = (chunk.translate(punctuation_stripper) for chunk in flattened.split())
        return [chunk.lower() for chunk in stripped if chunk.isalpha()]

    return [_tokenise(sentence) for sentence in sentences]

In [24]:
# Tokenise every sentence into lowercase alphabetic words; this list of token
# lists is the training input handed to Word2Vec later in the notebook.
cleaned_news = clean_financial_news(sentences)

In [25]:
# Sanity check: show the first four tokenised sentences.
print(cleaned_news[:4])

[['zacks', 'analysts', 'anticipate', 'ultragenyx', 'pharmaceutical', 'inc', 'rare', 'will', 'announce', 'quarterly', 'sales', 'of', 'million', 'posted', 'by', 'maurice', 'goldstein', 'on', 'may', 'equities', 'research', 'analysts', 'forecast', 'that', 'ultragenyx', 'pharmaceutical', 'inc', 'nasdaqrare', 'will', 'post', 'sales', 'of', 'million', 'for', 'the', 'current', 'fiscal', 'quarter', 'according', 'to', 'zacks'], ['seven', 'analysts', 'have', 'issued', 'estimates', 'for', 'ultragenyx', 'earnings', 'with', 'the', 'lowest', 'sales', 'estimate', 'coming', 'in', 'at', 'million', 'and', 'the', 'highest', 'estimate', 'coming', 'in', 'at', 'million'], ['ultragenyx', 'pharmaceutical', 'reported', 'sales', 'of', 'million', 'in', 'the', 'same', 'quarter', 'last', 'year', 'which', 'indicates', 'a', 'positive', 'yearoveryear', 'growth', 'rate', 'of'], ['the', 'business', 'is', 'scheduled', 'to', 'issue', 'its', 'next', 'earnings', 'report', 'on', 'thursday', 'august']]


In [27]:
from gensim.models import Word2Vec
import multiprocessing
# Embedding width for the custom model; the same value (300) is hard-coded in
# get_phrase_vec and avg_sentence.
EMB_DIM = 300

# Train a Word2Vec model on the tokenised financial news: context window of 5,
# words seen fewer than 5 times are ignored, 15 negative samples, 10 passes over
# the corpus, one worker per CPU core.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names; gensim 4
# is understood to use `vector_size` and `epochs` -- confirm against the installed version.
# NOTE(review): no seed is passed and several workers are used, so the similarity
# scores shown below presumably vary between runs -- confirm if reproducibility matters.
w2v = Word2Vec(cleaned_news, size=EMB_DIM, window=5,min_count=5,negative=15,iter=10,workers=multiprocessing.cpu_count())

In [44]:
# Grab the trained word-vector lookup from the model and list the words most
# similar to "million"; the output is (word, similarity) pairs.
word_vectors = w2v.wv
word_vectors.similar_by_word('million')

[('billion', 0.8043133616447449),
 ('percent', 0.481628954410553),
 ('total', 0.45313483476638794),
 ('net', 0.42137467861175537),
 ('trillion', 0.3999459743499756),
 ('per', 0.39225542545318604),
 ('millions', 0.38231557607650757),
 ('year', 0.36807942390441895),
 ('approximately', 0.36258527636528015),
 ('tons', 0.35911741852760315)]