In [10]:
import string
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import operator


def load_doc(file):
    text = open(file, 'r').read()
    return text

def get_sentences(file):
    doc = load_doc(file)
    sentences = sent_tokenize(doc)
    return sentences

def clean_doc(file):
    docs = get_sentences(file)
    result = []
    for doc in docs:
        doc = doc.replace('--', ' ')
        tokens = doc.split()
        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [word.lower() for word in tokens]
        result.append(tokens)
    return result

def get_phrase_vec(model,input_word_phrase):
    numerator = np.zeros(25)
    denominator = 0
    for word in input_word_phrase.split(' '):
        try:
            vec = model[word]
            denominator +=1
        except Exception as e:
            pass
        numerator += vec
    return numerator/denominator
        
def avg_sentence( model,file):
    c_docs = clean_doc(file)
    sent_vec = []
    for doc in c_docs:
        numerator = np.zeros(25)
        denominator = 0
        for word in doc:
            try:
                denominator += 1
                vec = model[word]
            except KeyError as e:
                pass
            numerator += vec
        sent_vec.append((doc,numerator/denominator))
    return sent_vec

def search_word(model, input_word,textFile,Phrase=True):
    vector_sentences = avg_sentence(model,textFile)
    if Phrase:
        word_vec = get_phrase_vec(model, input_word)
    else:
        try:
            word_vec = model[input_word]
        except KeyError as e:
            raise ValueError("Word not present in the vocabulary")
    n = 0
    ans = []
    for vec in vector_sentences:
        calc_vec = list(cosine_similarity(word_vec.reshape(1, -1),vec[1].reshape(1, -1)))[0][0]
        ans.append((' '.join(vec[0]),calc_vec))
    return sorted(ans, key = lambda x: x[1]) 

In [3]:
from gensim.models import KeyedVectors
model=KeyedVectors.load_word2vec_format("/Users/abdulrazzaq/gensim-data/glove-twitter-25/glove-twitter-25.gz")

In [11]:
ans = search_word(model,'pharmacy is a very good business','article.txt',Phrase=True)
print(ans)

[('several analysts recently commented on the stock', 0.8922033751298493), ('institutional investors and hedge funds own of the stock', 0.9051617030331485), ('ultragenyx pharmaceutical company profile', 0.90547840638274), ('get ultragenyx pharmaceutical alerts ultragenyx pharmaceutical nasdaqrare last issued its earnings results on monday may', 0.9055612359833983), ('seven analysts have issued estimates for ultragenyx earnings with the lowest sales estimate coming in at million and the highest estimate coming in at million', 0.9217683664206967), ('the biopharmaceutical company reported earnings per share eps for the quarter missing the thomson consensus estimate of by', 0.9257237314479116), ('zacks analysts anticipate ultragenyx pharmaceutical inc rare will announce quarterly sales of million posted by maurice goldstein on may equities research analysts forecast that ultragenyx pharmaceutical inc nasdaqrare will post sales of million for the current fiscal quarter according to zacks', 