In [3]:
import math
import os
import string
from collections import Counter

import fasttext
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# Path to the corpus text file (read line by line further down, one document per line).
# Set the HORROR_MOVIE_SUMMARIES_PATH environment variable to point at a local copy;
# when it is unset, the original author's location is used as the fallback.
HorrorMovieSummaries = os.environ.get(
    'HORROR_MOVIE_SUMMARIES_PATH',
    '/Users/valentine/Desktop/EPFL/Master/Ma1/ADA/HorrorMovieSummaries.txt',
)

In [5]:
# Train unsupervised fastText word embeddings on the raw corpus file,
# using the library's default hyper-parameters.
model = fasttext.train_unsupervised(input=HorrorMovieSummaries)

Read 1M words
Number of words:  22283
Number of labels: 0
Progress: 100.0% words/sec/thread:   53702 lr:  0.000000 avg.loss:  2.271591 ETA:   0h 0m 0s


In [6]:
# Sanity check of the trained embeddings: list the 10 nearest neighbours of 'ghost'.
# The neighbours shown in the output include punctuation-attached forms
# (e.g. 'ghost,' and 'ghosts;'), i.e. the corpus tokens keep their punctuation.
model.get_nearest_neighbors('ghost', k=10)

[(0.9432440400123596, 'ghost,'),
 (0.9272719025611877, 'ghosts;'),
 (0.8721774816513062, 'ghostly'),
 (0.8704167008399963, 'ghosts'),
 (0.8700905442237854, 'ghost.'),
 (0.8285982608795166, 'ghosts,'),
 (0.8260856866836548, "ghost's"),
 (0.8131915330886841, 'ghostbuster'),
 (0.7696982622146606, 'ghosts.'),
 (0.7548399567604065, 'ghost-like')]

In [7]:
# Every word known to the trained model, and a matrix holding one embedding
# row per word, in the same order as `vocabulary`.
vocabulary = model.words
word_embeddings = np.array([model.get_word_vector(term) for term in vocabulary])

In [8]:
# Load the corpus with one document per line. The encoding is stated explicitly so
# that decoding does not depend on the platform's default locale.
with open(HorrorMovieSummaries, encoding='utf-8') as f:
    content = f.readlines()

# Strip surrounding whitespace (including the trailing newline) from every line.
# Blank lines are kept so that list positions still match the file's line order.
original_documents = [x.strip() for x in content]

In [9]:
# Map each vocabulary word to its embedding row for direct lookup by token
vector_dict = {word: vector for word, vector in zip(vocabulary, word_embeddings)}

def aggregate_vector_list(vlist, aggfunc):
    """Collapse a list of equal-length vectors into a single vector.

    Parameters
    ----------
    vlist : sequence of 1-D array-likes
        The vectors to combine. All of them must have the same length.
    aggfunc : str
        'max', 'min' or 'mean' select the element-wise reduction taken across
        the vectors. Any other value yields an all-zero vector with the same
        number of components as the inputs.

    Returns
    -------
    numpy.ndarray
        One vector with as many components as each input vector.

    Raises
    ------
    ValueError
        If `vlist` is empty: there is nothing to aggregate and the output
        dimensionality cannot be determined.
    """
    # Convert once instead of once per branch.
    stacked = np.array(vlist)

    # Fail the same way for every aggfunc; previously an empty input produced a
    # ValueError, a NaN scalar or an IndexError depending on the branch taken.
    if stacked.ndim >= 1 and stacked.shape[0] == 0:
        raise ValueError("Cannot aggregate an empty list of vectors.")

    if aggfunc == 'max':
        return stacked.max(axis=0)
    if aggfunc == 'min':
        return stacked.min(axis=0)
    if aggfunc == 'mean':
        return stacked.mean(axis=0)
    # Unknown aggregation name: fall back to a zero vector of matching size.
    return np.zeros(stacked.shape[1])

possible_aggfuncs = ["max", "min", "mean"]

# One (n_documents x embedding_dim) matrix per aggregation function.
# Rows start as zeros and stay zero for documents without any known token.
aggregated_doc_vectors = {
    aggfunc: np.zeros((len(original_documents), word_embeddings.shape[1]))
    for aggfunc in possible_aggfuncs
}

# Aggregate vectors of documents beforehand.
# Each document is tokenized and looked up only once, then reduced with every
# aggregation function, instead of repeating the lookup per aggregation.
for index, doc in enumerate(original_documents):
    vlist = [vector_dict[token] for token in fasttext.tokenize(doc) if token in vector_dict]
    if not vlist:
        continue
    for aggfunc in possible_aggfuncs:
        aggregated_doc_vectors[aggfunc][index] = aggregate_vector_list(vlist, aggfunc)

In [12]:
def aggregate_query(query, aggfunc):
    """Build a single embedding vector for a free-text query.

    The query is tokenized with `fasttext.tokenize`; tokens missing from the
    vocabulary are ignored.

    Parameters
    ----------
    query : str
        The search text.
    aggfunc : str
        Aggregation name forwarded to `aggregate_vector_list` when the query
        has more than one token.

    Returns
    -------
    numpy.ndarray or None
        The stored vector of the token for a single-token query, the aggregated
        vector of the known tokens for a longer query, or None (after printing
        a message) when no token of the query is in the vocabulary.
    """
    tokens = fasttext.tokenize(query)
    # vector_dict is keyed by the vocabulary, so a dict lookup replaces the
    # linear scan through the vocabulary list.
    vlist = [vector_dict[token] for token in tokens if token in vector_dict]

    if not vlist:
        # Covers an empty query, an unknown single word, and a multi-word query
        # whose words are all unknown (which used to crash during aggregation).
        print("%s is not in the vocabulary." % (query))
        return None

    if len(tokens) == 1:
        # A single known token is returned as-is, whatever `aggfunc` is.
        return vlist[0]

    return aggregate_vector_list(vlist, aggfunc)
    
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    """Rank all documents by cosine similarity to a query vector.

    Parameters
    ----------
    query_vector : numpy.ndarray
        Query embedding; it is reshaped into a single row before comparison.
    aggfunc : str
        Key selecting which pre-computed matrix in `aggregated_doc_vectors`
        the query is compared against.
    k : int, optional
        Accepted but not applied by this function: the complete ranking is
        returned and callers slice it to the size they need.

    Returns
    -------
    numpy.ndarray
        Document indexes ordered from most to least similar.
    """
    similarities = cosine_similarity(
        query_vector.reshape(1, -1),
        aggregated_doc_vectors[aggfunc],
    )
    # Rank the document vectors according to their cosine similarity with the query.
    # argsort orders ascending, so take the only row and reverse it to get best-first.
    ascending = np.argsort(similarities, axis=-1, kind='quicksort', order=None)
    return ascending[0][::-1]

def search_vec_embeddings(query, topk = 10, aggfunc = 'mean'):
    """Print the documents most similar to a query and return their IDs.

    Parameters
    ----------
    query : str
        The search text.
    topk : int, optional
        Maximum number of documents to print and return.
    aggfunc : str, optional
        Aggregation used for both the query vector and the document vectors.

    Returns
    -------
    list of str
        The first whitespace-delimited field of each retrieved document line,
        best match first. The list is empty when no query vector could be
        built (for example, the query word is not in the vocabulary).
    """
    query_vector = aggregate_query(query, aggfunc)
    if query_vector is None:
        # Nothing to compare against; avoid calling .reshape on None downstream.
        return []

    ranked = get_most_similar_documents(query_vector, aggfunc)

    # Print the top k documents
    IDs = []
    for index in ranked[:topk]:
        document = original_documents[index]
        print(document)
        # Only the leading field is needed, so stop splitting after it.
        IDs.append(document.split(maxsplit=1)[0])
    return IDs


In [13]:
# Retrieve the documents closest to the query 'Killer' using mean-aggregated
# vectors (default top 10); the documents are printed and their IDs returned.
search_vec_embeddings(query='Killer', aggfunc='mean')

26513406	Three criminology students visit the site of the murders perpetrated by an infamous serial killer. They intend to collect proof on film for a final doctrate paper.{{cite web}}{{cite web}}
23799559	Based on the true story of serial murderer Gary Ridgway, the film depicts how he would approach prostitutes in bars, then take them to his homes and brutally kill them. Then he'd throw the corpses into the Green River, which is where the name "Green River Killer" comes from. Soon the investigating police officers are on his trackCD Universe - Green River Killer DVD Movie.
21150086	In 1888, Jack the Ripper is on his killing spree. Scotland Yard Inspector O'Neill  welcomes a visit from his old friend New York City detective Sam Lowry  who agrees to assist with the investigation. Sam becomes attracted to modern woman Anne Ford  but her guardian, Dr. Tranter  doesn't approve. The police slowly close in the killer as the public becomes more alarmed. The killer's identity is revealed and t

['26513406',
 '23799559',
 '21150086',
 '28376038',
 '21287052',
 '13296475',
 '11699160',
 '21182881',
 '24665513',
 '23565442']