In [1]:
import pandas as pd
import numpy as np
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import time

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/aganap12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_directory = '../dataset/aws-case-studies-blogs-dataset'
file_paths = [os.path.join(data_directory, file) for file in os.listdir(data_directory) if file.endswith('.txt')]

In [4]:
tagged_data = []

for i, file_path in enumerate(file_paths):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        words = word_tokenize(text)
        words = [word.lower() for word in words]
        tagged_data.append(TaggedDocument(words, tags=['doc_' + str(i)]))

In [5]:
model = Doc2Vec()
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [6]:
document_vectors = [model.dv['doc_' + str(i)] for i in range(len(tagged_data))]

In [7]:
vector_dim = len(document_vectors[0])
vector_dim

100

In [8]:
import faiss

In [9]:
index = faiss.IndexFlatL2(vector_dim)

In [10]:
vectors_np = np.array(document_vectors).astype('float32')

In [11]:
query = "Impossible"
query_vector = model.infer_vector(query.lower().split(" "))
query_vector_np = np.array([query_vector]).astype('float32')
k = 5

In [12]:
print(f"query: ", query)
print(f"query vector np: ", query_vector_np)

query:  Impossible
query vector np:  [[-0.05687655 -0.00830367 -0.03966373  0.03002366 -0.01787311 -0.02410796
   0.0034225   0.02776333 -0.03873994  0.03093769 -0.02414584 -0.01320349
  -0.00567666 -0.02316937  0.01254354 -0.04767926  0.00289311  0.01243417
  -0.02308659 -0.00029333  0.00828836  0.02198659 -0.02134611  0.02374741
  -0.01084088 -0.01702005 -0.0468153  -0.04631354  0.01536928 -0.0497291
   0.0437691   0.01909008 -0.01138894 -0.03359936 -0.03054984  0.02224889
  -0.0103604  -0.02768533 -0.02809381 -0.03413468  0.01769208 -0.01010516
   0.00916497  0.00037042 -0.01481792 -0.06018982 -0.01002642 -0.01692454
  -0.02098884 -0.05032591 -0.01508853  0.02784039 -0.01729966 -0.03031973
  -0.03567794  0.04113714  0.00750158  0.03514955 -0.05919312  0.02942149
   0.05943093  0.05109467  0.02294203 -0.01164501  0.02192098  0.00486652
   0.0238451  -0.00182904 -0.05043935 -0.01726627 -0.09077703  0.00924592
  -0.04071798 -0.00925202  0.01333387  0.03753878  0.03307252 -0.03371513
  

In [13]:
index.reset()
index.add(vectors_np)
start_time = time.time()
distances, indices = index.search(query_vector_np, k)
end_time = time.time()
retrieval_time = (end_time - start_time) * 1e6
print("Retrieval time:", retrieval_time, "microseconds")

Retrieval time: 117281.91375732422 microseconds


In [14]:
indices

array([[297, 299, 198, 333, 339]])

In [15]:
nearest_files = [file_paths[i] for i in indices.flatten()]
nearest_files

['../dataset/aws-case-studies-blogs-dataset/Better Mortgage using Amazon Elastic Kubernetes _ Better Mortgage Video _ AWS.txt',
 '../dataset/aws-case-studies-blogs-dataset/TEG on using Machine Learning and Amazon Personalize to boost user engagement and ticket sales _ Ticketek Video _ AWS.txt',
 '../dataset/aws-case-studies-blogs-dataset/Razer Deepened Gamer Engagement using Amazon Personalize _ Video Testimonial _ AWS.txt',
 '../dataset/aws-case-studies-blogs-dataset/AppsFlyer Amazon EKS Case Study _ Advertising _ AWS.txt',
 '../dataset/aws-case-studies-blogs-dataset/Circle of Life _ Amazon Web Services.txt']