In [1]:
import pandas as pd
import numpy as np
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/aganap12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_directory = './dataset/aws-case-studies-blogs-dataset'
file_paths = [os.path.join(data_directory, file) for file in os.listdir(data_directory) if file.endswith('.txt')]

In [4]:
tagged_data = []

for i, file_path in enumerate(file_paths):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        words = word_tokenize(text)
        words = [word.lower() for word in words]
        tagged_data.append(TaggedDocument(words, tags=['doc_' + str(i)]))

In [5]:
model = Doc2Vec()
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [6]:
document_vectors = [model.dv['doc_' + str(i)] for i in range(len(tagged_data))]

In [7]:
vector_dim = len(document_vectors[0])
vector_dim

100

In [8]:
import faiss

In [77]:
index = faiss.IndexFlatL2(vector_dim)

In [78]:
vectors_np = np.array(document_vectors).astype('float32')

In [95]:
query = "Impossible"
query_vector = model.infer_vector(query.lower().split(" "))
query_vector_np = np.array([query_vector]).astype('float32')
k = 5

In [96]:
print(f"query: ", query)
print(f"query vector np: ", query_vector_np)

query:  Impossible
query vector np:  [[-0.01299587 -0.00353058 -0.04049246 -0.00553903 -0.00101143 -0.01854867
  -0.00633003  0.01701172 -0.01044346  0.03374536  0.00243368 -0.02397835
  -0.01233336 -0.01264377  0.01176688 -0.05135999 -0.00831006  0.01067295
  -0.00025316 -0.0420626   0.00578015  0.01724018 -0.01756684  0.02963734
  -0.00810184  0.00090881 -0.03945294 -0.03216579 -0.01328333 -0.04100549
   0.06391503  0.03699208 -0.00595694 -0.04445992 -0.02976395  0.02315748
  -0.00686445 -0.01566941 -0.04087256 -0.0214999   0.02058663 -0.01258939
  -0.00606857 -0.05469578  0.0092194  -0.02426293 -0.00903691 -0.01042843
   0.00562668 -0.00485665 -0.02114696  0.00989622 -0.04447369 -0.06061672
  -0.01748548  0.0466786   0.01143741  0.02421646 -0.03835709  0.03592908
   0.044184    0.06704877  0.02986864 -0.02461152 -0.0099878   0.02591041
   0.01938703 -0.02415827 -0.05534476 -0.00873513 -0.04166371  0.00113672
  -0.02034007 -0.00457506  0.02015191  0.01310941  0.01386276 -0.03367967
 

In [103]:
index.reset()
index.add(vectors_np)
distances, indices = index.search(query_vector_np, k)

In [104]:
indices

array([[297, 299, 198, 247, 333]])

In [105]:
nearest_files = [file_paths[i] for i in indices.flatten()]
nearest_files

['./dataset/aws-case-studies-blogs-dataset/Better Mortgage using Amazon Elastic Kubernetes _ Better Mortgage Video _ AWS.txt',
 './dataset/aws-case-studies-blogs-dataset/TEG on using Machine Learning and Amazon Personalize to boost user engagement and ticket sales _ Ticketek Video _ AWS.txt',
 './dataset/aws-case-studies-blogs-dataset/Razer Deepened Gamer Engagement using Amazon Personalize _ Video Testimonial _ AWS.txt',
 './dataset/aws-case-studies-blogs-dataset/DB Energie Case Study.txt',
 './dataset/aws-case-studies-blogs-dataset/AppsFlyer Amazon EKS Case Study _ Advertising _ AWS.txt']