## Semantic Search Engine

In [18]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
from PyPDF2 import PdfReader
import os
import pandas as pd

In [14]:
def extract_text_from_pdf(url: str) -> str:
    # creating a pdf reader object
    reader = PdfReader(url)
    
    # printing number of pages in pdf file
    text = ''    
    for page_num in range(len(reader.pages)):
        # Extract text from the current page
        page = reader.pages[page_num]
        text += page.extract_text()
    return text


In [16]:
base_path = '../data/raw/dataset/'
content = []
names = []
with os.scandir(base_path) as entries:
    for entry in entries:
        if entry.name.endswith(".pdf"):
            names.append(entry.name)
            content.append(extract_text_from_pdf(base_path+entry.name))
dataset = {'document':names, 'summary': content}

In [20]:
df = pd.DataFrame(dataset)
df

Unnamed: 0,document,summary
0,Xalaxian_Advanced_Materials_Science.pdf,File Name: Xalaxian Advanced Materials Science...
1,Xalaxian_Astral_Projection_Techniques.pdf,File Name: Xalaxian Astral Projection Techniqu...
2,Xalaxian_Aurora-Equivalent_Energy_Expressions.pdf,File Name: Xalaxian_Aurora-Equivalent_Energy_E...
3,Xalaxian_Aurora-like_Phenomena.pdf,File Name: Xalaxian_Aurora-like_Phenomena.pdf\...
4,Xalaxian_Binary_Star_Energy_Harvesting.pdf,File Name: Xalaxian Binary Star Energy Harvest...
...,...,...
85,Xalaxian_Tranquil_Energy_Expanse_Research.pdf,File Name: Xalaxian Tranquil Energy Expanse Re...
86,Xalaxian_Twilight_Adaptation_Mechanisms.pdf,File Name: Xalaxian_Twilight_Adaptation_Mechan...
87,Xalaxian_Twilight_Sky_Phenomena.pdf,File Name: Xalaxian_Twilight_Sky_Phenomena.pdf...
88,Xalaxian_Universal_Ethics.pdf,File Name: Xalaxian Universal Ethics Framework...


In [21]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [22]:
passage_embeddings = list(model.encode(df['summary'].to_list(), show_progress_bar=True))

Batches: 100%|██████████| 3/3 [02:07<00:00, 42.52s/it]


In [23]:

# Print the shape of the first passage embedding
passage_embeddings[0].shape

(384,)

In [24]:

# Define a function to find relevant news articles based on a given query
def find_relevant_news(query):
    # Encode the query using the sentence transformer model
    query_embedding = model.encode(query)
    # Print the shape of the query embedding
    query_embedding.shape

    # Calculate the cosine similarity between the query embedding and the passage embeddings
    similarities = util.cos_sim(query_embedding, passage_embeddings)

    # Find the indices of the top 3 most similar passages
    top_indicies = torch.topk(similarities.flatten(), 3).indices

    # Get the top 3 relevant passages by slicing the summaries at 200 characters and adding an ellipsis
    top_relevant_passages = [df.iloc[x.item()]['summary'][:200] + "..." for x in top_indicies]

    # Return the top 3 relevant passages
    return top_relevant_passages

In [25]:
find_relevant_news("Material Science Technology")

  b = torch.tensor(b)


["File Name: Xalaxian Advanced Materials Science\nDescription: This document outlines the Xalaxians' cutting-edge materials science \ntechnology. The materials produced possess remarkable properties, such...",
 'File Name: Xalaxian_Planetary_Energy-Matter_Conversion.pdf\nDescription: This document provides an overview of the Xalaxians’ energy-to-\nmatter conversion technology, which allows them to transform ene...',
 "File Name: Xalaxian Planetary Energy-Matter Interaction\nDescription: This document outlines the intricate design and functionality of the \nXalaxians' planetary energy-matter interaction technology. Th..."]