## Semantic Search Engine

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
from PyPDF2 import PdfReader
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_text_from_pdf(url: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        url: Location of the PDF, passed unchanged to ``PdfReader``
            (this notebook calls it with local file paths).

    Returns:
        The text extracted from each page, joined in page order with no
        separator between pages.
    """
    reader = PdfReader(url)

    # Walk the pages directly and join once rather than indexing by position
    # and growing a string with ``+=``. The ``or ''`` is a defensive guard:
    # a page whose extraction yields nothing contributes an empty string
    # instead of raising a TypeError during concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)


Create Dataset

In [7]:
# Build the corpus: read every .pdf and .txt file under base_path and keep
# two parallel lists, the file names and the text extracted from each file.
base_path = '../data/raw/dataset/'
content = []
names = []
with os.scandir(base_path) as entries:
    # os.scandir yields entries in arbitrary order; sorting by name keeps the
    # row order (and therefore the row indices used later) stable across runs.
    for entry in sorted(entries, key=lambda e: e.name):
        if entry.name.endswith(".pdf"):
            text = extract_text_from_pdf(entry.path)
        elif entry.name.endswith(".txt"):
            # Read-only mode is sufficient, and the context manager closes
            # the handle even if reading raises.
            with open(entry.path, "r", encoding="utf8") as file:
                # Lines keep their trailing newline and are joined with a
                # single space, exactly as the text was assembled before.
                text = ' '.join(file.readlines())
        else:
            # Any other file type is ignored.
            continue
        # Append name and text together, only after the read succeeded, so
        # the two lists can never get out of step.
        names.append(entry.name)
        content.append(text)

dataset = {'document': names, 'summary': content}

In [8]:
# Tabulate the corpus: a 'document' column with the file names and a
# 'summary' column with the text extracted from each file.
df = pd.DataFrame.from_dict(dataset)
df

Unnamed: 0,document,summary
0,Xalaxian_Advanced_Materials_Science.pdf,File Name: Xalaxian Advanced Materials Science...
1,Xalaxian_Astral_Projection_Techniques.pdf,File Name: Xalaxian Astral Projection Techniqu...
2,Xalaxian_Aurora-Equivalent_Energy_Expressions.pdf,File Name: Xalaxian_Aurora-Equivalent_Energy_E...
3,Xalaxian_Aurora-like_Phenomena.pdf,File Name: Xalaxian_Aurora-like_Phenomena.pdf\...
4,Xalaxian_Binary_Star_Energy_Harvesting.pdf,File Name: Xalaxian Binary Star Energy Harvest...
...,...,...
99,Xalaxian_Tranquil_Energy_Expanse_Research.pdf,File Name: Xalaxian Tranquil Energy Expanse Re...
100,Xalaxian_Twilight_Adaptation_Mechanisms.pdf,File Name: Xalaxian_Twilight_Adaptation_Mechan...
101,Xalaxian_Twilight_Sky_Phenomena.pdf,File Name: Xalaxian_Twilight_Sky_Phenomena.pdf...
102,Xalaxian_Universal_Ethics.pdf,File Name: Xalaxian Universal Ethics Framework...


In [9]:
# Load the sentence-embedding model that encodes both the passages and the queries.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [10]:
# Embed every document's text in one batched call. The result is kept as the
# single array returned by ``encode`` rather than being split into a Python
# list of per-row arrays: handing such a list to ``util.cos_sim`` forces a
# slow list-of-ndarrays -> tensor conversion (and a warning) on every query.
# Indexing a row, e.g. ``passage_embeddings[0]``, works the same as before.
# NOTE(review): each document is encoded as one input; text beyond the
# model's maximum sequence length is presumably truncated — confirm.
passage_embeddings = model.encode(df['summary'].to_list(), show_progress_bar=True)

Batches: 100%|██████████| 4/4 [02:48<00:00, 42.17s/it]


In [11]:

# Sanity check: display the shape of the first passage embedding
# (a single vector for the first document).
passage_embeddings[0].shape

(384,)

In [13]:

# Semantic search over the embedded documents
def find_relevant_info(query, top_k=3):
    """Return preview snippets of the passages most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared by
    cosine similarity against ``passage_embeddings``; the best-scoring rows
    of ``df`` are returned in descending order of similarity.

    Args:
        query: Search text to embed.
        top_k: Maximum number of passages to return (defaults to 3). It is
            clamped to the number of available passages, so a small corpus
            no longer makes ``torch.topk`` raise.

    Returns:
        A list of strings, each being the first 200 characters of a
        matching ``df['summary']`` entry followed by ``"..."``.
    """
    # Encode the query using the sentence transformer model
    query_embedding = model.encode(query)

    # Cosine similarity between the query and every passage, flattened to 1-D
    similarities = util.cos_sim(query_embedding, passage_embeddings).flatten()

    # Never request more results than there are passages
    k = max(0, min(top_k, similarities.numel()))
    top_indices = torch.topk(similarities, k).indices.tolist()

    # Truncate each matching text to a 200-character preview plus an ellipsis
    summaries = df['summary']
    return [summaries.iloc[i][:200] + "..." for i in top_indices]

In [14]:
# Example search: look up the passages closest to a broad topical phrase.
find_relevant_info(query="Society and Culture")

  b = torch.tensor(b)


['File Name: Xalaxian Societal Structure\nDescription: This document details the unique societal structure of the Xalaxians. \nIt outlines the fundamental elements of their collective decision-making proc...',
 'Insights into Xalaxian society would be crucial for navigating our relationship with them and understanding their decision-making process. \n \n Xalaxians operate in a communal-style society and there i...',
 'File Name: Xalaxian_Collective_Consciousness_Impacts.pdf\nDescription: This document provides an overview of the impact of Xalaxian \ncollective consciousness on their society and culture.\nData:\n- The X...']

In [15]:
# Example search: a phrase naming a specific technology.
find_relevant_info(query="Space time manipulation")

["File Name: Xalaxian Space-Time Manipulation Technology\nDescription: This document outlines the intricate design and functionality of the \nXalaxians' space-time manipulation technology. Predominantly u...",
 'File Name: Xalaxian Spatial Manipulation\nDescription: This document examines the advanced spatial manipulation \ncapabilities of the Xalaxians. Enabled by their intricate knowledge of space-time, \nthe ...',
 'File Name: Xalaxian Astral Projection Techniques\nDescription: This document outlines the advanced astral projection techniques \nutilized by the Xalaxians to traverse vast cosmic distances almost insta...']

In [16]:
# Example search: a loosely worded, generic phrase.
find_relevant_info(query="Great Technology")

['As the Xalaxians are an advanced civilization, understanding their technology might lead to quantum leaps in human tech innovation.\n \n The Xalaxians have harnessed the power of energy to create advanc...',
 'File Name: Xalaxian_Planetary_Energy_Conservation.pdf\nDescription: This document outlines the principles and practices of Xalaxian \nenergy conservation and sustainability. Xalaxian society is committe...',
 'File Name: Xalaxian Strategies for Shared Knowledge\nDescription: This document examines the strategies employed by the Xalaxians in \ntheir efforts to share their knowledge and technology with less adv...']