# Semantic Index

## Loading libraries

In [None]:
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import torch


## Defining the dataset

In [None]:


# Fetch the "test" split of the multi_news dataset
dataset = load_dataset("multi_news", split="test")

# Convert to a DataFrame, then draw a fixed-seed random sample of 2000 rows
# so the same subset is selected on every run
full_df = dataset.to_pandas()
df = full_df.sample(n=2000, random_state=42)

## Defining the model

In [None]:
# Load the pretrained sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every summary once up front. The result of encode() is kept as
# returned (one row per summary) instead of being wrapped in list(): a Python
# list of separate arrays forces util.cos_sim to rebuild a tensor from that
# list on every query, which PyTorch warns is extremely slow.
passage_embeddings = model.encode(df["summary"].to_list(), show_progress_bar=True)

## Querying the embedding

In [None]:

# Free-text query. Matching is done on the semantic similarity between the
# query embedding and the article embeddings, not on keyword overlap.
query = "Find me some articles about technology and artificial intelligence"

query_embedding = model.encode(query)

# Sanity check: a query vector and a single passage vector must have the
# same shape for the similarity computation to be meaningful
print(query_embedding.shape)
print(passage_embeddings[0].shape)

# Cosine similarity of the query against every passage
similarities = util.cos_sim(query_embedding, passage_embeddings)

# Positions of the three highest-scoring passages
top_indices = torch.topk(similarities.flatten(), 3).indices

# Build a 200-character preview of each matching summary
top_relevant_passages = []
for idx in top_indices:
    preview = df.iloc[idx.item()]["summary"][:200]
    top_relevant_passages.append(preview + "...")

print(top_relevant_passages)

## Make it into a function

In [None]:

def find_relevant_news(query, top_k=3):
    """Return previews of the passages most semantically similar to ``query``.

    The query is encoded with the module-level ``model`` and scored by cosine
    similarity against the precomputed ``passage_embeddings``. The
    best-scoring rows of ``df`` are returned, most similar first.

    Args:
        query: Free-text search string.
        top_k: Maximum number of passages to return. Defaults to 3. Values
            larger than the number of indexed passages are clamped.

    Returns:
        A list of strings, each the first 200 characters of a matching
        summary followed by "...". Empty if ``top_k`` is not positive or
        there are no passages.
    """
    # torch.topk raises when k exceeds the number of candidates, so clamp it
    k = min(top_k, len(passage_embeddings))
    if k <= 0:
        return []

    query_embedding = model.encode(query)

    # cos_sim yields one row of scores for the single query; flatten to 1-D
    similarities = util.cos_sim(query_embedding, passage_embeddings).flatten()

    # topk returns the indices ordered from highest to lowest score
    top_indices = torch.topk(similarities, k).indices.tolist()

    return [df.iloc[i]["summary"][:200] + "..." for i in top_indices]

## Find relevant news for different queries

In [None]:
# Print every result explicitly: a notebook cell only displays the value of
# its final expression, so bare calls would hide all but the last query
for topic in (
    "Natural disasters",
    "Law enforcement and police",
    "Politics, diplomacy and nationalism",
):
    print(topic)
    print(find_relevant_news(topic))