In [None]:
# -*- coding: utf-8 -*-
"""RAG-IMPLEMENTATION.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/14XBnxd2YV49t_Z9xIeDC_os5ZiBdCEjA
"""

# Install the third-party packages that the imports below rely on.
# The leading "!" hands each line to the system shell, so this cell only works
# inside an IPython/Jupyter/Colab session; it is not valid plain Python.
# NOTE(review): scikit-learn, pandas and numpy are imported below but are not
# installed here -- presumably they are preinstalled on Colab; confirm before
# running this notebook in another environment.
!pip install transformers
!pip install sentence_transformers
!pip install requests
!pip install bs4
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup #BeautifulSoup is a web scraping tool used to parse HTML and XML documents
from transformers import pipeline #used to generate text using a pre-trained model
from sentence_transformers import SentenceTransformer #used to convert text into vectors
from sklearn.metrics.pairwise import cosine_similarity #used to calculate the similarity between two vectors

# Authenticate this session with the Hugging Face Hub. The command is run by
# the shell ("!" prefix) and prompts interactively for an access token, so it
# blocks until a token is typed in.
# NOTE(review): the two models loaded below look like public checkpoints, so
# this login may be unnecessary -- confirm before keeping an interactive step.
!huggingface-cli login ##accessing the HuggingFace platform using the account token

# Hugging Face Hub identifiers of the two pre-trained models used by this
# notebook, named once so that either model is easy to swap.
GENERATOR_MODEL_ID = 'openai-community/gpt2'
EMBEDDING_MODEL_ID = 'sentence-transformers/all-mpnet-base-v2'

# Text-generation pipeline backed by the GPT-2 checkpoint; it produces the
# final answer from the assembled prompt.
generator_gpt2 = pipeline(task='text-generation', model=GENERATOR_MODEL_ID)

# Sentence encoder that maps text to dense vectors. It is used further down to
# embed both the scraped documents and the question so that they can be
# compared with cosine similarity.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)

# Pages whose visible text forms the retrieval corpus (one document per URL).
urls = [
    'https://www.dc.com/characters/superman',
    'https://www.dc.com/characters/batman',
    'https://www.dc.com/characters/the-joker',
    'https://www.dc.com/characters/doomsday'
]

# Upper bound, in seconds, on how long a single request may wait for the
# server. Without a timeout `requests.get` can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 10


def _fetch_page_text(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download *url* and return its paragraph and heading text as one string.

    The text of every non-empty <p>, <h1>, <h2> and <h3> element is stripped
    of surrounding whitespace and joined with single spaces, in document
    order.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The concatenated text of the page ("" if no matching element has text).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never silently embedded as a document.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Strip each element's text once, then drop the ones that end up empty.
    stripped_texts = (
        tag.get_text().strip()
        for tag in soup.find_all(['p', 'h1', 'h2', 'h3'])
    )
    return " ".join(text for text in stripped_texts if text)


# One string per URL, in the same order as `urls`.
text_paragraphs = [_fetch_page_text(url) for url in urls]

print(text_paragraphs)

# Encode every scraped document into a vector; row i of the result belongs to
# text_paragraphs[i].
document_embeddings = embedding_model.encode(text_paragraphs)

# Tabular view for inspection: the raw text in a "text" column, followed by
# the embedding values of the same document in the remaining columns.
df_text_paragraphs = pd.DataFrame(data=text_paragraphs, columns=["text"])
df_document_embeddings = pd.DataFrame(data=document_embeddings)
df_paragraphs_embeddings = pd.concat(
    objs=[df_text_paragraphs, df_document_embeddings],
    axis="columns",
)

# The user query, embedded with the same model as the documents so that both
# can be compared directly.
question = "What are the superpowers of the heroes?"
question_embedding = embedding_model.encode([question])[0]

# Cosine similarity between the question and every document. The query is
# wrapped in a one-row list and the single result row is unwrapped again,
# leaving one score per document.
query_rows = [question_embedding]
similarities = cosine_similarity(query_rows, document_embeddings)[0]

# Rank document indices from most to least similar and keep the first four.
ranked_indices = np.argsort(similarities)[::-1]
top_indices = ranked_indices[:4]

# Pair each retrieved document with its similarity score, best match first.
top_k_doc = []
for doc_index in top_indices:
    top_k_doc.append((text_paragraphs[doc_index], similarities[doc_index]))
print(top_k_doc)

# Concatenate the retrieved documents into the context handed to the generator.
context = " ".join(doc_text for doc_text, _score in top_k_doc)
print(context)

# Assemble the prompt: instruction first, then the question, then the
# retrieved context. Because the context comes last, cutting the prompt on
# the right only ever removes context text, never the question.
system_prompt = "Please provide a concise and accurate response."
prompt = f"system: {system_prompt} Question: {question} Context: {context}"

# Number of tokens the generator may append to the prompt.
MAX_NEW_TOKENS = 50
# Slack for the few tokens that can differ when the truncated prompt is
# decoded back to text and tokenized again inside the pipeline.
TOKEN_SAFETY_MARGIN = 8

# The generator can only attend to a fixed number of positions. Feeding it a
# longer prompt fails with "IndexError: index out of range in self", so the
# prompt must leave room for the tokens that will be generated.
tokenizer = generator_gpt2.tokenizer
context_window = min(
    tokenizer.model_max_length,
    getattr(
        generator_gpt2.model.config,
        "max_position_embeddings",
        tokenizer.model_max_length,
    ),
)
prompt_token_budget = context_window - MAX_NEW_TOKENS - TOKEN_SAFETY_MARGIN

prompt_token_ids = tokenizer(
    prompt,
    truncation=True,
    max_length=prompt_token_budget,
)["input_ids"]
# Only rewrite the prompt when it filled the whole budget (i.e. it was, or
# may have been, cut); shorter prompts are passed through untouched.
if len(prompt_token_ids) >= prompt_token_budget:
    prompt = tokenizer.decode(
        prompt_token_ids,
        clean_up_tokenization_spaces=False,
    )

# 'generated_text' holds the prompt followed by the newly generated tokens.
response_with_rag = generator_gpt2(
    prompt,
    max_new_tokens=MAX_NEW_TOKENS,
    num_return_sequences=1,
)[0]['generated_text']

print("Response with RAG:\n", response_with_rag, end="\n\n")

# Retrieval precision: the share of retrieved documents that are relevant.

# Placeholder ground truth for demonstration purposes: document indices 0-3
# are all marked as relevant. A real evaluation would label relevance per
# question instead of hard-coding it.
relevant_indices = [0, 1, 2, 3]

relevant_lookup = set(relevant_indices)
total_retrieved = len(top_indices)
number_relevant_retrieved = sum(
    1 for retrieved_index in top_indices if retrieved_index in relevant_lookup
)

# Guard against dividing by zero when nothing was retrieved.
if total_retrieved > 0:
    precision = number_relevant_retrieved / total_retrieved
else:
    precision = 0
print(f"Precision: {precision}")



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGr

Token indices sequence length is longer than the specified maximum sequence length for this model (9548 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[("Doomsday A weapon crafted in the most extreme conditions imaginable, Doomsday is an ultimate and unstoppable force of destruction and devastation.Long ago, scientists on the planet Krypton engineered a creature to be the ultimate weapon. This creature, the Ultimate, had no purpose except destruction, and made his way across other planets before finally being defeated. Buried on the primitive planet Earth, he stirred awake after many centuries and unleashed death, earning himself the name Doomsday. The monster was finally stopped by Superman, who seemingly lost his own life in the process. Driven by an instinct to hunt the inhabitants of his own homeworld, Doomsday has resurfaced time and time again to threaten Superman and his adopted planet—no matter what gets in his way.For more on Doomsday's history, visit his page on DCUniverse.com. A weapon crafted in the most extreme conditions imaginable, Doomsday is an ultimate and unstoppable force of destruction and devastation. Long ago, 

IndexError: index out of range in self