<a href="https://colab.research.google.com/github/giuliocapecchi/LM-project/blob/main/rag_jja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q huggingface-hub python-dotenv transformers sentence-transformers langchain-community langchain-huggingface langchain tqdm regex gradio unidecode pymupdf chromadb bitsandbytes langchain_chroma --progress-bar off

In [2]:
# NOTE(review): this cell repeats the install from the first cell, minus
# langchain_chroma. It looks redundant — confirm before removing it.
#!pip install -U bitsandbytes --progress-bar off
!pip install -q huggingface-hub python-dotenv transformers sentence-transformers langchain-community langchain-huggingface langchain tqdm regex gradio unidecode pymupdf chromadb bitsandbytes --progress-bar off

In [3]:
# Required: downloads and extracts the HotpotQA dataset used by the cells below
import requests
import zipfile
import os

# URL of the ZIP archive
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/hotpotqa.zip"

# Local name of the ZIP file
zip_file_name = "hotpotqa.zip"

# Extraction directory
extract_dir = "hotpotqa"

# Download the ZIP archive. The body is streamed to disk in 1 MiB chunks so the
# archive is never held in memory, the timeout keeps a stalled connection from
# hanging the cell forever, and the `with` block releases the connection.
print("Scaricamento del file ZIP...")
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code != 200:
        print(f"Errore durante il download: {response.status_code}")
        # Raise instead of calling exit(): exit() is meant for interactive
        # shells, while an exception stops this cell and reports the failure.
        raise RuntimeError(f"Download failed with HTTP status {response.status_code}")
    with open(zip_file_name, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
print(f"Scaricamento completato: {zip_file_name}")

# Extract the contents of the ZIP archive
print(f"Estrazione del file ZIP in {extract_dir}...")
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Estrazione completata. Contenuti disponibili nella cartella: {extract_dir}")

# Remove the ZIP file after extraction
os.remove(zip_file_name)
print(f"File ZIP rimosso: {zip_file_name}")


Scaricamento del file ZIP...
Scaricamento completato: hotpotqa.zip
Estrazione del file ZIP in hotpotqa...
Estrazione completata. Contenuti disponibili nella cartella: hotpotqa


In [4]:
import json

def print_first_three_elements(file_path):
    """
    Print the first three records of a JSON Lines (.jsonl) file.

    Each line is parsed as one JSON object and printed; reading stops after
    the third record. A missing file or a line that is not valid JSON is
    reported with a printed message instead of raising.

    :param file_path: Path to the corpus.jsonl file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for position, raw_line in enumerate(handle, start=1):
                # Every line holds one JSON object
                print(json.loads(raw_line.strip()))
                # Three records are enough for a preview
                if position == 3:
                    break
    except FileNotFoundError:
        print(f"File non trovato: {file_path}")
    except json.JSONDecodeError as e:
        print(f"Errore nel parsing del file JSONL: {e}")

# Location of the corpus file; later cells reuse file_path
file_path = "hotpotqa/hotpotqa/corpus.jsonl"

# Preview a few records of the corpus
print_first_three_elements(file_path)

# Count the documents: the file holds one record per line
total_documents = 0
with open(file_path, 'r', encoding='utf-8') as file:
    for _ in file:
        total_documents += 1
print(f"Numero totale di documenti: {total_documents}")


{'_id': '12', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.', 'metadata': {'url': 'https://en.wikipedia.org/wiki?curid=12'}}
{'_id': '25', 'title': 'Autism', 'text': "Autism is a neurodevelopmental disorder characterized by impaired social interaction, impaired verbal and non-verbal communication, and restricted and repetitive behavior. Parents usually notice signs in the first two years of their child's life. These signs often develop gradually, though some children with autism reach their developmental milestones at a normal pace and then regress. The diagnostic criteria require that symptoms become apparent in early childhood, typically before age three.

In [5]:
from dotenv import load_dotenv
import os
from huggingface_hub import login


# Obtain the Hugging Face token from the Colab secret store when the
# google.colab module can be imported, otherwise from a .env file / the
# process environment.
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("Running in Google Colab. Using userdata to get HF_TOKEN.")
except ModuleNotFoundError:
    # google.colab is not installed: load variables from .env and read HF_TOKEN
    load_dotenv()
    HF_TOKEN = os.getenv('HF_TOKEN')
    print("Not running in Google Colab. Using load_dotenv to get HF_TOKEN.")

# NOTE(review): os.getenv returns None when HF_TOKEN is not set, so login may
# then be called with token=None — confirm that is the intended fallback.
login(token=HF_TOKEN)

Running in Google Colab. Using userdata to get HF_TOKEN.


In [6]:
import torch

# Expose only GPU 0 to CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Pick the compute device: MPS first, then CUDA, otherwise the CPU
device = "cpu"
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda:0"

# Report the choice
if device == "mps":
    print("MPS is available. Working on MPS.")
elif device == "cuda:0":
    print("CUDA is available. Working on GPU.")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA and MPS not available. Working on CPU.")

CUDA is available. Working on GPU.
GPU name: Tesla T4


In [7]:
import json
import re
from unidecode import unidecode
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents.base import Document

# Load documents from the JSON Lines file
def load_documents(file_path):
    """
    Read a JSON Lines file and wrap every record that has a "text" field in a Document.

    The record's "text" becomes the page content and its "title" (empty string
    when absent) is stored in the metadata. The average character length of
    the loaded documents is printed (0 when nothing was loaded).

    :param file_path: Path to the .jsonl file
    :return: list of Document objects
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = (json.loads(raw_line.strip()) for raw_line in handle)
        documents = [
            Document(page_content=record["text"], metadata={"title": record.get("title", "")})
            for record in records
            if "text" in record
        ]

    if documents:
        avg_length = sum(len(document.page_content) for document in documents) / len(documents)
    else:
        avg_length = 0
    print(f"Average length of documents: {avg_length:.3f}")
    return documents

# Preprocess the texts
def preprocess_text(text):
    """
    Clean a document's text before it is embedded.

    Steps, in order: drop ``$...$`` spans, URLs and HTML tags; transliterate
    Unicode to ASCII; drop whatever non-ASCII remains; keep only letters,
    whitespace and basic punctuation (digits are removed too); collapse runs
    of whitespace.

    The transliteration runs *before* the non-ASCII filter. In the previous
    order the filter replaced every non-ASCII character with a space first, so
    ``unidecode`` had nothing left to transliterate and accented letters were
    lost (e.g. "café" became "caf").

    :param text: raw text
    :return: cleaned text
    """
    # Remove math formulas written as $...$
    text = re.sub(r'\$.*?\$', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Transliterate Unicode to its closest ASCII form (must precede the filter below)
    text = unidecode(text)
    # Safety net: replace any non-ASCII characters still present with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove special characters and digits, keeping only letters and basic punctuation
    text = re.sub(r'[^a-zA-Z\s.,;:!?\'"-]', '', text)
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the corpus from the dataset path (file_path is set in an earlier cell)
documents = load_documents(file_path)

# Preprocess the contents; each document's page_content is overwritten in place
for doc in tqdm(documents, desc="Preprocessing documents"):
    doc.page_content = preprocess_text(doc.page_content)

# TODO: the documents are already short, so splitting them is not needed.
# NOTE(review): the disabled code below indexes documents like dicts
# (doc["page_content"]), whereas the loop above accesses doc.page_content on
# Document objects — it would need adapting before being re-enabled.
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# split_documents = []
# for doc in tqdm(documents, desc="Preprocessing documents"):
#     if len(doc["page_content"]) > 1500:
#         chunks = text_splitter.split_text(doc["page_content"])
#         split_documents.extend([{"page_content": chunk, "metadata": doc["metadata"]} for chunk in chunks])

Average length of documents: 267.998


Preprocessing documents: 100%|██████████| 5233329/5233329 [03:23<00:00, 25729.75it/s]


In [None]:
from tqdm import tqdm
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Run the embedding model on the device selected earlier in the notebook
# ("mps", "cuda:0" or "cpu") instead of hard-coding CUDA.
model_kwargs = {'device': device}
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs=model_kwargs)

# Number of documents embedded and stored per step; lower it if the device runs out of memory
batch_size = 512

def batch_embeddings(documents, embeddings, batch_size):
    """
    Embed documents batch by batch and return the vectors alongside their source.

    Kept for callers that need the raw vectors. The index build below does not
    use it, because holding the vectors of the whole corpus in one Python list
    is very memory-hungry.

    :param documents: sequence of objects exposing ``page_content`` and ``metadata``
    :param embeddings: object exposing ``embed_documents(list_of_texts)``
    :param batch_size: number of documents embedded per call
    :return: list of dicts with the keys "page_content", "metadata" and "embedding"
    """
    all_embeddings = []
    for i in tqdm(range(0, len(documents), batch_size), desc="Calculating embeddings in batches"):
        batch = documents[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch]
        batch_vectors = embeddings.embed_documents(batch_texts)
        all_embeddings.extend([{"page_content": doc.page_content, "metadata": doc.metadata, "embedding": emb}
                               for doc, emb in zip(batch, batch_vectors)])
    return all_embeddings

def build_vector_store(documents, embeddings, batch_size, persist_directory="chroma_db"):
    """
    Create a persistent Chroma store and fill it one batch at a time.

    Each batch is embedded by the store itself and written straight away, so
    only one batch of vectors is in memory at any moment.

    :param documents: sequence of Document objects to index
    :param embeddings: embedding function handed to Chroma
    :param batch_size: number of documents added per call
    :param persist_directory: directory in which Chroma persists the index
    :return: the populated Chroma store
    """
    store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    for i in tqdm(range(0, len(documents), batch_size), desc="Processing and storing batches"):
        store.add_documents(documents[i:i + batch_size])
    return store

# Build the Chroma vector store. The previous version embedded the whole corpus
# up front, then passed plain dicts to Chroma.from_documents, which expects
# Document objects and would have re-embedded everything anyway.
# NOTE(review): re-running this cell against an existing chroma_db directory
# presumably adds the documents again — confirm and clear the directory if so.
print("Creating Chroma database...")
db = build_vector_store(documents, embeddings, batch_size)

# Create retriever
retriever = db.as_retriever()

  db = Chroma(persist_directory="chroma_db", embedding_function=embeddings)


Initializing Chroma...


Processing and storing batches:   0%|          | 52/40886 [00:40<11:35:36,  1.02s/it]ERROR:chromadb.db.mixins.embeddings_queue:Exception occurred invoking consumer for subscription 656eb5cafbfa46a096eaaca36c8f4cbbto topic persistent://default/default/90f30593-7186-4a52-8526-a692ea3f192f 
Processing and storing batches:   0%|          | 67/40886 [00:50<8:00:27,  1.42it/s]

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" already places the quantized weights on the available device
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

# Generation settings shared by every model.generate call in the notebook
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config.repetition_penalty = 1.6  # Add repetition penalty

# The model is deliberately NOT moved with model.to(device): the weights were
# already placed by device_map="auto", and calling .to() on a bitsandbytes
# 4-bit model is rejected by some transformers/bitsandbytes versions.

In [None]:
from sentence_transformers import SentenceTransformer


# Load the sentence-transformers model used by embedder()
embedder_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder_model = SentenceTransformer(embedder_model_name)

def embedder(chunk):
    """Encode ``chunk`` with ``embedder_model`` as a tensor and return it as a NumPy array on the CPU."""
    return embedder_model.encode(chunk, convert_to_tensor=True).cpu().numpy()


# Retrieve the chunks matching a query and return their text
def search(query, retriever, k=10):
    """
    Query ``retriever`` and return the text of each hit.

    :param query: the search query
    :param retriever: object exposing ``invoke(query, k=...)`` whose results have a ``page_content`` attribute
    :param k: value forwarded to the retriever as ``k``
    :return: list with the ``page_content`` of every returned result, in order
    """
    hits = retriever.invoke(query, k=k)
    contents = []
    for hit in hits:
        contents.append(hit.page_content)
    return contents

## PROMPT AND ANSWER QUESTION FUNCTION

In [None]:
# Prompt template for the RAG model. It is filled with str.format using two
# placeholders: {user_query} (the question) and {chunks_information} (the
# retrieved chunks joined into one string).
base_prompt = """You are an AI assistant for RAG. Your task is to understand the user question, and provide an answer using the provided contexts.

Your answers are correct, high-quality, and written by a domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: {user_query}

Contexts:
{chunks_information}

Answer:
"""

In [None]:
import time

def answer_questions(questions):
    """
    Answer a set of questions with the RAG pipeline and score the answers.

    For every question the chunks returned by ``search`` are inserted into
    ``base_prompt`` and the prompt is passed to the model. The first
    standalone digit 1-4 in the generated text is taken as the answer. Every
    question, answer and raw generation is appended to a timestamped log file
    under ``quiz/runs_basemodel`` (created if missing).

    :param questions: sized collection of dicts with the keys 'question',
                      'correct' and 'question_id'
    :return: tuple ``(results, score)``; ``results`` maps each question_id to
             the extracted answer ("" when no digit 1-4 was found) and
             ``score`` is the percentage of correct answers (0.0 when
             ``questions`` is empty)
    """
    count = 0
    error = 0
    results = {}

    current_time = time.strftime("%m%d-%H%M%S")

    # Create the log directory up front; open(..., "a") does not create parent
    # directories and would otherwise fail on the first question.
    log_dir = "quiz/runs_basemodel"
    os.makedirs(log_dir, exist_ok=True)
    log_path = f"{log_dir}/quiz_answers_{current_time}.txt"

    pbar = tqdm(questions, total=len(questions), desc="Answering questions...", unit="question")
    for q in pbar:
        retrieved_chunks = search(q['question'], retriever, k=10)
        prompt = base_prompt.format(user_query=q['question'], chunks_information="\n".join(retrieved_chunks))
        encoding = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=encoding.input_ids,
                attention_mask=encoding.attention_mask,
                generation_config=generation_config,
                num_beams=5,  # Use beam search for better results
                early_stopping=True,  # Stop early if all beams finish
            )

        # Exclude the prompt tokens from the generated output
        generated_tokens = outputs[0][len(encoding.input_ids[0]):]
        generated_unpreprocessed_sequence = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        match = re.search(r'\b[1-4]\b', generated_unpreprocessed_sequence)
        answer = match.group(0) if match else ""  # first number found or empty string

        # Append after every question so partial runs still leave a log
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(f"Question: {q['question']}\nAnswer: {answer}\nCorrect answer:{q['correct']}\nGenerated unpreprocessed sequence: {generated_unpreprocessed_sequence}\n--------------------------------------------------------------------\n\n")

        results[q['question_id']] = answer

        # The regex yields either a single digit 1-4 or nothing, so an empty
        # answer is the only malformed case.
        if not answer:
            error += 1
        elif str(q['correct']) == answer:
            count += 1
        pbar.set_postfix(Corrects=f"{count}/{len(questions)}", Errors=error)

    print("-------------------------\tFINISHED RUN. Error count: ", error, "-------------------------")
    # Guard against dividing by zero when no questions were given
    score = count / len(questions) * 100 if len(questions) else 0.0
    return results, score

# Example questions
# questions = [
#     {"question": "", "correct": "1", "question_id": "1"},
#     # Add more questions as needed
# ]

# results, score = answer_questions(questions)
# print(f"Final score: {score}%")

## GRADIO

In [None]:
import gradio as gr

def query_rag_model(user_query):
    """
    Answer a single free-text query with the RAG pipeline.

    The chunks returned by ``search`` are joined with newlines, inserted into
    ``base_prompt`` together with the query, and passed to the model. Only the
    newly generated tokens (everything after the prompt) are decoded.

    :param user_query: the user's question
    :return: the decoded model output, stripped of surrounding whitespace
    """
    context_block = "\n".join(search(user_query, retriever, k=10))
    prompt = base_prompt.format(user_query=user_query, chunks_information=context_block)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)

    generate_kwargs = dict(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
        num_beams=5,
        early_stopping=True,
    )
    with torch.inference_mode():
        outputs = model.generate(**generate_kwargs)

    # Skip the prompt tokens that precede the completion in the output
    prompt_length = len(encoding.input_ids[0])
    completion_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(completion_tokens, skip_special_tokens=True).strip()

# Text-in / text-out Gradio interface around query_rag_model.
# The description previously mentioned a "PDF context", but the notebook
# indexes the HotpotQA corpus, so the user-facing text now says so.
iface = gr.Interface(
    fn=query_rag_model,
    inputs="text",
    outputs="text",
    title="RAG Model Query Interface",
    description="Ask questions to the RAG model and get answers based on passages retrieved from the HotpotQA corpus."
)

iface.launch()