In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch


In [None]:
# !pip install -r requirements.txt

In [None]:
# Never store a token in the notebook: reuse the one exported in the environment and
# only prompt (without echo) when it is missing.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

In [None]:
# Load Input_Data/test_subsampled.csv and preview its first rows.
# The next cell keeps only the 'question', 'context' and 'answer' columns.
df = pd.read_csv('Input_Data/test_subsampled.csv')
df.head()

In [None]:
# Keep the three columns used downstream, shuffle the rows reproducibly (fixed seed)
# and renumber them 0..n-1. The result is already a DataFrame, so no re-wrapping is needed.
df = (
    df[['question', 'context', 'answer']]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
import tiktoken
import ast
# Function to count tokens in a string using tiktoken
def count_tokens(text):
    """Return the number of tokens `text` encodes to under tiktoken's "cl100k_base" encoding."""
    token_ids = tiktoken.get_encoding("cl100k_base").encode(text)
    return len(token_ids)

# Function to safely parse context - handles both string and list formats
def parse_context(context):
    """
    Turn the string form of a Python literal back into the object it spells.

    Args:
        context: Either a string (possibly the repr of a list such as
            "[['Title', ['sentence 1', 'sentence 2']]]") or an already-parsed object.

    Returns:
        The evaluated literal when `context` is a string that ast.literal_eval accepts;
        the original string when it is not a valid literal; `context` itself,
        untouched, when it is not a string.
    """
    if not isinstance(context, str):
        # Already a list or other structure: nothing to parse
        return context

    try:
        return ast.literal_eval(context)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input (e.g. "{[1]: 2}"
        # raises TypeError). In every case the text is not a literal, so treat it
        # as a plain string instead of aborting the whole .apply().
        return context

# Function to count tokens for nested context structure
def count_context_tokens(context):
    """
    Sum the token counts of every string found in a context value.

    The value is first run through parse_context. A plain string is counted directly.
    In a list, each element is either a string (counted as is) or a list/tuple with at
    least two items, read as [title, text_snippets]: the title and the snippets (a
    string or a list of strings) are counted. Anything else contributes 0.
    """
    def _strings_in(parsed):
        # Yield, in order, every string the original nested loops would have counted
        if isinstance(parsed, str):
            yield parsed
            return
        if not isinstance(parsed, list):
            return
        for entry in parsed:
            if isinstance(entry, (list, tuple)) and len(entry) >= 2:
                heading, body = entry[0], entry[1]
                if isinstance(heading, str):
                    yield heading
                if isinstance(body, str):
                    yield body
                elif isinstance(body, list):
                    yield from (piece for piece in body if isinstance(piece, str))
            elif isinstance(entry, str):
                yield entry

    return sum(count_tokens(piece) for piece in _strings_in(parse_context(context)))


# Token count of each row's context, stored next to it
df['context_token_count'] = df['context'].map(count_context_tokens)

In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Find the maximum token count and show it (the value is not used by later cells)
max_tokens = df['context_token_count'].max()
max_tokens

In [None]:
# Function to flatten context for processing
def flatten_context(context):
    """
    Collapse a context value into one space-separated string.

    The value is first run through parse_context. A string is returned unchanged and a
    non-list, non-string value is returned as str(value). For a list, each element is
    either a string (kept as is) or a list/tuple with at least two items, read as
    [title, text_snippets]: the title and then the snippets (a string or a list of
    strings) are kept. Non-string pieces are skipped.
    """
    parsed = parse_context(context)

    if isinstance(parsed, str):
        return parsed
    if not isinstance(parsed, list):
        return str(parsed)

    pieces = []
    for entry in parsed:
        if isinstance(entry, str):
            pieces.append(entry)
            continue
        if not (isinstance(entry, (list, tuple)) and len(entry) >= 2):
            # Unexpected shape: ignored
            continue

        heading, body = entry[0], entry[1]
        if isinstance(heading, str):
            pieces.append(heading)
        if isinstance(body, str):
            pieces.append(body)
        elif isinstance(body, list):
            pieces.extend(piece for piece in body if isinstance(piece, str))

    return " ".join(pieces)

# One flat string per row, built from the (possibly nested) context
df['raw_context'] = df['context'].map(flatten_context)

In [None]:
# import matplotlib.pyplot as plt
# df.context_token_count.hist(bins=100)
# plt.show()

In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Function to create chunks with overlap while preserving sentence boundaries
def create_chunks(text, chunk_size=256, overlap=64):
    """
    Pack `text` into overlapping chunks of about `chunk_size` tokens.

    The text is first split into units on newlines. A line that by itself is longer
    than `chunk_size` tokens is split again after sentence-ending punctuation
    (., ! or ?), so text that contains no newlines is still chunked instead of coming
    back as a single oversized chunk. Lines that fit are handled exactly as before.

    Units are packed greedily. When the next unit would push the current chunk over
    `chunk_size`, the chunk is emitted and the next chunk starts with the trailing
    units of the previous one, as many as fit into `overlap` tokens.

    Args:
        text: String to chunk.
        chunk_size: Token budget per chunk, measured with count_tokens. A chunk can
            still exceed it when a single unit (plus the carried-over overlap) is
            larger than the budget.
        overlap: Maximum number of tokens repeated from the end of the previous chunk.

    Returns:
        All chunks joined with the '<c>' separator. Units inside a chunk are joined
        with a newline character.
    """
    import re  # local import: keeps this cell independent of the import cell

    # (unit_text, token_count) pairs. Counting once here avoids re-tokenising a unit
    # each time it is revisited while an overlap is being built.
    units = []
    for line in text.split('\n'):
        line_tokens = count_tokens(line)
        if line_tokens <= chunk_size:
            units.append((line, line_tokens))
            continue
        # Oversized line: fall back to sentence boundaries
        for piece in re.split(r'(?<=[.!?])\s+', line):
            if piece:
                units.append((piece, count_tokens(piece)))

    chunks = []
    current = []        # units of the chunk being assembled
    current_tokens = 0

    for unit, unit_tokens in units:
        # Adding this unit would overflow: emit the chunk and seed the next one
        if current and current_tokens + unit_tokens > chunk_size:
            chunks.append('\n'.join(piece for piece, _ in current))

            # Walk backwards over the emitted chunk, keeping whole units while they
            # still fit into the overlap budget.
            carried = []
            carried_tokens = 0
            for previous in reversed(current):
                if carried_tokens >= overlap or carried_tokens + previous[1] > overlap:
                    break
                carried.append(previous)
                carried_tokens += previous[1]
            carried.reverse()

            current = carried
            current_tokens = carried_tokens

        current.append((unit, unit_tokens))
        current_tokens += unit_tokens

    # Whatever is left forms the last chunk
    if current:
        chunks.append('\n'.join(piece for piece, _ in current))

    return '<c>'.join(chunks)

# Chunk every flattened context with the default chunk_size and overlap
df['chunks'] = df['raw_context'].map(create_chunks)

In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
from weaviate.classes.config import Property, DataType
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import weaviate

# Sentence-transformers model used for both indexing and querying
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def embed_text(text: str) -> list[float]:
    """Encode `text` with the module-level `embedder` and return the vector as a plain list."""
    embedding = embedder.encode(text)
    return embedding.tolist()

# Connection details come from the environment so that no URL or API key is stored in
# the notebook. Export WEAVIATE_URL and WEAVIATE_API_KEY before starting the kernel;
# a missing variable fails here with a KeyError naming it.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_URL"],
    auth_credentials=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
)

# Create the collection on first use only; vectors are supplied by this notebook
class_name = "VectorRAG"
if not client.collections.exists(class_name):
    client.collections.create(
        name=class_name,
        vectorizer_config=None,  # no auto embedding
        properties=[
            Property(name="content", data_type=DataType.TEXT),
        ],
    )

collection = client.collections.get(class_name)

with collection.batch.dynamic() as batch:
    for chunks_str in tqdm(df["chunks"], desc="Uploading chunks"):
        # Each row holds its chunks joined by '<c>'; drop blank ones
        chunks = [c.strip() for c in str(chunks_str).split("<c>") if c.strip()]
        if not chunks:
            continue

        # Embed all chunks of the row in one encode() call instead of one call per chunk
        vectors = embedder.encode(chunks)
        for chunk, vector in zip(chunks, vectors):
            batch.add_object(
                properties={"content": chunk},
                vector=vector.tolist()
            )

# Objects rejected during the batch are collected rather than raised; surface them so
# an incomplete index does not go unnoticed.
failed_uploads = collection.batch.failed_objects
if failed_uploads:
    print(f"WARNING: {len(failed_uploads)} chunk(s) failed to upload; first error: {failed_uploads[0]}")


In [None]:
def get_top_chunks(question: str, top_k: int = 3):
    """
    Retrieve the stored chunks nearest to `question` and return them as one string.

    The question is embedded with embed_text, the module-level `collection` is queried
    by vector for at most `top_k` objects, and their "content" properties are joined
    with single spaces in the order returned.
    """
    hits = collection.query.near_vector(
        near_vector=embed_text(question),
        limit=top_k,
        return_properties=["content"],
    ).objects
    return " ".join(hit.properties["content"] for hit in hits)


In [None]:
# Register tqdm's progress_* methods on pandas objects, then retrieve context for every
# question. get_top_chunks embeds the question and runs one near-vector query per row.
tqdm.pandas()
df["retrieved_context"] = df["question"].progress_apply(get_top_chunks)


In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
import os
import warnings

# CUDA_VISIBLE_DEVICES is read when CUDA is first initialised in the process. If an
# earlier cell has already touched the GPU (for example by loading a model onto it),
# changing the variable here is silently ignored and 'cuda:0' keeps pointing at the
# original device, so say so instead of failing quietly.
if torch.cuda.is_initialized():
    warnings.warn(
        "CUDA is already initialised; setting CUDA_VISIBLE_DEVICES now has no effect. "
        "Restart the kernel and set it before any GPU work."
    )
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


In [None]:
import pandas as pd
# NOTE: this rebinds `df` to the contents of Triplet_Retrieval_Output.csv; the frame
# built by the cells above is no longer reachable through `df` after this point.
df = pd.read_csv('Triplet_Retrieval_Output.csv')
df.head()

In [None]:
# Combine passage text and triplets into the context given to the models.
# - A newline separates the two parts; plain '+' glued the last word of the text to
#   the first triplet.
# - Missing values become '' first; otherwise NaN in either column makes the whole
#   concatenation NaN.
df['retrieved_context'] = (
    df['retrieved_context_text'].fillna('').astype(str)
    + '\n'
    + df['retrieved_triplets'].fillna('').astype(str)
)
df.head()

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
device = 'cuda:0'
# Only the model is placed on the GPU. A tokenizer holds no weights, so device_map is
# not a tokenizer option and is no longer passed to it.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
llm = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",device_map = device)

def answer_generator_single_hop(query, context, max_new_tokens=128):
    """
    Answer `query` from `context` with the module-level causal LM (`llm`, `tokenizer`, `device`).

    Args:
        query: User question.
        context: Text the answer must be based on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first line of the generated continuation, with single and double quote
        characters removed.
    """
    prompt = (
        "You are an expert at answering the question just based on the context. "
        "Given the context, answer the user question. If you cannot answer the question based on context, "
        "state properly that you cannot answer the question.\n\n"
        f"Context:\n{context}\n\nUser question: {query}\nAnswer:"
    )

    # Tokenize and generate, passing attention_mask for reliable results
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output_ids = llm.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        # Without an explicit budget generate() falls back to the generation config's
        # max_length (library default 20, prompt included), far less than this prompt.
        max_new_tokens=max_new_tokens,
    )

    # Decode only the newly generated tokens. Splitting the full decode on the last
    # "Answer:" picks up the wrong text as soon as the model writes another
    # "Answer:" of its own further down.
    generated_ids = output_ids[0][input_ids.shape[1]:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True)

    answer = completion.strip().split("\n")[0]
    return answer.replace('"', '').replace("'", "")

In [None]:

# One generated answer per row, from that row's question and retrieved context
df['predicted_answer_mistral'] = [
    answer_generator_single_hop(question, retrieved)
    for question, retrieved in zip(df['question'], df['retrieved_context'])
]


In [None]:

# Delete model and tokenizer
del llm
del tokenizer

# Run the garbage collector first so objects kept alive only by reference cycles are
# actually released...
gc.collect()

# ...then return the now-unused cached blocks to the GPU. Calling empty_cache() before
# the collection leaves memory freed by it sitting in PyTorch's cache.
torch.cuda.empty_cache()


In [None]:
# Save the frame, now including 'predicted_answer_mistral', before the next model is loaded
df.to_csv('Hybrid_RAG_output_mistral.csv',index = False)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
device = 'cuda:0'
# Only the model is placed on the GPU. A tokenizer holds no weights, so device_map is
# not a tokenizer option and is no longer passed to it.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
llm = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B",device_map=device)
def answer_generator_single_hop(query, context, max_new_tokens=128):
    """
    Answer `query` from `context` with the module-level causal LM (`llm`, `tokenizer`, `device`).

    Args:
        query: User question.
        context: Text the answer must be based on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first line of the generated continuation, with single and double quote
        characters removed.
    """
    prompt = (
        "You are an expert at answering the question just based on the context. "
        "Given the context, answer the user question. If you cannot answer the question based on context, "
        "state properly that you cannot answer the question.\n\n"
        f"Context:\n{context}\n\nUser question: {query}\nAnswer:"
    )

    # Tokenize and generate, passing attention_mask for reliable results
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output_ids = llm.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        # Without an explicit budget generate() falls back to the generation config's
        # max_length (library default 20, prompt included), far less than this prompt.
        max_new_tokens=max_new_tokens,
    )

    # Decode only the newly generated tokens. Splitting the full decode on the last
    # "Answer:" picks up the wrong text as soon as the model writes another
    # "Answer:" of its own further down.
    generated_ids = output_ids[0][input_ids.shape[1]:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True)

    answer = completion.strip().split("\n")[0]
    return answer.replace('"', '').replace("'", "")

In [None]:

# One generated answer per row, from that row's question and retrieved context
df['predicted_answer_LLama'] = [
    answer_generator_single_hop(question, retrieved)
    for question, retrieved in zip(df['question'], df['retrieved_context'])
]


In [None]:
# to_csv does not create missing directories, so make sure the target exists first.
# NOTE(review): the directory is spelled 'Ouput_Data' (cf. 'Input_Data') - confirm this is intended.
os.makedirs('Ouput_Data', exist_ok=True)
df.to_csv('Ouput_Data/Hybrid_RAG_output_mistral-llama.csv',index = False)

In [None]:

# Delete model and tokenizer
del llm
del tokenizer

# Run the garbage collector first so objects kept alive only by reference cycles are
# actually released...
gc.collect()

# ...then return the now-unused cached blocks to the GPU. Calling empty_cache() before
# the collection leaves memory freed by it sitting in PyTorch's cache.
torch.cuda.empty_cache()


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
device = 'cuda:0'
# Load model and tokenizer. Only the model is moved to the GPU; a tokenizer holds no
# weights, so device_map is not a tokenizer option and is no longer passed to it.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl").to(device)


def answer_generator_single_hop(query, context, max_new_tokens=128):
    """
    Generate an answer using a seq2seq model (e.g., FLAN-T5) based on the given context and question.

    The prompt ends with the question and the "Answer:" cue, and the tokenizer by
    default truncates from the right. An over-long context is therefore shortened
    before the prompt is assembled; otherwise the question itself would be cut off.

    Args:
        query: Question text.
        context: Retrieved context; converted with str().
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model output, stripped, with single and double quote characters removed.
    """
    instruction = (
        "You are an expert at answering the question based on the context. "
        "If the context does not contain the answer, say so clearly.\n\n"
    )
    question_block = f"\n\nQuestion: {query}\nAnswer:"
    context = str(context)

    # Token budget left for the context once the fixed parts of the prompt are counted.
    # The small margin absorbs tokenisation differences where the pieces are joined.
    fixed_len = len(tokenizer(f"{instruction}Context:\n{question_block}").input_ids)
    budget = max(tokenizer.model_max_length - fixed_len - 8, 0)
    context_ids = tokenizer(context, add_special_tokens=False).input_ids
    if len(context_ids) > budget:
        context = tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

    # Same prompt layout as before; only the context may have been shortened
    prompt = f"{instruction}Context:\n{context}{question_block}"

    # truncation=True stays as a safety net for a question that alone exceeds the limit
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

    # Generate output (T5 generates sequence-to-sequence output)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # num_beams=4,            # optional: better quality
        early_stopping=True
    )

    # Decode and clean up
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    answer = output_text.strip().replace('"', '').replace("'", "")

    return answer


In [None]:

# One generated answer per row, from that row's question and retrieved context
df['predicted_answer_FlanT5'] = [
    answer_generator_single_hop(question, retrieved)
    for question, retrieved in zip(df['question'], df['retrieved_context'])
]


In [None]:
# to_csv does not create missing directories, so make sure the target exists first.
# NOTE(review): the directory is spelled 'Ouput_Data' (cf. 'Input_Data') - confirm this is intended.
os.makedirs('Ouput_Data', exist_ok=True)
df.to_csv('Ouput_Data/Hybrid_RAG_output_mistral-llama_FlanT5.csv',index = False)

In [None]:

# Delete model and tokenizer. The FLAN-T5 weights are bound to `model`; `llm` was
# already deleted after the Llama run, so `del llm` here raised NameError.
del model
del tokenizer

# Run the garbage collector first so objects kept alive only by reference cycles are
# actually released...
gc.collect()

# ...then return the now-unused cached blocks to the GPU. Calling empty_cache() before
# the collection leaves memory freed by it sitting in PyTorch's cache.
torch.cuda.empty_cache()


## Triplet insertion

In [None]:
import pandas as pd
# Load Refined_triplets_final_HybridRAG.csv into `d` (kept separate from `df`) and
# preview its first rows.
d = pd.read_csv('Refined_triplets_final_HybridRAG.csv')
d.head()

In [None]:
# Triplets extracted for the row labelled 0
d.loc[0, 'extracted_triplets_microsoft_Phi-4-mini-instruct']