In [7]:
import os
# Restrict CUDA to the GPU with index 4.
# NOTE(review): presumably this must run before CUDA is initialised to have any
# effect - run this cell first on a fresh kernel.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})


In [1]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import json
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import sent_tokenize
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, StoppingCriteria, StoppingCriteriaList

# Set up the NLTK helpers used by this notebook.
# NOTE(review): WordNetLemmatizer presumably needs the 'wordnet' corpus at
# lemmatize() time; only 'punkt' is downloaded here - confirm before using it.
lemmatizer = WordNetLemmatizer()
# Fetch the Punkt tokenizer data (a no-op download if it is already present).
nltk.download('punkt')

  from tqdm.autonotebook import tqdm, trange
2025-02-24 21:15:31.903862: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-24 21:15:31.921036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740431731.941913 1284124 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740431731.949675 1284124 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 21:15:31.973288: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow b

In [2]:
# Define prompts

# System-level instructions describing the assistant's role and answer format.
system_prompt = """
You are a knowledgeable and helpful assistant. The user has asked a question on Stack Overflow. 
Use the provided context to craft an accurate, concise, and highly relevant response. 
Present your answer in a clear and well-structured paragraph format, avoiding the use of bullet points or lists.
DO NOT GENERATE INCOMPLETE CODE AND EXCESSIVE CODE TO DISTRACT PEOPLE!
"""

# User-message template; fill with str.format(question=..., context=...).
prompt_template = """
### QUESTION:
{question}

### CONTEXT:
{context}

Please provide your best answer below:

"""

In [None]:
# Input locations. The commented-out path is the alternative "unseen" test set.
# baseline_path = "complete_testing_data/SAMPLE_combined_unseen_data.csv"
baseline_path = 'Sythetic_old_question.csv'
embeddings_path = "../AnswerEmbedding/all_embeddings.npy"
sentences_path = "../AnswerEmbedding/all_sentences.npy"

# Load baseline data and keep only the rows that have an accepted answer body.
baseline = pd.read_csv(baseline_path)
testingset = baseline.dropna(subset=['Accepted Answer Body'])
# testingset = baseline.dropna(subset=['Accepted Answer'])

# Load the pre-computed embeddings and the sentences they were computed from.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code - only load .npy files that come from a trusted source.
sentence_embedding = np.load(embeddings_path, allow_pickle=True)
sentence_context = np.load(sentences_path, allow_pickle=True)

# FAISS only accepts C-contiguous float32 matrices. This is a no-op when the
# array already has that layout, and a one-off conversion otherwise.
sentence_embedding = np.ascontiguousarray(sentence_embedding, dtype=np.float32)

# Initialize the encoder model, on GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

# Build an exact inner-product index over all sentence embeddings.
# NOTE(review): inner product equals cosine similarity only if the stored
# embeddings are L2-normalised - confirm how all_embeddings.npy was produced.
dim = sentence_embedding.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(sentence_embedding)

# The list of testing questions is prepared inside process_rag_with_threshold (from the 'Title' column).


In [4]:
# Define functions
class EndOfAnswerCriteria(StoppingCriteria):
    """Stopping criterion that fires once a marker string appears in the decoded text.

    Only the first sequence (``input_ids[0]``) is inspected, and the whole
    sequence is decoded on every call.
    NOTE(review): if ``input_ids`` also carries the prompt tokens, a marker that
    occurs in the prompt would stop generation immediately - confirm.
    """

    def __init__(self, stop_string: str, tokenizer):
        # tokenizer: object exposing decode(ids, skip_special_tokens=...).
        self.tokenizer = tokenizer
        # stop_string: marker text whose presence ends generation.
        self.stop_string = stop_string

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the marker is found in the decoded first sequence."""
        first_sequence = input_ids[0]
        text_so_far = self.tokenizer.decode(first_sequence, skip_special_tokens=True)
        marker_found = self.stop_string in text_so_far
        return marker_found

def compose_prompt(question, top_results):
    """Build the (system prompt, user prompt) pair for one question.

    Args:
        question: The question text inserted into the template.
        top_results: Iterable of retrieval hits; the sentence is read from
            position 1 of each hit (hits are built as (idx, sentence, score)).

    Returns:
        Tuple ``(system_prompt, final_prompt)`` where ``final_prompt`` is
        ``prompt_template`` filled with the question and the hit sentences
        joined by newlines.
    """
    # A hit needs at least two elements for hit[1] to exist; the previous
    # `len(i) > 0` guard let 1-element hits through and raised IndexError.
    sentences_only = [hit[1] for hit in top_results if len(hit) > 1]
    context_str = "\n".join(sentences_only)
    final_prompt = prompt_template.format(question=question, context=context_str)
    return system_prompt, final_prompt

# Loaded (tokenizer, text-generation pipeline) pairs keyed by model id, so that
# repeated calls (e.g. a threshold sweep) do not reload the checkpoint each time.
_TEXT_GENERATOR_CACHE = {}


def _retrieve_contexts(questions, threshold, max_results):
    """Return one {"question", "results"} dict per question, keeping hits scoring >= threshold."""
    # `index` is an IndexFlatIP, whose hits come back sorted by decreasing
    # score, so the best `max_results` hits already contain everything the
    # threshold filter plus slice below can keep. Asking for index.ntotal
    # neighbours per query (as before) ranked the whole corpus for nothing.
    if max_results is None or max_results < 0:
        k = index.ntotal  # keep plain slice semantics for unusual limits
    else:
        k = max(1, min(max_results, index.ntotal))

    retrieved = []
    for question_title in tqdm(questions):
        query_embedding = encoder_model.encode(question_title, convert_to_tensor=True).cpu().numpy()
        distances, indices = index.search(np.array([query_embedding]), k)

        # idx is -1 when FAISS has fewer than k vectors to return.
        filtered_results = [
            (idx, sentence_context[idx], dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx >= 0 and dist >= threshold
        ][:max_results]

        retrieved.append({"question": question_title, "results": filtered_results})
    return retrieved


def _load_text_generator(model_id):
    """Return the cached (tokenizer, text-generation pipeline) for model_id, building it on first use."""
    if model_id not in _TEXT_GENERATOR_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': '<PAD>'})
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        # Keep the embedding matrix in sync with a possibly extended vocabulary.
        model.resize_token_embeddings(len(tokenizer))

        text_generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map="auto",
            max_new_tokens=512,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.pad_token_id
        )
        _TEXT_GENERATOR_CACHE[model_id] = (tokenizer, text_generator)
    return _TEXT_GENERATOR_CACHE[model_id]


def process_rag_with_threshold(threshold, max_results=10):
    """Run retrieval-augmented generation over the test set for one similarity threshold.

    For every 'Title' in a copy of the module-level ``testingset`` this
    retrieves up to ``max_results`` sentences whose index score is at least
    ``threshold``, builds a prompt with ``compose_prompt`` and generates a
    response with the Llama text-generation pipeline.

    Args:
        threshold: Minimum index score a retrieved sentence must reach.
        max_results: Maximum number of retrieved sentences per question.

    Returns:
        A copy of ``testingset`` with 'Step2PROMPT' and 'Step2Response' columns
        added and any 'Generated Response' column dropped.
    """
    local_testingset = testingset.copy()
    # Use the 'Paraphrased Question' column here instead to query with paraphrases.
    testing_question = local_testingset['Title'].to_list()

    # Search and filter results based on the threshold
    retrieved = _retrieve_contexts(testing_question, threshold, max_results)

    # Create prompts. compose_prompt also returns the system prompt.
    # NOTE(review): the system prompt is not passed to the model - confirm intended.
    local_testingset['Step2PROMPT'] = [
        compose_prompt(entry["question"], entry["results"])[1] for entry in retrieved
    ]

    # Llama model for response generation (loaded once, then reused across calls).
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    tokenizer, text_generator = _load_text_generator(model_id)

    stop_token = "END_OF_ANSWER"
    stopping_criteria = StoppingCriteriaList([EndOfAnswerCriteria(stop_token, tokenizer)])

    response_list = []
    for user_prompt in tqdm(local_testingset["Step2PROMPT"].to_list()):
        outputs = text_generator(user_prompt, stopping_criteria=stopping_criteria)
        result = outputs[0]["generated_text"]
        # Drop the stop marker and anything generated after it.
        if stop_token in result:
            result = result.split(stop_token)[0].strip()
        response_list.append(result)

    # Clean responses: keep only the text after the first '[/INST]' when present.
    # NOTE(review): if "generated_text" includes the prompt and no '[/INST]'
    # marker, the prompt stays in 'Step2Response' - confirm this is wanted.
    local_testingset['Step2Response'] = [
        (response.split('[/INST]', 1)[1] if '[/INST]' in response else response).strip()
        for response in response_list
    ]
    return local_testingset.drop(columns=['Generated Response'], errors='ignore')




In [None]:
# Example usage: sweep the similarity threshold and save one CSV per setting.
threhsold_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# path_template = "unseen_testing/RAGv1_threshold_{}.csv"
path_template = "sythetic_testing/QB1_threshold_{}.csv"

# Create the output folder before generating anything, so a missing directory
# cannot make to_csv fail after a run has already produced its responses.
output_dir = os.path.dirname(path_template)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for tl in threhsold_list:
    results = process_rag_with_threshold(tl)
    path = path_template.format(tl)
    results.to_csv(path, index=False)
    print("finish ",tl)

100%|██████████| 385/385 [5:49:31<00:00, 54.47s/it]  


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  3%|▎         | 10/385 [08:48<5:05:41, 48.91s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 385/385 [5:53:10<00:00, 55.04s/it]  

finish  0.5



