In [5]:
import os

# Restrict the CUDA devices visible to this process to device index 7.
# This runs in the first cell, i.e. before the cell that imports torch.
# NOTE(review): the index is machine-specific; adjust it for other hosts.
os.environ.update({"CUDA_VISIBLE_DEVICES": "7"})

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import json
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import sent_tokenize
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, StoppingCriteria, StoppingCriteriaList
import os
import _pickle as cPickle
import pickle
# Initialize NLP tools
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
from huggingface_hub import login
# Authenticate against the Hugging Face Hub without embedding a secret in the
# notebook: the token is read from the HF_TOKEN environment variable. When the
# variable is unset, login() is skipped so that credentials cached by a previous
# `huggingface-cli login` (if any) are used instead of sending an empty token.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
# Define prompts
# system_prompt: instructions describing the assistant's role and answer style.
# NOTE(review): "GENREATE" below is a typo for "GENERATE"; it is left untouched
# here because the string is runtime text.
# NOTE(review): in the cells visible in this notebook, compose_prompt() returns
# this string but only the user prompt is stored in 'Step2PROMPT_v2' and passed
# to the generator, so this text does not appear to reach the model — confirm
# whether that is intended.
system_prompt = """
You are a knowledgeable and helpful assistant. The user has asked a question on Stack Overflow. 
Use the provided context to craft an accurate, concise, and highly relevant response. 
Present your answer in a clear and well-structured paragraph format, avoiding the use of bullet points or lists.
DO NOT GENREATE INCOMPLETE CODE AND EXCESSIVE CODE TO DISTRACT PEOPLE!
"""

# prompt_template: user-message template filled via str.format with two
# placeholders: {question} (the query text) and {context} (the retrieved
# sentences joined by newlines in compose_prompt).
prompt_template = """
### QUESTION:
{question}

### CONTEXT:
{context}

Please provide your best answer below:

"""

  from tqdm.autonotebook import tqdm, trange
2025-02-26 01:13:56.798396: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-26 01:13:56.820897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740532436.848742 1757126 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740532436.857048 1757126 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 01:13:56.883550: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow b

In [2]:
ovo_path = "../CODE_POST_OVERALL-EMBEDDINGS_DATA_V3.pkl"
device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. Load OVO_data once
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# point ovo_path at files produced by a trusted source.
with open(ovo_path, 'rb') as f:
    OVO_data = pickle.load(f)


# 2. Extract raw questions, question embeddings, and answer_sentences_list once
raw_questions = [item['raw_question'] for item in OVO_data]
question_embeddings = [item['question_embedding'] for item in OVO_data]
answer_sentences_list = [item['answer_sentences'] for item in OVO_data]

# 3. Build the embeddings matrix once
# The matrix is moved to the same device as the encoder so that the similarity
# computation in find_similar never mixes CPU and GPU tensors, regardless of
# the device the embeddings were on when they were pickled.
embeddings_matrix = torch.stack(question_embeddings).to(device)

# 4. Initialize SentenceTransformer once

encoder_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

print("Initialization complete.")

  return torch.load(io.BytesIO(b))


Initialization complete.


In [3]:


# Define functions
class EndOfAnswerCriteria(StoppingCriteria):
    """Stopping criterion that fires once a marker string shows up in the decoded text.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    NOTE(review): ``input_ids`` presumably holds the prompt tokens as well as the
    generated ones, in which case a marker inside the prompt would also match —
    confirm against the generation loop.
    """

    def __init__(self, stop_string: str, tokenizer):
        # Tokenizer used to turn token ids back into text for the substring check.
        self.tokenizer = tokenizer
        # Marker whose presence in the decoded text requests a stop.
        self.stop_string = stop_string

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the marker occurs in the decoded first sequence."""
        first_sequence = input_ids[0]
        text_so_far = self.tokenizer.decode(first_sequence, skip_special_tokens=True)
        marker_found = self.stop_string in text_so_far
        return marker_found

def compose_prompt(question, relevant_sentences, system_prompt, prompt_template):
    """Fill the user-prompt template with a question and its retrieved context.

    Args:
        question: Text substituted for the ``{question}`` placeholder.
        relevant_sentences: Iterable of dicts; the value under the ``"sentence"``
            key of each one becomes a line of the context.
        system_prompt: Returned unchanged as the first element of the result.
        prompt_template: Format string with ``{question}`` and ``{context}``
            placeholders.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple.
    """
    context_lines = []
    for entry in relevant_sentences:
        context_lines.append(entry["sentence"])
    # One retrieved sentence per line; an empty input yields an empty context.
    user_prompt = prompt_template.format(
        context="\n".join(context_lines),
        question=question,
    )
    return system_prompt, user_prompt

def find_similar(original_context, top_k, model, embeddings_matrix, question_embeddings, raw_questions, raw_accepted_answers, answer_sentences_list, threshold):
    """Retrieve answer sentences related to a query from its most similar stored questions.

    The query is embedded with ``model``, compared (cosine similarity) against
    every row of ``embeddings_matrix``, and the ``top_k`` best-matching questions
    are selected. Every answer sentence attached to those questions is then
    embedded and kept if its cosine similarity to the query is at least
    ``threshold``.

    Args:
        original_context: Query text to embed.
        top_k: Number of best-matching stored questions to inspect.
        model: Encoder exposing ``encode(text_or_list, convert_to_tensor=True)``
            and returning a tensor (one row per input when given a list).
        embeddings_matrix: 2-D tensor with one question embedding per row,
            aligned by index with ``raw_questions`` and ``answer_sentences_list``.
        question_embeddings: Unused; kept for call-site compatibility.
        raw_questions: Question texts, indexed like ``embeddings_matrix`` rows.
        raw_accepted_answers: Unused; kept for call-site compatibility.
        answer_sentences_list: For each question, the sequence of its answer
            sentences.
        threshold: Minimum query-to-sentence cosine similarity for a sentence
            to be returned.

    Returns:
        dict with
        ``"top_similar_questions"``: texts of the selected questions, best first;
        ``"sorted_sentences_info"``: list of dicts (``sentence``, ``similarity``,
        ``question``, ``question_similarity``) sorted by ``similarity``
        descending.
    """
    def _unit_rows(tensor):
        # Promote a single vector to a one-row matrix and L2-normalise each row,
        # so that a matrix product of two such results is a cosine-similarity table.
        if tensor.dim() == 1:
            tensor = tensor.unsqueeze(0)
        return torch.nn.functional.normalize(tensor, p=2, dim=1)

    query_unit = _unit_rows(model.encode(original_context, convert_to_tensor=True))

    # flatten() (not squeeze()) keeps a 1-D result even when the matrix has a
    # single row; squeeze() would collapse it to a scalar and break len() below.
    cos_scores = torch.mm(query_unit, _unit_rows(embeddings_matrix).transpose(0, 1)).flatten().tolist()
    top_indices = sorted(range(len(cos_scores)), key=lambda idx: cos_scores[idx], reverse=True)[:top_k]

    all_sentences_info = []
    for idx in top_indices:
        sentences = list(answer_sentences_list[idx])
        if len(sentences) == 0:
            # Nothing to score for this question; also avoids encoding an empty batch.
            continue
        question_similarity = cos_scores[idx]
        # Encode all sentences of this question in one batched call instead of
        # one encoder call per sentence. Batched padding may change similarities
        # in the last floating-point digits compared with one-by-one encoding.
        sentence_unit = _unit_rows(model.encode(sentences, convert_to_tensor=True))
        similarities = torch.mm(query_unit, sentence_unit.transpose(0, 1)).flatten().tolist()
        for sentence, similarity in zip(sentences, similarities):
            if similarity >= threshold:
                all_sentences_info.append({
                    "sentence": sentence,
                    "similarity": similarity,
                    "question": raw_questions[idx],
                    "question_similarity": question_similarity
                })

    sorted_sentences_info = sorted(all_sentences_info, key=lambda x: x["similarity"], reverse=True)
    return {
        "top_similar_questions": [raw_questions[idx] for idx in top_indices],
        "sorted_sentences_info": sorted_sentences_info
    }

def process_rag_with_threshold_v2(threshold, baseline_path="complete_testing_data/SAMPLE_combined_unseen_data.csv", max_results=10, question_column="Title"):
    """Build retrieval-augmented prompts for every question in a test CSV.

    Rows without an 'Accepted Answer Body' are dropped. For each remaining row
    the text in ``question_column`` is used as the query for ``find_similar``;
    the retrieved sentences are inserted into ``prompt_template`` via
    ``compose_prompt``.

    Relies on the notebook-level objects ``encoder_model``, ``embeddings_matrix``,
    ``question_embeddings``, ``raw_questions``, ``answer_sentences_list``,
    ``system_prompt`` and ``prompt_template``.

    Args:
        threshold: Minimum query-to-sentence similarity forwarded to
            ``find_similar``.
        baseline_path: CSV file containing at least the 'Accepted Answer Body'
            column and ``question_column``.
        max_results: Number of similar stored questions to inspect per query.
        question_column: Column holding the query text. Defaults to 'Title';
            pass e.g. 'Paraphrased Question' to evaluate another column.

    Returns:
        A DataFrame (independent copy of the filtered CSV rows) with two added
        columns: 'Step2PROMPT_v2' (the user prompt) and 'IFUnseen' (1 when at
        least one sentence passed the threshold, 0 when none did).
    """
    baseline = pd.read_csv(baseline_path)
    # .copy() makes the filtered frame independent of `baseline`, so adding the
    # columns below does not trigger pandas' SettingWithCopyWarning.
    testingset = baseline.dropna(subset=['Accepted Answer Body']).copy()

    has_context_flags = []
    prompt_list = []
    for question_title in tqdm(testingset[question_column].to_list()):
        output = find_similar(question_title, max_results, encoder_model, embeddings_matrix, question_embeddings, raw_questions, None, answer_sentences_list, threshold)
        relevant_sentences = output["sorted_sentences_info"]

        has_context_flags.append(1 if relevant_sentences else 0)
        # An empty sentence list simply yields an empty context section.
        # NOTE(review): the system prompt returned here is discarded; only the
        # user prompt is stored and later sent to the generator.
        _, user_msg = compose_prompt(question_title, relevant_sentences, system_prompt, prompt_template)
        prompt_list.append(user_msg)

    testingset['Step2PROMPT_v2'] = prompt_list
    testingset['IFUnseen'] = has_context_flags
    return testingset

def run_llama_model(testingset, model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"):
    """Generate one answer per prompt in ``testingset['Step2PROMPT_v2']``.

    Loads the tokenizer and causal LM identified by ``model_id``, runs sampled
    text generation for every prompt, and stores the cleaned completions in a
    new 'Step2Response_v2' column.

    Args:
        testingset: DataFrame with a 'Step2PROMPT_v2' column of prompt strings.
            It is modified in place (a column is added).
        model_id: Hugging Face model identifier to load.

    Returns:
        The same ``testingset`` object with the 'Step2Response_v2' column added.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    pad_token_added = tokenizer.pad_token is None
    if pad_token_added:
        tokenizer.add_special_tokens({'pad_token': '<PAD>'})
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    if pad_token_added:
        # The embedding table only has to change when the vocabulary grew;
        # resizing unconditionally could alter a table that is already adequate.
        model.resize_token_embeddings(len(tokenizer))

    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.pad_token_id
    )

    stop_token = "END_OF_ANSWER"
    stopping_criteria = StoppingCriteriaList([EndOfAnswerCriteria(stop_token, tokenizer)])

    response_list = []
    for user_prompt in tqdm(testingset["Step2PROMPT_v2"].to_list()):
        # return_full_text=False makes the pipeline return only the newly
        # generated text. By default the prompt is echoed in front of the
        # completion, and the '[/INST]' split below never removed it because
        # these raw prompts contain no such marker.
        outputs = text_generator(
            user_prompt,
            stopping_criteria=stopping_criteria,
            return_full_text=False,
        )
        result = outputs[0]["generated_text"]
        if stop_token in result:
            # Keep only the text before the end-of-answer marker.
            result = result.split(stop_token)[0].strip()
        if '[/INST]' in result:
            # Drop anything up to an instruction-closing tag, should the model emit one.
            result = result.split('[/INST]', 1)[1]
        response_list.append(result.strip())

    testingset['Step2Response_v2'] = response_list
    return testingset




In [None]:
# Example usage: for each similarity threshold, build the RAG prompts, generate
# answers, and write the resulting table to its own CSV file.
threhsold_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for tl in threhsold_list:
    results = process_rag_with_threshold_v2(tl)
    results = run_llama_model(results)
    path = "sythetic_testing/QB3_threshold_{}.csv".format(tl)
    # Make sure the target directory exists so a long generation run is not
    # lost to a missing-directory error at write time.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    results.to_csv(path, index=False)



  0%|          | 0/385 [00:00<?, ?it/s]

100%|██████████| 385/385 [36:30<00:00,  5.69s/it]


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/385 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  3%|▎         | 10/385 [06:37<4:16:28, 41.04s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 385/385 [3:58:51<00:00, 37.22s/it]  
