In [None]:
!pip install -q -U transformers 
!pip install -q -U peft
!pip install -q -U bitsandbytes
!pip install accelerate datasets

In [2]:
from datasets import load_dataset
from transformers import  AutoModelForCausalLM, AutoTokenizer,pipeline,GPT2LMHeadModel,GPT2Tokenizer
import torch

from peft import LoraConfig,prepare_model_for_kbit_training,get_peft_model


In [3]:
def generate_prompt(dta):
    """Render one question/answer record as a chat-formatted string.

    Args:
        dta: Mapping that provides the keys ``"Question"`` and ``"Answer"``.

    Returns:
        str: The record laid out as a ``user`` turn followed by an ``assistant``
        turn, each delimited by ``<|im_start|>`` / ``<|im_end|>``. The result
        starts with a newline plus four spaces and ends with two blank lines plus
        four spaces; that surrounding whitespace is part of the returned value.
    """
    question = dta["Question"]
    answer = dta["Answer"]
    return (
        "\n"
        f"    <|im_start|>user\n{question}<|im_end|>\n"
        f"<|im_start|>assistant\n{answer}<|im_end|>\n"
        "\n\n    "
    )

In [None]:
!huggingface-cli login --token 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 --add-to-git-credential


In [None]:
# Identifier handed to `from_pretrained` for the tokenizer and the fine-tuned model.
# NOTE(review): left as an empty placeholder here; it must be set before loading.
model_id_colorist_final = ""

def formatted_prompt(question) -> str:
    """Wrap a question in the chat template, leaving the assistant turn open.

    Args:
        question: Text (or any value formattable as text) placed in the user turn.

    Returns:
        str: A single leading space, the ``user`` turn delimited by
        ``<|im_start|>`` / ``<|im_end|>``, then the opening of the ``assistant``
        turn with nothing after it.
    """
    user_turn = f" <|im_start|>user\n{question}<|im_end|>\n"
    assistant_opening = "<|im_start|>assistant\n"
    return user_turn + assistant_opening


# Tokenizer is loaded from the fine-tuned checkpoint; its EOS id is reused below as
# the pad token id for both models.
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_id_colorist_final)

# Baseline model: the stock "openai-community/gpt2-medium" checkpoint.
gpt_model = GPT2LMHeadModel.from_pretrained(
    "openai-community/gpt2-medium",
    pad_token_id=gpt_tokenizer.eos_token_id,
)

# Model under evaluation: loaded from the same identifier as the tokenizer.
finetuned_model = GPT2LMHeadModel.from_pretrained(
    model_id_colorist_final,
    pad_token_id=gpt_tokenizer.eos_token_id,
)


def response_generator(prompt,model,tokenizer):
    """Generate a completion for ``prompt`` with ``model`` and return it as text.

    The raw question is wrapped with ``formatted_prompt``, encoded with
    ``tokenizer`` and passed to ``model.generate``. Only the tokens produced
    after the encoded prompt are decoded, so neither the chat template nor the
    question is echoed back in the result.

    Args:
        prompt: Raw question text. It must NOT already be wrapped in the chat
            template; this function applies ``formatted_prompt`` itself.
        model: Object exposing ``generate(input_ids, attention_mask=..., ...)``.
            If it has a ``device`` attribute, the inputs are moved to that device.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.

    Returns:
        str: Every returned sequence rendered as ``"<index>: <text>... "`` and
        concatenated in order.
    """
    ids = tokenizer.encode(
        formatted_prompt(prompt),
        return_tensors='pt'
    )

    # Run on whatever device the model lives on instead of assuming CUDA.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        ids = ids.to(target_device)

    # Single un-padded sequence, so every position is a real token.
    attention_mask = torch.ones_like(ids)

    res = model.generate(
        ids,
        attention_mask=attention_mask,
        max_length=350,           # Upper bound on total length passed to generate
        num_return_sequences=1,
        do_sample=True,           # Sampling is enabled (this is not greedy decoding)
        num_beams=20,             # Combined with do_sample: sampling within beam search
        repetition_penalty=1.2,   # Penalize repetition
        no_repeat_ngram_size=2,   # Prevent repeating bigrams
        temperature=0.4,          # Control randomness
        top_k=50,                 # Limit sampling to top 50 tokens
        top_p=0.9,                # Nucleus sampling
        early_stopping=True
    )

    # Returned sequences begin with the prompt tokens; slice them off so only the
    # continuation is decoded. This replaces a str.replace(prompt, "") on the decoded
    # text, which left the template markers behind and could also delete matching
    # text from inside the answer.
    prompt_len = ids.shape[-1]

    return "".join(
        "{}: {}...".format(
            i,
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True),
        ) + " "
        for i, output in enumerate(res)
    )


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def cosine_and_jaccard_similarity(input_text, output_text):
    """Score how similar two texts are using TF-IDF cosine and word-set Jaccard.

    Args:
        input_text: First text (str).
        output_text: Second text (str).

    Returns:
        dict: ``{"jaccard": float, "cosine": float}``.
        * ``cosine`` is the cosine similarity of the two texts' TF-IDF vectors,
          fitted on just these two texts. It is 0.0 when the vectorizer cannot
          build a vocabulary from them.
        * ``jaccard`` is ``|A & B| / |A | B|`` over the whitespace-separated
          words of each text (case- and punctuation-sensitive). It is 0.0 when
          neither text contains any word.
    """
    try:
        # Create TF-IDF vectors for the pair.
        tfidf_matrix = TfidfVectorizer().fit_transform([input_text, output_text])
    except ValueError:
        # Raised by the vectorizer when no usable token is found in either text
        # (empty vocabulary); treat that as "no measurable similarity".
        cosine_sim = 0.0
    else:
        # Row 0 is input_text, row 1 is output_text.
        cosine_sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    # Convert strings to sets of words.
    words_in = set(input_text.split())
    words_out = set(output_text.split())

    # Guard the division: two word-less texts would otherwise divide by zero.
    union = words_in | words_out
    jaccard_sim = len(words_in & words_out) / len(union) if union else 0.0

    return {
        "jaccard": jaccard_sim,
        "cosine": cosine_sim,
    }
    
    

In [None]:
import pandas as pd

# Compare the baseline and fine-tuned models on every row of the evaluation CSV.
# NOTE(review): the CSV path is an empty placeholder here; it must point at a file
# with "Question" and "Answer" columns before this cell can run.
df=pd.read_csv("")
df_list=df.to_dict(orient="records")
sims=[]       # per-example similarity scores for both models
res_list=[]   # per-example generated responses alongside the reference answer

for example in df_list:
    # Pass the RAW question: response_generator() applies formatted_prompt() itself,
    # so formatting it here as well would wrap the question in the template twice.
    prompt = example["Question"]

    answer = example['Answer']

    finetune_response = response_generator(prompt, finetuned_model, gpt_tokenizer)
    gpt_response = response_generator(prompt, gpt_model, gpt_tokenizer)

    print("="*50)
    print("GPT Answer")
    print(gpt_response)

    print("="*50)
    print("Finetuned Answer")
    print(finetune_response)

    print("-" * 50)
    print("Original Answer")
    print(answer)

    # Each model's response is scored against the reference answer.
    sim_fine_tune = cosine_and_jaccard_similarity(answer, finetune_response)

    print("="*50)
    print("Finetuned Answer Simmilarity")
    print(sim_fine_tune)

    sim_gpt = cosine_and_jaccard_similarity(answer, gpt_response)

    print("="*50)
    print("GPT Answer Simmilarity")
    print(sim_gpt)

    res_list.append({
        "gpt":gpt_response,
        "fine_tune":finetune_response,
        "original":answer
    })

    sims.append({
        "gpt":sim_gpt,
        "fine_tune":sim_fine_tune,
    })

    
    



In [8]:
import json

# Persist the similarity scores and the raw responses as JSON files.
# Serialize first so a failure in json.dumps cannot leave a truncated file behind,
# then write inside `with` blocks so each handle is closed even if the write fails.
sims_json = json.dumps(sims)
res_json = json.dumps(res_list)

with open("sim.json", "w") as f1:
    f1.write(sims_json)

with open("res.json", "w") as f2:
    f2.write(res_json)
