In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

# Step 2: Load the Dataset
# Exactly one `dataset_path` assignment must be active: the path is read here
# and the last cell of the notebook writes the labelled frame back to it.
# Previously all three candidates were commented out, so a fresh kernel failed
# with NameError on `dataset_path`. The first candidate is selected by default;
# move the comment marker to run on a different file.
dataset_path = '/content/drive/My Drive/relabeled_pawsx_train/train_paws_x_en.tsv'
# dataset_path = '/content/drive/My Drive/relabeled_pawsx_train/paws-x-test.tsv'
# dataset_path = '/content/drive/My Drive/relabeled_pawsx_train/stsbenchmark_train.tsv'
dataset = pd.read_csv(dataset_path, sep='\t')

# The prompt builder reads these two columns; fail early if the file lacks them.
assert {'sentence1', 'sentence2'} <= set(dataset.columns), \
    "dataset must contain 'sentence1' and 'sentence2' columns"

In [None]:
# Step 3: Set Up and Use the Model
!pip install transformers accelerate huggingface_hub bitsandbytes

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Select the CUDA device when PyTorch can see one; otherwise use the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint identifier shared by the tokenizer and the model
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Quantization settings: load weights in 4-bit NF4 with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    # Compute dtype; may be changed to suit preference and available resources
    bnb_4bit_compute_dtype=torch.float16,
)

# Fetch the tokenizer, then the model with the quantization config applied
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
from tqdm import tqdm
from collections import Counter
# Step 4: Define Functions to Create Prompts with Sentence Pairs and Generate Predictions
def create_prompts_with_sentence_pairs(df, system_message):
    """Build one chat-formatted prompt string per row of ``df``.

    Each prompt consists of a system turn carrying ``system_message``, a user
    turn asking whether the row's two sentences are paraphrases (to be answered
    with 'Yes' or 'No'), and an opened assistant turn left for the model.

    Args:
        df: DataFrame providing the ``sentence1`` and ``sentence2`` columns.
        system_message: Text placed in the system turn of every prompt.

    Returns:
        A list of prompt strings, one per row, in row order.
    """
    # Assembled line by line; the newlines are part of the prompt.
    template = (
        "<|begin_of_text|>\n"
        "<|start_header_id|>system<|end_header_id|>\n"
        "{system_message}<|eot_id|>\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        "\n"
        "Are the following sentences paraphrases?\n"
        "\n"
        "\n"
        "Sentence 1: {sentence1}\n"
        "Sentence 2: {sentence2}\n"
        "\n"
        "Answer with 'Yes' or 'No'<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
        "\n"
    )
    return [
        template.format(
            system_message=system_message,
            sentence1=record['sentence1'],
            sentence2=record['sentence2'],
        )
        for _, record in df.iterrows()
    ]

def generate_batch(prompts, batch_size, generator=None):
    """Run the prompts through the generator in chunks and label each reply.

    A prompt is labelled ``True`` when the generated continuation, lower-cased
    and stripped of surrounding whitespace, starts with "yes"; otherwise it is
    labelled ``False``.

    Note: generation uses sampling (``do_sample=True``), so labels are not
    guaranteed to be identical between runs.

    Args:
        prompts: Sequence of prompts; every item is converted with ``str``.
        batch_size: Number of prompts handed to the generator per call.
        generator: Callable invoked as ``generator(batch, **generation_kwargs)``
            that returns, for every prompt, a list whose first item is a dict
            with a ``'generated_text'`` entry. Defaults to the module-level
            ``text_generator`` pipeline.

    Returns:
        A list of booleans, one per prompt, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step used to produce an empty range and silently return [].
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if generator is None:
        # Looked up at call time, so the pipeline may be created in a later cell.
        generator = text_generator

    results = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        # Ensure the chunk is a list of strings
        batch = [str(prompt) for prompt in prompts[start:start + batch_size]]
        generated = generator(
            batch,
            max_new_tokens=3,
            do_sample=True,
            top_k=40,
            top_p=0.95,
        )
        for prompt, generation in zip(batch, generated):
            completion = generation[0]['generated_text']
            # Drop the echoed prompt only when it is actually present, so a
            # generator that returns just the new text is still parsed correctly.
            if completion.startswith(prompt):
                completion = completion[len(prompt):]
            results.append(completion.lower().strip().startswith("yes"))
        # Running totals after every chunk
        print()
        print("Positive Completed " + str(sum(results)))
        print("Total Completed " + str(len(results)))
    return results


In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline
text_generator = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Build one prompt per dataset row, all sharing the same system message
prompts = create_prompts_with_sentence_pairs(
    dataset,
    system_message="You are a helpful Assistant.",
)

In [None]:
import logging
from transformers import logging as transformers_logging

# Raise the threshold of the standard-library "transformers" logger to ERROR
logging.getLogger("transformers").setLevel(level=logging.ERROR)

# Ask the transformers logging helper to report errors only
transformers_logging.set_verbosity_error()


In [None]:
# Name of the dataset column that will hold this run's labels
experiment_name = "LLama3 ICL_4 (Ex. Same Content)"
# Number of prompts passed to the generator per chunk
batch_size = 128
predictions = generate_batch(prompts=prompts, batch_size=batch_size)

In [None]:
# Attach this run's predictions to the dataset as a column named after the
# experiment; assumes `predictions` has one entry per dataset row, in row order.
dataset[experiment_name] = predictions

In [None]:
# Store the True/False predictions as 1/0 integers, then display the frame
dataset[experiment_name] = dataset[experiment_name].astype("int64")
dataset

In [None]:
# Write the labelled frame back to the file it was loaded from.
# index=False keeps the row index out of the file: the loader reads the TSV
# without an index column, so a written index would come back as an extra
# "Unnamed: 0" column and accumulate with every run of the notebook.
dataset.to_csv(dataset_path, sep="\t", index=False)