In [59]:
import os
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
CACHE_DIR = "/gscratch/argon/kahuja/.cache/"
LANG = "aeb_Arab"
DATA_DIR = "../data/sentiment_analysis/arabic/"
MODEL = "mistralai/Mistral-7B-v0.1"

In [12]:
datasets = {}
for split in ["train", "test", "validation"]:
    datasets[split] = pd.read_csv(os.path.join(DATA_DIR, LANG, f"{split}.csv"), header = None)
    datasets[split].columns = ["text", "label"]
    datasets[split] = Dataset.from_pandas(datasets[split])
    datasets[split] = datasets[split].filter(lambda example: not( example["text"] == "sentence" and example["label"] == "label"))
    print(f"{split} size: {len(datasets[split])}")
    
datasets = DatasetDict(datasets)

Filter:   0%|          | 0/12304 [00:00<?, ? examples/s]

train size: 12303


Filter:   0%|          | 0/3403 [00:00<?, ? examples/s]

test size: 3402


Filter:   0%|          | 0/1368 [00:00<?, ? examples/s]

validation size: 1368


In [13]:
set(datasets["train"]["label"])

{'negative', 'positive'}

In [14]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12303
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3402
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1368
    })
})

In [15]:
datasets["train"].shuffle()

Dataset({
    features: ['text', 'label'],
    num_rows: 12303
})

In [16]:
def get_few_shot_examples(dataset, fs_per_label = 4, seed = 42):
    random.seed(seed)
    labels = list(set(dataset["label"]))
    few_shot_examples = []
    for label in labels:
        label_examples = dataset.filter(lambda example: example["label"] == label)
        # shuffle the examples
        label_examples = label_examples.shuffle(seed = seed)
        # get the first fs_per_label examples
        label_examples = label_examples.select(range(min(fs_per_label, len(label_examples))))
        few_shot_examples += [example for example in label_examples]
    
    #Shuffle the few shot examples
    random.shuffle(few_shot_examples)
    return few_shot_examples

In [17]:
few_shot_examples = get_few_shot_examples(datasets["train"], fs_per_label = 4, seed = 42)
print(f"Number of few shot examples: {len(few_shot_examples)}")

Filter:   0%|          | 0/12303 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12303 [00:00<?, ? examples/s]

Number of few shot examples: 8


In [18]:
few_shot_examples

[{'text': 'يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع',
  'label': 'positive'},
 {'text': 'fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch',
  'label': 'negative'},
 {'text': 'sl3a pffffffffffffffffffffff', 'label': 'negative'},
 {'text': 'ملًا رويق', 'label': 'negative'},
 {'text': 'أحسن فنان', 'label': 'positive'},
 {'text': 'Maaasta', 'label': 'negative'},
 {'text': 'Chou ma7lek wenti mastoura ya5tyyyyy', 'label': 'positive'},
 {'text': 'Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo',
  'label': 'positive'}]

In [19]:
def construct_prompt(test_example, fs_examples, prompt_for_each_label = False, labels = []):
    
    def example_to_prompt(example, add_label=True):
        ex_prompt = f"Sentence: {example['text']}\n"
        if add_label:
            ex_prompt += f"Label: {example['label']}\n"
        return ex_prompt
        
    # To Do: Add domain of the text in the instruction like "In this task you given text from {domain}"
    prompt = "In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\n"
    
    for example in fs_examples:
        prompt += example_to_prompt(example, add_label = True)
        prompt += "\n"
    
    if not prompt_for_each_label:
        prompt += example_to_prompt(test_example, add_label = False)
        return prompt
    else:
        prompts = [
            prompt + example_to_prompt(test_example, add_label = True)
            for label in labels
        ]
        gold_label_idx = labels.index(test_example["label"])
        return prompts, gold_label_idx

In [20]:
print(construct_prompt(
    test_example = datasets["test"][0],
    fs_examples = few_shot_examples,
))

In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.
Sentence: يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع
Label: positive

Sentence: fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch
Label: negative

Sentence: sl3a pffffffffffffffffffffff
Label: negative

Sentence: ملًا رويق
Label: negative

Sentence: أحسن فنان
Label: positive

Sentence: Maaasta
Label: negative

Sentence: Chou ma7lek wenti mastoura ya5tyyyyy
Label: positive

Sentence: Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo
Label: positive

Sentence: ليلى متمكنة برشا من المسرح



In [21]:
print(construct_prompt(
    test_example = datasets["test"][0],
    fs_examples = few_shot_examples,
    prompt_for_each_label = True,
    labels = list(set(datasets["train"]["label"]))
))

(['In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\nSentence: يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع\nLabel: positive\n\nSentence: fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch\nLabel: negative\n\nSentence: sl3a pffffffffffffffffffffff\nLabel: negative\n\nSentence: ملًا رويق\nLabel: negative\n\nSentence: أحسن فنان\nLabel: positive\n\nSentence: Maaasta\nLabel: negative\n\nSentence: Chou ma7lek wenti mastoura ya5tyyyyy\nLabel: positive\n\nSentence: Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo\nLabel: positive\n\nSentence: ليلى متمكنة برشا من المسرح\nLabel: positive\n', 'In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\nSentence: يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع\nLabel: positive\n\nSentence: fibelou dhamer ema mehwech motro

In [23]:
model = AutoModelForCausalLM.from_pretrained(MODEL, cache_dir=CACHE_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL, cache_dir=CACHE_DIR)

device = "cuda" if torch.cuda.is_available() else "cpu"
# if torch.backends.mps.is_available():
#     device = "mps"
model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  

In [56]:

def get_neg_log_prob(model, tokenizer, prompt, device):
    
    model.eval()
    tokenized_out = tokenizer(prompt, return_tensors="pt")
    input_ids = tokenized_out["input_ids"].to(device)
    labels = input_ids
    
    with torch.no_grad():
        output = model(input_ids, labels=labels)
    
    return output.loss

def generate(model, tokenizer, prompt, device, max_tokens = 20):
    
    tokenized_out = tokenizer(prompt, return_tensors="pt")
    input_ids = tokenized_out["input_ids"].to(device)
    labels = tokenized_out["input_ids"].to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_new_tokens=max_tokens)
    generated_text= tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    prefix_to_remove = prompt
    if generated_text.startswith(prefix_to_remove):
        generated_text = generated_text[len(prefix_to_remove):].strip()
#     generated_text = generated_text.split("\n\n")[0].split("Label:")[-1].strip()
    
    return generated_text

    
def evaluate_model_nll(model, tokenizer, test_prompts):
    preds = []
    acc = 0
    for test_prompt in tqdm(test_prompts):
        neg_log_probA = get_neg_log_prob(model, tokenizer, test_prompt["correct_prompt"])
        neg_log_probB = get_neg_log_prob(model, tokenizer, test_prompt["incorrect_prompt"])
        if neg_log_probA < neg_log_probB:
            acc += 1
    acc = acc / len(test_prompts)
    
    return acc

def process_text(text):
#     return text.split("Label:")[-1].strip().lower()
    return text.lower()

def process_generation(generation):
    return generation.split("\n\n")[0].split("Label:")[-1].strip().lower()

def evaluate_generation(generated_text, label):
    return float(process_generation(generated_text) == process_text(label))

def evaluate_model_gen(model, tokenizer, test_prompts, device):
    
    preds = []
    correct_or_not = []
    accs = 0
    for test_prompt in tqdm(test_prompts):
        generated_text = generate(model, tokenizer, test_prompt["prompt"], device = device)
        preds.append(process_text(generated_text))
        correct_or_not.append(evaluate_generation(generated_text, test_prompt["label"]))
    
    return np.mean(correct_or_not), preds, correct_or_not

In [57]:
test_prompts = [{"prompt": construct_prompt(test_example, few_shot_examples), "label": test_example["label"]}
                    for _, test_example in zip(range(5), datasets["validation"])]

In [60]:
acc, preds, correct_or_not = evaluate_model_gen(model,
                                                tokenizer, 
                                                test_prompts, device = "cpu")

  0%|          | 0/5 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio