In [30]:
import os
import random
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict

In [55]:
LANG = "aeb_Arab"
DATA_DIR = "../data/sentiment_analysis/arabic/"
MODEL = "mistralai/Mistral-7B-v0.1"

In [44]:
datasets = {}
for split in ["train", "test", "validation"]:
    datasets[split] = pd.read_csv(os.path.join(DATA_DIR, LANG, f"{split}.csv"), header = None)
    datasets[split].columns = ["text", "label"]
    datasets[split] = Dataset.from_pandas(datasets[split], )
    print(f"{split} size: {len(datasets[split])}")
    
datasets = DatasetDict(datasets)

train size: 12303
test size: 3402
validation size: 1368


In [45]:
set(datasets["train"]["label"])

{'negative', 'positive'}

In [46]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12303
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3402
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1368
    })
})

In [47]:
datasets["train"].shuffle()

Dataset({
    features: ['text', 'label'],
    num_rows: 12303
})

In [51]:
def get_few_shot_examples(dataset, fs_per_label = 4, seed = 42):
    random.seed(seed)
    labels = list(set(dataset["label"]))
    few_shot_examples = []
    for label in labels:
        label_examples = dataset.filter(lambda example: example["label"] == label)
        # shuffle the examples
        label_examples = label_examples.shuffle(seed = seed)
        # get the first fs_per_label examples
        label_examples = label_examples.select(range(min(fs_per_label, len(label_examples))))
        few_shot_examples += [example for example in label_examples]
    
    #Shuffle the few shot examples
    random.shuffle(few_shot_examples)
    return few_shot_examples

In [52]:
few_shot_examples = get_few_shot_examples(datasets["train"], fs_per_label = 4, seed = 42)
print(f"Number of few shot examples: {len(few_shot_examples)}")

Filter:   0%|          | 0/12303 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12303 [00:00<?, ? examples/s]

Number of few shot examples: 8


In [53]:
few_shot_examples

[{'text': 'ملًا رويق', 'label': 'negative'},
 {'text': 'Chou ma7lek wenti mastoura ya5tyyyyy', 'label': 'positive'},
 {'text': 'أحسن فنان', 'label': 'positive'},
 {'text': 'يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع',
  'label': 'positive'},
 {'text': 'sl3a pffffffffffffffffffffff', 'label': 'negative'},
 {'text': 'Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo',
  'label': 'positive'},
 {'text': 'fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch',
  'label': 'negative'},
 {'text': 'Maaasta', 'label': 'negative'}]

In [59]:
def construct_prompt(test_example, fs_examples, prompt_for_each_label = False, labels = []):
    
    def example_to_prompt(example, add_label=True):
        ex_prompt = f"Sentence: {example['text']}\n"
        if add_label:
            ex_prompt += f"Label: {example['label']}\n"
        return ex_prompt
        
    # To Do: Add domain of the text in the instruction like "In this task you given text from {domain}"
    prompt = "In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\n"
    
    for example in fs_examples:
        prompt += example_to_prompt(example, add_label = True)
        prompt += "\n"
    
    if not prompt_for_each_label:
        prompt += example_to_prompt(test_example, add_label = False)
        return prompt
    else:
        prompts = [
            prompt + example_to_prompt(test_example, add_label = True)
            for label in labels
        ]
        gold_label_idx = labels.index(test_example["label"])
        return prompts, gold_label_idx

In [60]:
print(construct_prompt(
    test_example = datasets["test"][0],
    fs_examples = few_shot_examples,
))

In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.
Sentence: ملًا رويق
Label: negative

Sentence: Chou ma7lek wenti mastoura ya5tyyyyy
Label: positive

Sentence: أحسن فنان
Label: positive

Sentence: يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع
Label: positive

Sentence: sl3a pffffffffffffffffffffff
Label: negative

Sentence: Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo
Label: positive

Sentence: fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch
Label: negative

Sentence: Maaasta
Label: negative

Sentence: ليلى متمكنة برشا من المسرح



In [61]:
print(construct_prompt(
    test_example = datasets["test"][0],
    fs_examples = few_shot_examples,
    prompt_for_each_label = True,
    labels = list(set(datasets["train"]["label"]))
))

(['In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\nSentence: ملًا رويق\nLabel: negative\n\nSentence: Chou ma7lek wenti mastoura ya5tyyyyy\nLabel: positive\n\nSentence: أحسن فنان\nLabel: positive\n\nSentence: يا سيدي على عيشوشة والله تمثل مليح وتمثيلها مقنع\nLabel: positive\n\nSentence: sl3a pffffffffffffffffffffff\nLabel: negative\n\nSentence: Ja3four ma7lek oi ma7la idmarik inti il im7ali italvsa oi ena n7bk barcha oi  heoi mouch 5ayeb oi i7eb i3anet ines bravo\nLabel: positive\n\nSentence: fibelou dhamer ema mehwech motrobi. yetkelem belghacha. wallah meye7chemch\nLabel: negative\n\nSentence: Maaasta\nLabel: negative\n\nSentence: ليلى متمكنة برشا من المسرح\nLabel: positive\n', 'In this task, you are given a piece of text. Your task is to classify the sentiment of the text based on its content.\nSentence: ملًا رويق\nLabel: negative\n\nSentence: Chou ma7lek wenti mastoura ya5tyyyyy\nLabel: positive\n\nSentence: أحسن

In [62]:
model = AutoModelForCausalLM.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

device = "cuda" if torch.cuda.is_available() else "cpu"
# if torch.backends.mps.is_available():
#     device = "mps"
model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

KeyboardInterrupt: 