In [46]:
import argparse
import os
import csv
import json
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
from transformers import logging
from generate_utility import *
from transformers.generation import GenerationConfig
from peft import PeftModel
import bitsandbytes as bnb

logging.set_verbosity_error()

# CACHE_DIR = "/gscratch/argon/kahuja/.cache/"
# DATA_DIR = "data/sentiment_analysis/arabic/"
# MODEL2HFSTR = {"mistral": "mistralai/Mistral-7B-v0.1"}



def load_datasets(lang):
    datasets = {}
    for split in ["train", "val"]:
        with open(os.path.join(DATADIR,lang,f"{split}.jsonl"), encoding="utf-8") as f:
            lines = f.read().splitlines()
        line_dicts = [json.loads(line) for line in lines]
        datasets[split] = pd.DataFrame(line_dicts)
        datasets[split] = Dataset.from_pandas(datasets[split])
        # datasets[split] = datasets[split].filter(
        #     lambda example: not (
        #         example["text"] == "sentence" and example["label"] == "label"
        #     )
        # )
        print(f"{split} size: {len(datasets[split])}")

    datasets = DatasetDict(datasets)
    return datasets

    return datasets



In [61]:
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk
import pandas as pd
import os
import json

DATADIR='../data'
def make_all_train_data(DATADIR):
    datasets={'train':[]}
    for f in os.listdir(DATADIR):
        if str(f).startswith('copa'):
            for f1 in os.listdir(os.path.join(DATADIR,f)):
                if str(f1).startswith('train'):
                    print(f,f1)
                    with open(os.path.join(DATADIR,f,f1), encoding="utf-8") as f2:
                        lines = f2.read().splitlines()
                    line_dicts = [json.loads(line) for line in lines]
                    datasets['train'].extend(line_dicts)
    datasets['train'] = Dataset.from_pandas(pd.DataFrame(data=datasets['train']))
    datasets = DatasetDict(datasets)
    datasets=datasets.shuffle(seed=41)
    return datasets

def make_all_val_data(DATADIR):
    datasets={}
    for f in os.listdir(DATADIR):
        if str(f).startswith('copa'):
            for f1 in os.listdir(os.path.join(DATADIR,f)):
                if str(f1).startswith('train'):
                    print(f,f1)
                    with open(os.path.join(DATADIR,f,f1), encoding="utf-8") as f2:
                        lines = f2.read().splitlines()
                    line_dicts = [json.loads(line) for line in lines]
                    datasets[str(f)]=line_dicts
                    datasets[str(f)] = Dataset.from_pandas(pd.DataFrame(data=datasets[str(f)]))
    datasets = DatasetDict(datasets)
    datasets=datasets.shuffle(seed=41)
    return datasets

train_datasets=make_all_train_data(DATADIR)
train_datasets.save_to_disk(os.path.join(DATADIR,'all_train'))

val_datasets=make_all_val_data(DATADIR)
val_datasets.save_to_disk(os.path.join(DATADIR,'all_val'))

copa-en train.jsonl
copa-hr train.jsonl
copa-mk train.jsonl
copa-mk train.trans.jsonl
copa-sl train.jsonl
copa-sl-cer train.jsonl
copa-sr train.jsonl
copa-sr train.trans.jsonl
copa-sr-tor train.jsonl
copa-sr-tor train.trans.jsonl


Flattening the indices: 100%|██████████| 4/4 [00:00<00:00, 68.02ba/s]

copa-en train.jsonl
copa-hr train.jsonl
copa-mk train.jsonl
copa-mk train.trans.jsonl
copa-sl train.jsonl
copa-sl-cer train.jsonl
copa-sr train.jsonl
copa-sr train.trans.jsonl
copa-sr-tor train.jsonl
copa-sr-tor train.trans.jsonl



Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 162.00ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 142.05ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 139.54ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 153.03ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 152.45ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 140.20ba/s]
Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 139.06ba/s]


In [63]:
val_datasets['copa-en']

Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx'],
    num_rows: 400
})

In [51]:
dataset=load_from_disk(os.path.join(DATADIR,'all_train'))

In [65]:
dataset['validation']=val_datasets['copa-en']

In [70]:
for r in val_datasets:
    print(r,val_datasets[r])

copa-en Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx'],
    num_rows: 400
})
copa-hr Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
    num_rows: 400
})
copa-mk Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
    num_rows: 400
})
copa-sl Dataset({
    features: ['choice1', 'choice2', 'idx', 'label', 'premise', 'question'],
    num_rows: 400
})
copa-sl-cer Dataset({
    features: ['choice1', 'choice2', 'idx', 'label', 'premise', 'question'],
    num_rows: 400
})
copa-sr Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
    num_rows: 400
})
copa-sr-tor Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
    num_rows: 400
})


In [48]:
DATADIR="../data"

In [49]:
dataset=load_datasets("copa-hr")

train size: 400
val size: 100


In [51]:
# import json
# import pandas as pd


# lines = []

# with open(os.path.join(DATADIR,'copa-hr','train.jsonl')) as f:
#     lines = f.read().splitlines()

# line_dicts = [json.loads(line) for line in lines]
# df_final = pd.DataFrame(line_dicts)

In [52]:
df_final.columns

Index(['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'], dtype='object')

In [53]:
lang="Croatian"
preamble = f"""You are a helpful following assistant whose goal is to select the preferred (least wrong) output for a given instruction in {lang}."""

prompt_template="""Instruction: Given the premise, ""{premise}"", What is the correct {question}?
{question} A: {choice1}
{question} B: {choice2}
Correct {question}: {correct_answer}"""

choices=["A","B"]

def get_few_shot_examples(dataset, fs_per_label=1, seed=42):
    labels = list(set(dataset["label"]))
    few_shot_examples = []
    for label in labels:
        label_examples = dataset.filter(lambda example: example["label"] == label and example["question"]=='cause')
        # shuffle the examples
        label_examples = label_examples.shuffle(seed=seed)
        # get the first fs_per_label examples
        label_examples = label_examples.select(
            range(min(fs_per_label, len(label_examples)))
        )
        few_shot_examples += [example for example in label_examples]
        
        label_examples = dataset.filter(lambda example: example["label"] == label and example["question"]=='effect')
        # shuffle the examples
        label_examples = label_examples.shuffle(seed=seed)
        # get the first fs_per_label examples
        label_examples = label_examples.select(
            range(min(fs_per_label, len(label_examples)))
        )
        few_shot_examples += [example for example in label_examples]

    # Shuffle the few shot examples
    random.shuffle(few_shot_examples)
    return few_shot_examples

def construct_prompt(ds_examples):
    def example_to_prompt(example, add_label=True):
        ex_prompt = f"Sentence: {example['text']}\n"
        if add_label:
            ex_prompt += f"Label: {example['label']}\n"
        return ex_prompt

    # To Do: Add domain of the text in the instruction like "In this task you given text from {domain}

    # Format the first five rows as examples for 5-shot prompting
    prompt_examples = "\n\n".join([ prompt_template.format(**d,correct_answer=choices[int(d["label"])-1]) for d in ds_examples])
    prompt_examples=preamble+"\n\n\n"+prompt_examples
    return prompt_examples
#     for example in fs_examples:
#         prompt += example_to_prompt(example, add_label=True)
#         prompt += "\n"

#     if not prompt_for_each_label:
#         prompt += example_to_prompt(test_example, add_label=False)
#         return prompt
#     else:
#         prompts = [
#             prompt + example_to_prompt(test_example, add_label=True) for label in labels
#         ]
#         gold_label_idx = labels.index(test_example["label"])
#         return prompts, gold_label_idx


In [54]:
fs_examp=get_few_shot_examples(dataset['train'])

Filter: 100%|██████████| 400/400 [00:00<00:00, 62826.60 examples/s]
Filter: 100%|██████████| 400/400 [00:00<00:00, 88506.10 examples/s]
Filter: 100%|██████████| 400/400 [00:00<00:00, 88124.89 examples/s]
Filter: 100%|██████████| 400/400 [00:00<00:00, 85702.98 examples/s]


In [55]:
fs_prompt=construct_prompt(fs_examp)

In [56]:
print(fs_prompt)

You are a helpful following assistant whose goal is to select the preferred (least wrong) output for a given instruction in Croatian.


Instruction: Given the premise, ""Ometen sam u razgovoru sa ženom."", What is the correct cause?
cause A: Svi su u sobi govorili.
cause B: Žena je pričala smiješnu priču.
Correct cause: B

Instruction: Given the premise, ""Mačka je prela."", What is the correct cause?
cause A: Ogrebla me.
cause B: Pomazio sam je.
Correct cause: A

Instruction: Given the premise, ""Ravnatelj škole uveo je pravila oblačenja."", What is the correct effect?
effect A: Učenici su se pobunili protiv te odluke.
effect B: Učenici su izbačeni iz škole.
Correct effect: B

Instruction: Given the premise, ""Starija žena doživjela je moždani udar."", What is the correct effect?
effect A: Ženina kći došla joj je očistiti kuću.
effect B: Ženina kći uselila se da bi vodila brigu o njoj.
Correct effect: A


In [10]:
base_model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf',
                                                  token="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", 
                                                 cache_dir='../models/Llama-2-7b-hf')

Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.01s/it]


In [11]:
tokenizer = AutoTokenizer.from_pretrained('../models/mala-500')

In [16]:
base_model.resize_token_embeddings(260164)
model = PeftModel.from_pretrained(base_model, '../models/mala-500')

In [28]:
tokenizer.pad_token_id = tokenizer.eos_token_id
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(260164, 4096)
          (modules_to_save): ModuleDict(
            (default): Embedding(260164, 4096)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()

In [52]:
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('-model', '--model', type=str, help='data folder location', required=True)
    args = parser.parse_args()
    return args  

if __name__ == "__main__":
    # args = get_arguments()
    # print(args)
    # model_name=args.model
    model_name="llama2_7b_chat"
    
    model_configs={
    'llama2_7b_chat':{
        'model_path':'/projects/antonis/models/LLaMA-v2/Llama2-chat-hf/7B'
    },
    'llama2_7b':{
        'model_path':'../models/Llama-2-7b'
    },
    }

    load_4bit=False
    load_8bit=False


    model_path=model_configs[model_name]['model_path']
    model, tokenizer = get_model(model_path,load_4bit=False,load_8bit=False)

Loading checkpoint shards: 100%|██████████| 3/3 [01:04<00:00, 21.50s/it]


In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "../models/aya-101"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Loading checkpoint shards: 100%|██████████| 11/11 [00:55<00:00,  5.06s/it]


In [23]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

In [33]:
encodings=tokenizer(all_val_prompts[0:10], return_tensors="pt", padding='longest', truncation=False).to("cuda")
output_ids=model.generate(**encodings, **gen_config)
responses=tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(responses)

['B', 'B', 'A', 'B', 'cause B', 'A', 'cause A', 'cause A', 'cause B', 'A']


In [15]:
gen_config = {
                "temperature": 0.7,
                "top_p": 0.1,
                "repetition_penalty": 1.18,
                "top_k": 40,
                "do_sample": True,
                "max_new_tokens": 5,
                "pad_token_id": tokenizer.eos_token_id
                    }

In [35]:
def generate_result(prompts,gen_config,model_name='aya',bs=4):
    all_response=[]
    all_response_raw=[]
    end=len(prompts)
    for start in tqdm(range(0,end,bs)):
        stop=min(start+bs,len(prompts)-1)
        if start<stop:
            prompts_batch=prompts[start:stop]
            encodings=tokenizer(prompts_batch, return_tensors="pt", padding='longest', truncation=False).to("cuda")
            with torch.no_grad():
                output_ids = model.generate(**encodings, **gen_config)
            responses=tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            for i,response_raw in enumerate(responses):
                sample_no=i+start
                if model_name!='aya':
                    response=response_raw[len(prompts[sample_no]):]
                    response=response.split("\n")[0].strip() if "\n" in response else response.strip()
                else:
                    response=response_raw[-1]
                all_response.append(response)
                all_response_raw.append(response_raw)
                
    return all_response_raw,all_response

In [57]:
all_val_prompts=[]
all_val_labels=[]
tokenizer.pad_token_id = tokenizer.eos_token_id
for row in dataset['val']:
    prompt=(fs_prompt + "\n\n" + prompt_template.format(**row, correct_answer="")).strip()
    all_val_prompts.append(prompt)
    all_val_labels.append(row['label'])

In [59]:
all_response_raw,all_response=generate_result(all_val_prompts,gen_config,'aya')

100%|██████████| 25/25 [00:28<00:00,  1.12s/it]


In [60]:
count=0
for i,res in enumerate(all_response):
    if res in choices:
        if choices.index(res)==all_val_labels[i]:
            count+=1
acc=count/len(all_response)
print(acc)

0.7474747474747475


  from .autonotebook import tqdm as notebook_tqdm


In [58]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional, Union
import time
import json

import numpy as np
import torch
from datasets import load_dataset

import transformers
from transformers import (
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint, is_main_process

In [173]:
datasets=load_datasets("copa-en")

train size: 400
val size: 100


In [174]:
datasets['train'][0]

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'label': 0,
 'idx': 0}

In [162]:
model_name="classla/bcms-bertic"
config = AutoConfig.from_pretrained(model_name
    )
tokenizer = AutoTokenizer.from_pretrained(
        model_name
    )
model = AutoModelForMultipleChoice.from_pretrained(model_name
    )

In [175]:
context_name = "premise"
question_header_name = "question"
ending_names = [f"choice{i}" for i in [1, 2]]
num_answers = 2

In [176]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


def preprocess_function(examples):
    first_sentences = [
      [f"{context} What was the {question}?"] * num_answers for context, question in zip(examples[context_name], examples[question_header_name])
    ]
    second_sentences = [
            [f"{examples[end][i]}" for end in ending_names] for i, _ in enumerate(first_sentences)
        ]
    # Flatten out
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    # Tokenize
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=128,
        padding="max_length"
    )

    # Un-flatten
    # unflattened = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
    unflattened = {k: [v[i : i + num_answers] for i in range(0, len(v), num_answers)] for k, v in tokenized_examples.items()}
    return unflattened

tokenized_datasets = datasets.map(
preprocess_function,
batched=True
)

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Map: 100%|██████████| 400/400 [00:00<00:00, 7589.58 examples/s]
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Map: 100%|██████████| 100/100 [00:00<00:00, 6192.13 examples/s]


In [177]:
# Data collator
data_collator = (
    DataCollatorForMultipleChoice(tokenizer)
)


In [178]:
def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [179]:
trainer = Trainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [180]:
trainer.eval_dataset

Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [181]:
train_result = trainer.train()

{'train_runtime': 13.1622, 'train_samples_per_second': 91.17, 'train_steps_per_second': 11.396, 'train_loss': 0.6914518229166666, 'epoch': 3.0}


In [182]:
esults = trainer.evaluate()

{'eval_loss': 0.6911947727203369, 'eval_accuracy': 0.47999998927116394, 'eval_runtime': 0.3688, 'eval_samples_per_second': 271.154, 'eval_steps_per_second': 35.25, 'epoch': 3.0}


In [185]:
#eng---
# mbert=55
# bert=66
# xlmr=54
# bertic=48
# llama2-chat=75
# mala-500=50
# aya=80

#copa-hr
# bert=55
# mbert=57
# xlm-r=54
# bertic=64
# llama2 = 50
# aya= 75

##-modeling from scratch/finetuning with llm-augmented data generation?
# improve dialectal performance with the aid of llm
# trend
# - previously: llm works or not, benchmarking
# - current: reinventing mbert experiments: adding augmented data to improve the performance, llm generated data to train
# - next: Think outside of the traditional benchmarking?
# -     taking high resource and it's low-resource counterparts. 
        #Use prompt techniques 
        #See results and use the insight from there to use lora training. 
# - low-resource data is the bottolneck here. how much augmentation/synthetic data actually help. In which cases, it must fail?
# - what extra thing can llm does which mbert cant do except synthetic data generation


1. ft on all other data at once/one by one (mbert, xlm-r, bertic)
2. get result from aya on all of them (2 cause, 4 cause in prompt)
3. for a give test example, try to choose most similar training examples, in prompt provide info about dialect