In [1]:
from datasets import load_dataset
from datasets import Dataset
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


#### Load our data in a dataset object

In [2]:
article_path = Path("data/newsela_article_corpus_2016-01-29/articles")
columns = ["slug", "language", "title", "grade_level", "version", "filename"]
raw_dataset = Dataset.from_csv("data/newsela_article_corpus_2016-01-29/articles_metadata.csv", delimiter=",")

raw_dataset = raw_dataset.rename_column("slug", "id")  # Assuming 'slug' is equivalent to 'id'

def load_file(file_path):
    with open(file_path, 'r') as f:
        return f.read()

# Create a function to process and prepare the dataset
def process_dataset(example):
    # Customize this function based on your needs
    return {
        "en": load_file(Path(article_path, example["filename"])), 
        "s_en": load_file(Path(article_path,(example["filename"][:-5]
                                                 +str(1)
                                                 +".txt"))),
        "id": example["filename"]
    }

raw_dataset = raw_dataset.filter(lambda x: int(x["version"]) == 0)
raw_dataset = raw_dataset.filter(lambda x: x["filename"] != "US-Constitution.en.0.txt")

# Apply the processing function to the dataset
processed_dataset = raw_dataset.map(process_dataset)


processed_dataset.set_format(type="torch",
                             columns=["en", "id", "s_en"])

# Print the processed dataset
print(processed_dataset)

Dataset({
    features: ['id', 'language', 'title', 'grade_level', 'version', 'filename', 'en', 's_en'],
    num_rows: 2153
})


#### Check how some datasets look compared to ours

In [3]:
processed_dataset[0]

{'id': '10dollarbill-woman.en.0.txt',
 'en': 'WASHINGTON — An abolitionist. The longest-serving first lady. The Labor secretary through the Great Depression. The founder of the Girl Scouts.\n\nThese are some of the candidates to be the first woman on U.S. currency notes in more than a century. Treasury Secretary Jacob J. Lew announced the plans this week, saying the all-male lineup on American money has gone on long enough.\n\n"We will right that wrong, and when the new, redesigned $10 note is released, it will bear the portrait of a woman," he said at the National Archives in Washington.\n\nWhile Lew gets to decide who is featured, he and other Treasury officials will fan out across the country to solicit suggestions. They\'ve set up a website and enlisted Twitter to spread the word. A non-profit group called Women On 20s, formed to convince President Barack Obama to put a woman\'s image on the $20 note, already has done some polling.\n\nThe winner in that contest was Harriet Tubman, 

In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset['train'][0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [5]:
books = load_dataset("opus_books", "en-fr")
books['train'][0]

{'id': '0', 'translation': {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}}

In [6]:
alpaca = load_dataset("tatsu-lab/alpaca")
alpaca["train"][5]

{'instruction': 'Identify the odd one out.',
 'input': 'Twitter, Instagram, Telegram',
 'output': 'Telegram',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nIdentify the odd one out.\n\n### Input:\nTwitter, Instagram, Telegram\n\n### Response:\nTelegram'}

### Transform our data to be similar to alpaca data 

In [7]:
article_path = Path("data/newsela_article_corpus_2016-01-29/articles")
columns = ["slug", "language", "title", "grade_level", "version", "filename"]
raw_dataset = Dataset.from_csv("data/newsela_article_corpus_2016-01-29/articles_metadata.csv", delimiter=",")

raw_dataset = raw_dataset.rename_column("slug", "id")  # Assuming 'slug' is equivalent to 'id'

def load_file(file_path):
    with open(file_path, 'r') as f:
        return f.read()
    
def format_instruction(en: str, s_en: str):
	return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
        ### Instruction:
        Simplify the text.

        ### Input:
        {en.strip()}

        ### Summary:
        {s_en}
        """.strip()

# Create a function to process and prepare the dataset
def process_dataset(example):
    # apply preprocessing and tokenization here ??
    en = load_file(Path(article_path, example["filename"]))
    s_en = load_file(Path(article_path,(example["filename"][:-5]
                                                 +str(1)
                                                 +".txt")))
    return {
        "instruction": "Simplify the text",
        "input": en, 
        "output": s_en,
        "id": example["filename"],
        "text": format_instruction(en, s_en)
    }

raw_dataset = raw_dataset.filter(lambda x: int(x["version"]) == 0)
raw_dataset = raw_dataset.filter(lambda x: x["filename"] != "US-Constitution.en.0.txt")

# Apply the processing function to the dataset
alpaca_like_dataset = raw_dataset.map(process_dataset)


alpaca_like_dataset.set_format(type="torch",
                             columns=["instruction", "input", "output", "id", "text"])

# Print the processed dataset
print(alpaca_like_dataset)

Dataset({
    features: ['id', 'language', 'title', 'grade_level', 'version', 'filename', 'instruction', 'input', 'output', 'text'],
    num_rows: 2153
})


In [8]:
alpaca_like_dataset[0]

{'id': '10dollarbill-woman.en.0.txt',
 'instruction': 'Simplify the text',
 'input': 'WASHINGTON — An abolitionist. The longest-serving first lady. The Labor secretary through the Great Depression. The founder of the Girl Scouts.\n\nThese are some of the candidates to be the first woman on U.S. currency notes in more than a century. Treasury Secretary Jacob J. Lew announced the plans this week, saying the all-male lineup on American money has gone on long enough.\n\n"We will right that wrong, and when the new, redesigned $10 note is released, it will bear the portrait of a woman," he said at the National Archives in Washington.\n\nWhile Lew gets to decide who is featured, he and other Treasury officials will fan out across the country to solicit suggestions. They\'ve set up a website and enlisted Twitter to spread the word. A non-profit group called Women On 20s, formed to convince President Barack Obama to put a woman\'s image on the $20 note, already has done some polling.\n\nThe win

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoConfig, PretrainedConfig
import torch

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                torch_dtype=torch.bfloat16,
                                                device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.25s/it]


In [10]:
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    # target_modules=["query_key_value"],
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], #specific to Llama models.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 16777216 || all params: 6755192832 || trainable%: 0.24836028248556738


In [14]:
# import wandb


args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    logging_steps=10,
    learning_rate=3e-4,
    fp16=True,
    max_grad_norm=1,
    num_train_epochs=2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir='output/',
    # report_to="wandb",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

In [16]:
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=alpaca_like_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=args,
)

Map: 100%|██████████| 2153/2153 [00:02<00:00, 930.74 examples/s]


In [17]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtantarudragos[0m ([33mdtant[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/66 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacty of 23.66 GiB of which 151.38 MiB is free. Including non-PyTorch memory, this process has 21.14 GiB memory in use. Of the allocated memory 20.49 GiB is allocated by PyTorch, and 354.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF