In [None]:
# Install pinned versions of torch and torchdata into the kernel's environment.
%pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1

In [None]:
# Shell out to nvidia-smi to show the GPU(s) available to this kernel's host.
!nvidia-smi

## _==> Please ignore all WARNINGs and ERRORs from the `pip install` commands above. <==_

# Hugging Face Transformers
## _==> Please ignore all WARNINGs and ERRORs from the `pip install` commands below. <==_

In [2]:
# Install the Hugging Face tooling used below (-U upgrades, -q keeps pip quiet).
# NOTE(review): `accelerate` is the only package in this list without a version
# pin — consider pinning it so a fresh run resolves the same environment.
%pip install --disable-pip-version-check -Uq \
    transformers==4.27.2 \
    datasets==2.9.0 \
    accelerate \
    promptsource==0.2.3 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
%pip install git+https://github.com/huggingface/peft.git

Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-sddx_9vn
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-sddx_9vn
  Resolved https://github.com/huggingface/peft.git to commit 2822398fbe896f25d4dac5e468624dc5fd65a51b
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install git+https://github.com/lvwerra/trl.git

Collecting git+https://github.com/lvwerra/trl.git
  Cloning https://github.com/lvwerra/trl.git to /tmp/pip-req-build-leu_vhb8
  Running command git clone --filter=blob:none --quiet https://github.com/lvwerra/trl.git /tmp/pip-req-build-leu_vhb8
  Resolved https://github.com/lvwerra/trl.git to commit ce37eadcfa22f2a3c25422411a586b8f593e3e6e
  Preparing metadata (setup.py) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import argparse
import os

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model #, prepare_model_for_int8_training
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed

In [6]:
def get_prompt_with_response(example):
    """Render a dataset example as a complete "Question / Answer" text.

    Args:
        example: Mapping that provides the ``'question'`` and ``'response_j'`` entries.

    Returns:
        str: The question followed by the ``response_j`` answer in the prompt template.
    """
    question, answer = example['question'], example['response_j']
    return f"Question: {question}\n\nAnswer: {answer}"

def get_prompt_from_question(question):
    """Wrap a raw question in the prompt template, leaving the answer slot empty.

    Args:
        question: The question text to embed.

    Returns:
        str: The prompt, ending in ``"Answer: "`` so a model can continue from there.
    """
    return f"Question: {question}\n\nAnswer: "

def get_prompt_without_response(example):
    """Build the prompt for a dataset example without including its answer.

    Args:
        example: Mapping that provides the ``'question'`` entry.

    Returns:
        str: The prompt, ending in ``"Answer: "`` with nothing after it.
    """
    question = example['question']
    return f"Question: {question}\n\nAnswer: "

def get_response_without_prompt(example):
    """Return the ``response_j`` entry of a dataset example, unchanged.

    Args:
        example: Mapping that provides the ``'response_j'`` entry.
    """
    answer = example['response_j']
    return answer


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Measure the mean number of characters per token over the first examples of a dataset.

    Args:
        dataset: Iterable of examples accepted by ``get_prompt_with_response``.
        tokenizer: Tokenizer exposing ``is_fast`` and either a callable encoding
            with ``.tokens()`` (fast) or ``.tokenize()`` (slow).
        nb_examples (int): Maximum number of examples to sample.

    Returns:
        float: Total characters divided by total tokens over the sampled examples.
    """

    def count_tokens(text):
        # Fast tokenizers expose the token list on the encoding; slow ones go through tokenize().
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    char_count = 0
    token_count = 0
    # Zipping with range() caps the scan at nb_examples without needing len(dataset).
    sample = zip(range(nb_examples), iter(dataset))
    for _, example in tqdm(sample, total=nb_examples):
        text = get_prompt_with_response(example)
        char_count += len(text)
        token_count += count_tokens(text)

    return char_count / token_count


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable, in absolute and relative terms.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, parameter)`` pairs,
            each parameter exposing ``numel()`` and ``requires_grad``.
    """
    # Snapshot (element count, trainable flag) for every parameter in a single pass.
    sizes_and_flags = [
        (parameter.numel(), parameter.requires_grad)
        for _, parameter in model.named_parameters()
    ]
    total = sum(size for size, _ in sizes_and_flags)
    trainable = sum(size for size, is_trainable in sizes_and_flags if is_trainable)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

    
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from a stream of examples.

    Examples are rendered with ``get_prompt_with_response``, buffered until about
    ``max_buffer_size`` characters have been collected, tokenized as one batch, joined
    with a separator token and cut into ``seq_length``-sized pieces. A trailing piece
    shorter than ``seq_length`` is dropped. Every yielded item carries the same token
    ids under both ``"input_ids"`` and ``"labels"``.

        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with the examples to render and tokenize.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        # Compare against None rather than truthiness: an EOS id of 0 is valid
        # and must not trigger the fallback.
        if tokenizer.eos_token_id is not None:
            self.concat_token_id = tokenizer.eos_token_id
        else:
            # Fallback: the module-level `eos_token_id` from the configuration cell.
            self.concat_token_id = eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        # Counts the chunks yielded so far across all iterations of this object.
        self.current_size = 0
        # Character budget per buffer fill, estimated from the chars-per-token ratio.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        """Yield ``{"input_ids", "labels"}`` dicts of ``LongTensor`` with ``seq_length`` ids each."""
        iterator = iter(self.dataset)
        more_examples = True
        # True once the current pass over the dataset has produced an example.
        # Without this, `infinite=True` on an empty (or one-shot, already
        # exhausted) dataset would restart the iterator forever.
        seen_example = False
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    example = next(iterator)
                except StopIteration:
                    if self.infinite and seen_example:
                        iterator = iter(self.dataset)
                        seen_example = False
                        continue
                    more_examples = False
                    break
                seen_example = True
                buffer.append(get_prompt_with_response(example))
                buffer_len += len(buffer[-1])
            if not buffer:
                # Nothing was collected, so there is nothing to tokenize or yield.
                break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                # Separate consecutive examples with the concat (EOS) token.
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted; the short tail is discarded.
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

# Fine-tune FLAN-T5 on the Stack Exchange paired dataset

In [7]:
# Run configuration. The values below are read by name in the later cells.

# Base checkpoint and the dataset subset/split used for fine-tuning.
model_path="google/flan-t5-base"
dataset_name="lvwerra/stack-exchange-paired"
subset="data/finetune"
split="train"
# size_valid_set and shuffle_buffer are only read by the streaming branch of the
# data-loading cell; with streaming=False the split uses train_test_split instead.
size_valid_set=4000
streaming=False
shuffle_buffer=5000
# Chunk length (in tokens) produced by ConstantLengthDataset.
seq_length=1024
# Optimisation settings passed to TrainingArguments.
max_steps=500 # was 5000
batch_size=2 # was 16
gradient_accumulation_steps=1
# Fallback separator id read by ConstantLengthDataset when the tokenizer has no EOS id.
eos_token_id=49152
learning_rate=1e-5
lr_scheduler_type="cosine"
num_warmup_steps=50 # was 100
weight_decay=0.05
# NOTE(review): local_rank is not referenced by the cells shown in this notebook.
local_rank=0
# The "no_*" flags are negated when passed on (fp16=not no_fp16,
# gradient_checkpointing=not no_gradient_checkpointing); bf16 is passed as-is.
no_fp16=True
bf16=True
no_gradient_checkpointing=True
seed=42
# Forwarded as num_proc to load_dataset when not streaming.
num_workers=None
# log_freq becomes logging_steps; eval_freq and save_freq are only referenced by
# commented-out TrainingArguments lines in the training cell.
log_freq=100
eval_freq=100 # was 1000
save_freq=100 # was 1000

In [9]:
# Seed every RNG first so the split and shuffling below are repeatable.
set_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model_path)

print(f'Dataset name: {dataset_name}')
dataset = load_dataset(
    dataset_name,
    data_dir=subset,
    split=split,
    # Multi-process loading is only requested on the non-streaming path.
    num_proc=None if streaming else num_workers,
    streaming=streaming,
)
print(dataset)

if not streaming:
    # Materialised dataset: hold out 0.5% of the rows for validation.
    dataset = dataset.train_test_split(test_size=0.005, seed=seed)
    train_data, valid_data = dataset["train"], dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
else:
    # Streaming dataset: the first rows are validation, the rest is shuffled training data.
    print("Loading the dataset in streaming mode")
    valid_data = dataset.take(size_valid_set)
    train_data = dataset.skip(size_valid_set).shuffle(buffer_size=shuffle_buffer, seed=seed)

# Estimate characters per token so ConstantLengthDataset can size its text buffer.
chars_per_token = chars_token_ratio(train_data, tokenizer)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

train_dataset = ConstantLengthDataset(
    tokenizer, train_data, infinite=True, seq_length=seq_length, chars_per_token=chars_per_token
)
validation_dataset = ConstantLengthDataset(
    tokenizer, valid_data, infinite=False, seq_length=seq_length, chars_per_token=chars_per_token
)

print(train_dataset)
print(validation_dataset)

Dataset name: lvwerra/stack-exchange-paired


Using custom data configuration lvwerra--stack-exchange-paired-4e8354da9bd1bd3d
Found cached dataset parquet (/root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-54456b7e473dd794.arrow and /root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8304e884db3f8548.arrow


Dataset({
    features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
    num_rows: 7440923
})
Size of the train set: 7403718. Size of the validation set: 37205


  0%|          | 0/400 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:09<00:00, 41.55it/s]

The character to token ratio of the dataset is: 3.08
<__main__.ConstantLengthDataset object at 0x7fb1e9423090>
<__main__.ConstantLengthDataset object at 0x7fb07cb15bd0>





In [10]:
print("Loading the model")
# Load the base checkpoint with its default settings; no `use_cache` or
# `torch_dtype` override is applied here.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

print_trainable_parameters(model)

Loading the model
trainable params: 247577856 || all params: 247577856 || trainable%: 100.0


In [11]:
fine_tuned_with_public_qanda_checkpoint = './fine_tuned_with_public_qanda'

training_args = TrainingArguments(
    # Where checkpoints and logs go, and how long to train.
    output_dir=fine_tuned_with_public_qanda_checkpoint,
    max_steps=max_steps,
    logging_steps=log_freq,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    dataloader_drop_last=True,
    # Optimiser and schedule.
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=num_warmup_steps,
    weight_decay=weight_decay,
    # Precision and memory; the "no_*" config flags are inverted here.
    fp16=not no_fp16,
    bf16=bf16,
    gradient_checkpointing=not no_gradient_checkpointing,
)

# No eval_dataset is passed, so no evaluation runs during training.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

print("Training...")
trainer.train()

Training...




Step,Training Loss
1,1.7927
2,2.115
3,1.7796
4,1.9029
5,1.6711
6,1.5
7,2.5108
8,2.0076
9,1.9578
10,1.3653


TrainOutput(global_step=500, training_loss=0.32707486420869825, metrics={'train_runtime': 294.443, 'train_samples_per_second': 3.396, 'train_steps_per_second': 1.698, 'total_flos': 1369514704896000.0, 'train_loss': 0.32707486420869825, 'epoch': 1.0})

In [12]:
# Write the trained model and the tokenizer into the same directory; the later
# cell reloads both from this path with `from_pretrained`.
print("Saving last checkpoint of the model")
trainer.model.save_pretrained(fine_tuned_with_public_qanda_checkpoint)
tokenizer.save_pretrained(fine_tuned_with_public_qanda_checkpoint)

Saving last checkpoint of the model


('./fine_tuned_with_public_qanda/tokenizer_config.json',
 './fine_tuned_with_public_qanda/special_tokens_map.json',
 './fine_tuned_with_public_qanda/tokenizer.json')

In [13]:
# Persist the checkpoint path with IPython's %store so another kernel can restore it via `%store -r`.
%store fine_tuned_with_public_qanda_checkpoint

Stored 'fine_tuned_with_public_qanda_checkpoint' (str)


In [16]:
# Reload the fine-tuned model and its tokenizer from the directory saved above,
# independently of the `trainer` object.
print(fine_tuned_with_public_qanda_checkpoint)

fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_with_public_qanda_checkpoint)
fine_tuned_model_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_with_public_qanda_checkpoint)

./fine_tuned_with_public_qanda


In [20]:
# Ask the fine-tuned model one ad-hoc question using the training prompt template.
question = 'Who won the 2022 World Cup?'
prompt = get_prompt_from_question(question)

inputs = fine_tuned_model_tokenizer(prompt, return_tensors='pt')
generated_ids = fine_tuned_model.generate(input_ids=inputs["input_ids"], max_new_tokens=200)
# generate() returns a batch; decode its single sequence.
response = fine_tuned_model_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f'INPUT PROMPT:\n{prompt}{response}\n')

INPUT PROMPT:
Question: Who won the 2022 World Cup?

Answer: switzerland



In [21]:
# Load the evaluation subset; this rebinds `dataset`, replacing the fine-tuning split.
print(f'Dataset name: {dataset_name}')
dataset = load_dataset(dataset_name, data_dir='data/evaluation', split='train')

print(dataset)

Dataset name: lvwerra/stack-exchange-paired


Using custom data configuration lvwerra--stack-exchange-paired-b55353e62cc7a74b
Found cached dataset parquet (/root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-b55353e62cc7a74b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
    num_rows: 4483004
})


In [22]:
# Take the first 10 evaluation examples and add two derived columns:
# 'prompt' (question in the prompt template, no answer) and 'response' (the `response_j` answer).
prompts_and_responses = dataset.select(range(10)).map(
    lambda example: {
        'prompt': get_prompt_without_response(example),
        'response': get_response_without_prompt(example),
    }
)

# A notebook cell only displays its last expression, so a bare
# `prompts_and_responses[0]['question']` line would be silently discarded.
# Print the question explicitly and leave the answer as the cell's output.
first_example = prompts_and_responses[0]
print(first_example['question'])
first_example['response_j']

Loading cached processed dataset at /root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-b55353e62cc7a74b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ff5d9e4fd03dc106.arrow


"> \n> Making my Java program easily distributable\n> \n> \n> \n\nIf you mean 'easy for the end user' look to [Java Web Start](https://stackoverflow.com/tags/java-web-start/info).\n\n---\n\nA passer-by asks:\n\n> \n> Can you package the dll dependencies with Web Start? \n> \n> \n> \n\nYes, but much, much better. You can package the natives for each platform in separate Jars, and supply them only to the platform that uses that native, even so far as partitioning the download between 32 & 64 bit versions of the natives.\n\nJWS puts the natives on the run-time class-path of the application, ready for loading in code.\n\nThis all happens automatically for the end user, they click a link, approve the trust dialog(s) when asked, and the application installs - possibly with desktop integration, and appears on screen like magic.\n\nJWS apps. that use natives need to be distributed as `all-permissions` security level, because the JVM cannot guarantee the actions of anything that 'goes native'."

In [23]:
# Sample one response per evaluation prompt and print it next to the human answer.
for prompt_response in prompts_and_responses:
    prompt = prompt_response['prompt']
    inputs = fine_tuned_model_tokenizer(prompt, return_tensors='pt')

    # Top-k / nucleus sampling instead of greedy decoding.
    sampled_ids = fine_tuned_model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=200,
        do_sample=True,
        top_k=50,
        top_p=0.9,
    )
    response = fine_tuned_model_tokenizer.decode(sampled_ids[0], skip_special_tokens=True)

    print(f'PROMPT: {prompt}')
    print(f'RESPONSE: {response}')
    print(f"EXPECTED RESPONSE: {prompt_response['response']}")
    print('----')

PROMPT: Question: I have installed the Java 3D API on PC via the exe installer, which simply created a new directory with `j3dcore.jar`, `vecmath.jar`, `j3dutils.jar` in a lib sub-directory and `j3dcore-ogl.dll` in a bin sub-directory.

Netbeans had no issues and my code compiled and executed smoothly, however once I built my project and tried to run it from the command prompt I got an `UnsatisfiedLinkError` saying that `no j3dcore-ogl in java.library.path`. 

Google came to the rescue and gave me 3 viable solutions:

* by copying the dll file into my JRE's bin directory
* by adding the path of the dll file to the library path (`java -Djava.library.path=dllpath`)
* load the dll in the program with `System.load()` (I couldn't get this one to work, actually)

My question is: Is there an elegant solution to this problem, that I missed? 

It seems tedious that for each different PC someone would like to use this program on, he'd have to either copy the dll or add it to the library path bef

Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


PROMPT: Question: I need to take some online tests for school.
This website tells me I need Flash Player 11.3.0 or higher. As far as I can see that is not yet avaible for Linux.
I use Ubuntu 12.04 LTS and Chromium. Is there a way I can work around it?

Greetz. Rob.

Answer: 
RESPONSE: Linux
EXPECTED RESPONSE: The best way to get Flash Player 11.2+ is to use Google Chrome in Ubuntu. There is no other way to get it, because a higher version has not been released for Ubuntu.

[Download Google Chrome From Here](https://www.google.com/intl/en/chrome/browser/)

Select your OS version x86 or x64 and download it to any path.

Then you can open it with the Ubuntu Software Center to install.

You can also install by executing command:

```
sudo dpkg -i <googlechromefile.deb>

```

Hope it helps you somewhat!!
----
PROMPT: Question: we got homework to make convertor of weights where the fields are updated while typing the number (no need to click "calculate" or anything). one of the students offe

# Quantitative Results with ROUGE Metric

The [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) helps quantify the validity of text produced by models. It was designed for summarization: it compares model output to a "baseline" reference which is usually created by a human. The variable names below keep the word "summaries", but here the references are the human-written answers from the dataset. While not perfect, it does give an indication of the overall increase in effectiveness that we have accomplished by fine-tuning.

# ROUGE evaluation of summaries

In [24]:
import evaluate

# Load the ROUGE metric through the `evaluate` library; `rouge.compute` is used in the cells below.
rouge = evaluate.load('rouge')

In [25]:
# Pull the two derived columns out of the mapped dataset: the prompts fed to the
# models, and the human-written `response_j` answers used as ROUGE references.
prompts = prompts_and_responses['prompt']
human_baseline_summaries = prompts_and_responses['response']
# Last expression: displayed as the cell output.
human_baseline_summaries

["> \n> Making my Java program easily distributable\n> \n> \n> \n\nIf you mean 'easy for the end user' look to [Java Web Start](https://stackoverflow.com/tags/java-web-start/info).\n\n---\n\nA passer-by asks:\n\n> \n> Can you package the dll dependencies with Web Start? \n> \n> \n> \n\nYes, but much, much better. You can package the natives for each platform in separate Jars, and supply them only to the platform that uses that native, even so far as partitioning the download between 32 & 64 bit versions of the natives.\n\nJWS puts the natives on the run-time class-path of the application, ready for loading in code.\n\nThis all happens automatically for the end user, they click a link, approve the trust dialog(s) when asked, and the application installs - possibly with desktop integration, and appears on screen like magic.\n\nJWS apps. that use natives need to be distributed as `all-permissions` security level, because the JVM cannot guarantee the actions of anything that 'goes native'.

In [26]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Baseline 1: the `t5-base` checkpoint.
pretrained_model_checkpoint='t5-base'
pretrained_model_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_checkpoint, use_fast=True)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_checkpoint)

# Baseline 2: `google/flan-t5-base`, the same checkpoint name as `model_path`,
# loaded fresh here so it carries none of the fine-tuning.
instruct_model_checkpoint='google/flan-t5-base'
instruct_model_tokenizer = AutoTokenizer.from_pretrained(instruct_model_checkpoint, use_fast=True)
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(instruct_model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [29]:
from transformers import GenerationConfig

# Build the generation settings once instead of on every generate() call.
summary_generation_config = GenerationConfig(max_new_tokens=200)


def generate_summaries(model, tokenizer, prompts, generation_config=summary_generation_config):
    """Generate one decoded output per prompt with the given model.

    Each prompt is encoded and decoded with the tokenizer that belongs to the
    model being evaluated, rather than sharing ids produced by another
    model's tokenizer.

    Args:
        model: Model exposing ``generate(input_ids=..., generation_config=...)``.
        tokenizer: Tokenizer paired with ``model``.
        prompts: Iterable of prompt strings.
        generation_config: Settings forwarded to ``generate``.

    Returns:
        list[str]: Decoded outputs, in the same order as ``prompts``.
    """
    summaries = []
    for prompt in prompts:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output_ids = model.generate(input_ids=input_ids, generation_config=generation_config)
        # generate() returns a batch; decode its single sequence.
        summaries.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
    return summaries


pretrained_model_summaries = generate_summaries(pretrained_model, pretrained_model_tokenizer, prompts)
instruct_model_summaries = generate_summaries(instruct_model, instruct_model_tokenizer, prompts)
fine_tuned_model_summaries = generate_summaries(fine_tuned_model, fine_tuned_model_tokenizer, prompts)

Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


In [30]:
# Score the `t5-base` outputs against the human answers (aggregated ROUGE with stemming).
pretrained_model_results = rouge.compute(
    predictions=pretrained_model_summaries,
    # Trim the references to the number of predictions actually produced.
    references=human_baseline_summaries[:len(pretrained_model_summaries)],
    use_stemmer=True,
    use_aggregator=True,
)
pretrained_model_results

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

In [32]:
# Score the `google/flan-t5-base` outputs against the human answers (aggregated ROUGE with stemming).
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    # Trim the references to the number of predictions actually produced.
    references=human_baseline_summaries[:len(instruct_model_summaries)],
    use_stemmer=True,
    use_aggregator=True,
)
instruct_model_results

{'rouge1': 0.12426905673512008,
 'rouge2': 0.0214021164021164,
 'rougeL': 0.0650203601196157,
 'rougeLsum': 0.0997737556561086}

In [33]:
# Score the fine-tuned model's outputs against the human answers (aggregated ROUGE with stemming).
fine_tuned_model_results = rouge.compute(
    predictions=fine_tuned_model_summaries,
    # Trim the references to the number of predictions actually produced.
    references=human_baseline_summaries[:len(fine_tuned_model_summaries)],
    use_stemmer=True,
    use_aggregator=True,
)
fine_tuned_model_results

{'rouge1': 0.21335099346295278,
 'rouge2': 0.0614604952645688,
 'rougeL': 0.13508294119632722,
 'rougeLsum': 0.1730430279883701}