# Final Project of the NLP 2024 Course

Slides: https://docs.google.com/presentation/d/1NbH4E2HKVHQlaW_ivKCyjpWuEJFvmz3bSKsX8fs67tA/edit#slide=id.g2d17364e0e4_0_34


## Environment Setup

Get your own Hugging Face access token at
https://huggingface.co/settings/tokens

Then add it to Colab as a secret named `HF_TOKEN`.

In [None]:
# Mount Google Drive at /content/drive; later cells read the custom dataset
# from, and build the checkpoint path under, /content/drive/MyDrive/NLPproject.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the fine-tuning stack with the install output captured.
# NOTE(review): `peft` is the only package on this line without a version pin.
!pip install accelerate==0.29.3 peft bitsandbytes==0.43.1 transformers==4.40.1 trl==0.8.5

In [None]:
import os
import torch
import logging
from google.colab import userdata

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

## Parameter

In [None]:
# Run with CUDA
if torch.cuda.is_available() and torch.cuda.device_count():
  device = "cuda:0"
  # Fill the %s placeholder with the GPU name (it used to be printed literally).
  print('inference device is not set, using cuda:0, %s' % torch.cuda.get_device_name(0))
else:
  device = 'cpu'
  # raise ValueError('CUDA CUDA CUDA CUDA CUDA')

# Hugging Face token
# The setup notes ask for a Colab secret named HF_TOKEN, while this cell
# originally read HUGF_TOKEN. Accept either name so both setups work, and fail
# with a clear message when neither secret is available.
hf_access_token = None
for _secret_name in ('HUGF_TOKEN', 'HF_TOKEN'):
  try:
    hf_access_token = userdata.get(_secret_name)
    break
  except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    continue
if hf_access_token is None:
  raise RuntimeError(
      'No Hugging Face token found: add a Colab secret named HF_TOKEN '
      '(or HUGF_TOKEN) and grant this notebook access to it.'
  )

# Tokenizer from Hugging Face hub
# NOTE: the model-loading cell rebinds `tokenizer` from this hub id string to
# the loaded tokenizer object.
tokenizer = 'meta-llama/Llama-2-7b-hf'

# Model from Hugging Face hub
# base_model = "apple/OpenELM-450M-Instruct"
base_model = "apple/OpenELM-1_1B-Instruct"
# base_model = "apple/OpenELM-3B-Instruct"

# Fine-tuned model (used as the checkpoint directory name when saving/loading)
new_model = "OpenELM-666"

inference device is not set, using cuda:0, %s


## Dataset
Create Your Dataset

In [None]:
import json
import random
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


def create_prompt(row):
    """Render one record as the Instruction/Context/Response training text.

    `row` is indexed with the keys 'instruction', 'context' and 'response';
    the three labelled sections are joined with single newlines.
    """
    sections = (
        f"Instruction: {row['instruction']}",
        f"Context: {row['context']}",
        f"Response: {row['response']}",
    )
    return "\n".join(sections)

def format_instruction(data_path):
  """Load a JSONL file and return (train, test) `Dataset`s with a `text` column.

  Each non-blank line of `data_path` is parsed as one JSON record. The records
  are split 80/20 with a fixed random_state of 42, and `create_prompt` is
  applied row-wise to build the `text` column, so every record needs the keys
  that `create_prompt` reads. The first example and the size of each split are
  printed as a quick sanity check.
  """
  # Explicit UTF-8 so parsing does not depend on the platform default encoding.
  with open(data_path, 'r', encoding='utf-8') as f:
      # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
      data = [json.loads(line) for line in f if line.strip()]

  df = pd.DataFrame(data)

  train, test = train_test_split(df, test_size=0.2, random_state = 42)
  # Work on explicit copies so adding the `text` column never writes through
  # to (or warns about) the frame the splits were taken from.
  train = train.copy()
  test = test.copy()

  train['text'] = train.apply(create_prompt, axis=1)
  train_data = Dataset.from_pandas(train)
  print(train_data[0])
  print(len(train_data))

  test['text'] = test.apply(create_prompt, axis=1)
  test_data = Dataset.from_pandas(test)
  print(test_data[0])
  print(len(test_data))

  return train_data, test_data

### databricks-dolly-15k
# Download the Dolly JSONL; `-nc` (no-clobber) skips the download when the file
# already exists. NOTE(review): assumes the working directory is /content.
!wget -nc https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
data_path = "/content/databricks-dolly-15k.jsonl"

# Split the Dolly records and build their `text` prompts.
dolly_train_data, dolly_test_data = format_instruction(data_path)

####################################################################################
## custom
# Custom JSONL stored on the mounted Drive. `data_path` is reused here, so from
# this point on it refers to the custom file rather than the Dolly file.

data_path = "/content/drive/MyDrive/NLPproject/data/"
data_path = data_path + "abstract.jsonl"

custom_train_data, custom_test_data = format_instruction(data_path)

--2024-06-11 13:26:28--  https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
Resolving huggingface.co (huggingface.co)... 54.192.18.113, 54.192.18.15, 54.192.18.37, ...
Connecting to huggingface.co (huggingface.co)|54.192.18.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/34/ac/34ac588cc580830664f592597bb6d19d61639eca33dc2d6bb0b6d833f7bfd552/2df9083338b4abd6bceb5635764dab5d833b393b55759dffb0959b6fcbf794ec?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27databricks-dolly-15k.jsonl%3B+filename%3D%22databricks-dolly-15k.jsonl%22%3B&Expires=1718371589&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxODM3MTU4OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8zNC9hYy8zNGFjNTg4Y2M1ODA4MzA2NjRmNTkyNTk3YmI2ZDE5ZDYxNjM5ZWNhMzNkYzJkNmJiMGI2ZDgzM2Y3YmZkNTUyLzJkZjkwODMzMzhiNGFiZDZiY2ViNTYzNTc2NGRhYjVkOD

In [None]:
## Quantization
# Settings for loading the base model in 4-bit: NF4 quantization type,
# float16 compute dtype, double quantization switched off.
compute_dtype = torch.float16

quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": False,
}
quant_config = BitsAndBytesConfig(**quant_settings)

In [None]:
### Load model
## with Quantization
# Load the base model with the 4-bit quantization config onto `device`.
# trust_remote_code is enabled because the model repo ships its own modeling code.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map=device,
    trust_remote_code=True
    )

## without Quantization
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     device_map=device,
#     trust_remote_code=True
#     )

# Disable the generation KV cache on the config for training.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` is set unconditionally; confirm the OpenELM
# config actually reads it.
model.config.pretraining_tp = 1

## Load tokenizer
# `tokenizer` starts as the hub id string from the parameter cell and is
# rebound to the tokenizer object below. Resolve the id first so re-running
# this cell does not hand a tokenizer object to from_pretrained.
tokenizer_id = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_id,
    trust_remote_code=True,
    token=hf_access_token
    )
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

configuration_openelm.py:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-1_1B-Instruct:
- configuration_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_openelm.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-1_1B-Instruct:
- modeling_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## PEFT parameters

In [None]:
## LoRA
# Adapter hyperparameters: rank 16, alpha 16, dropout 0.1, no bias training,
# applied to all linear layers of a causal-LM task.
lora_settings = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": 'all-linear',
}
peft_params = LoraConfig(**lora_settings)

## Training databricks/databricks-dolly-15k


In [None]:
## databricks/databricks-dolly-15k
# Stage 1: one epoch of supervised fine-tuning on the Dolly training split.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    label_names = ["labels"],
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=500,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # NOTE(review): negative value; presumably leaves `num_train_epochs` in
    # control (the logged run finished at epoch 1.0) -- confirm.
    max_steps=-500,
    # NOTE(review): verify that `warmup_ratio` has any effect together with
    # lr_scheduler_type="constant".
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# No eval dataset is passed for this stage (the eval lines are commented out).
# Examples are read from the `text` column and packed (packing=True) with
# max_seq_length=512.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_params,
    max_seq_length=512,
    tokenizer=tokenizer,
    train_dataset = dolly_train_data,
    # eval_dataset = dolly_test_data,
    dataset_text_field="text",
    args=training_params,
    packing=True,
)
trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Step,Training Loss
500,2.1971




TrainOutput(global_step=624, training_loss=2.1467053095499673, metrics={'train_runtime': 4090.901, 'train_samples_per_second': 1.22, 'train_steps_per_second': 0.153, 'total_flos': 1.572706230730752e+16, 'train_loss': 2.1467053095499673, 'epoch': 1.0})

## Training custom data


In [None]:
## Custom dataset
# Stage 2: five epochs on the custom abstract dataset, reusing `model` and
# `peft_params` from the cells above.
# NOTE(review): `output_dir` is the same "./results" used by the Dolly stage,
# so checkpoints from both runs land in one directory.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    label_names = ["labels"],
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # NOTE(review): negative value; presumably leaves `num_train_epochs` in
    # control (the logged run finished at epoch 5.0) -- confirm.
    max_steps=-500,
    # NOTE(review): verify that `warmup_ratio` has any effect together with
    # lr_scheduler_type="constant".
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Unlike the Dolly stage, examples are not packed and max_seq_length is 1024.
# NOTE(review): an eval dataset is passed, but no evaluation schedule is set in
# `training_params` above -- confirm evaluation actually runs.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_params,
    max_seq_length=1024,
    tokenizer=tokenizer,
    train_dataset=custom_train_data,
    eval_dataset=custom_test_data,
    dataset_text_field="text",
    args=training_params,
    packing=False,
)
trainer.train()

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss
50,1.775
100,1.4554
150,1.2309
200,0.9971




TrainOutput(global_step=200, training_loss=1.3645867919921875, metrics={'train_runtime': 321.628, 'train_samples_per_second': 1.244, 'train_steps_per_second': 0.622, 'total_flos': 1091749460865024.0, 'train_loss': 1.3645867919921875, 'epoch': 5.0})

## tensorboard

In [None]:
# from tensorboard import notebook
# log_dir = "results/runs"
# notebook.start("--logdir {} --port 8787".format(log_dir))

## TEST

In [None]:
def prompt(question, context, debug=False, max_new_tokens=512):
    """Generate a completion from the in-memory `model` for one instruction.

    Builds the same "Instruction / Context / Response:" text layout used by
    `create_prompt` (with the response left open), generates up to
    `max_new_tokens` new tokens and returns the decoded sequence. The returned
    string contains the prompt followed by the generated text.

    `debug` is accepted for interface compatibility and is currently unused.
    """
    input_text = f"Instruction: {question}\nContext: {context}\nResponse:"
    # Tokenize and move the tensors to the model's device: the model is loaded
    # onto `device`, while freshly encoded tensors live on the CPU.
    encoded = tokenizer(input_text, return_tensors='pt').to(model.device)

    output_tokens = model.generate(
        input_ids=encoded["input_ids"],
        # Pass the mask explicitly: pad and EOS share one token id here, so it
        # cannot be inferred reliably from the input ids.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,  # Ensure we only generate the required amount of text
        pad_token_id=tokenizer.pad_token_id,
    )

    response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return response


In [None]:
# Smoke test: run one sample abstract through the freshly trained model.
context = """The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.
We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.
Furthermore, we apply our model to prune the self-labeled training data.
Experimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data."""

# Instruction text placed in the "Instruction:" slot of the prompt.
question = "From the following abstract, extract the sentences that shows the methods of the research. Only the sentences from the abstract, no other information.\n\n\n"

predicted = prompt(question, context)

# `predicted` holds the decoded sequence, i.e. the prompt plus the generation.
print(predicted)



Instruction: From the following abstract, extract the sentences that shows the methods of the research. Only the sentences from the abstract, no other information.



Context: The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.
We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.
Furthermore, we apply our model to prune the self-labeled training data.
Experimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data.
Response: We analyze the ambiguity of hashtag usages and propose a novel neu

## Save model and tokenizer



In [None]:
# Directory on the mounted Drive for the fine-tuned model and tokenizer.
model_path = "/content/drive/MyDrive/NLPproject/checkpoint/" + new_model
# Saving is currently disabled. NOTE(review): the following cell loads from
# `model_path`, so these two calls must have been run at least once before.
# trainer.model.save_pretrained(model_path)
# trainer.tokenizer.save_pretrained(model_path)

## Run locally: reload the saved checkpoint for inference

In [None]:
# Reload the fine-tuned model from `model_path` onto `device`.
model_local = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    trust_remote_code=True,
)

# Reload the matching tokenizer from the same directory (slow tokenizer).
tokenizer_local = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

configuration_openelm.py:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-1_1B-Instruct:
- configuration_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_openelm.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-1_1B-Instruct:
- modeling_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
def prompt_local(question, context, debug=False, max_new_tokens=512):
    """Generate a completion with the reloaded `model_local` / `tokenizer_local`.

    The prompt is laid out as "Instruction / Context / Response:"; the decoded
    return value contains that prompt followed by the generated text.
    `debug` is accepted but not used.
    """
    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    prompt_text = f"Instruction: {question}\nContext: {context}\nResponse:"
    prompt_ids = tokenizer_local.encode(prompt_text, return_tensors='pt')
    prompt_ids = prompt_ids.to(device)

    # Keep the model on the same device as the encoded prompt.
    model_local.to(device)
    generated = model_local.generate(
        prompt_ids,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer_local.pad_token_id,
    )

    return tokenizer_local.decode(generated[0], skip_special_tokens=True)

def extract_first_response(generated_text):
    """Return the text between the first and second "Response:" markers.

    - If `generated_text` contains no "Response:" marker it is returned as is.
    - Otherwise the text after the first marker, up to the next marker (or the
      end of the string), is returned with surrounding whitespace stripped.
    - Non-string input is returned unchanged (best effort, never raises).
    """
    marker = "Response:"
    # Explicit type guard instead of a bare `except:` that hid every error.
    if not isinstance(generated_text, str):
        return generated_text

    _, found, tail = generated_text.partition(marker)
    if not found:
        return generated_text

    # Cut at the next marker, if the model started another response.
    first_response, _, _ = tail.partition(marker)
    return first_response.strip()

# Sample abstract for a quick local check. Defined here so this cell also works
# in a fresh local session where the training/test cells were not run.
context = """The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.
We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.
Furthermore, we apply our model to prune the self-labeled training data.
Experimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data."""

# Instruction used by `extract_sentence` during evaluation. The trailing
# newlines were previously typed as "/n/n/n" (forward slashes), which put
# literal slashes into every prompt instead of blank lines.
question = "From the following abstract, extract the sentences that shows the methods of the research. Only the sentences from the abstract, no other data.\n\n\n"

# Later cells print and score `response_text`, so compute it here rather than
# relying on a value left over from an earlier session.
predicted = prompt_local(question, context)
response_text = extract_first_response(predicted)


In [None]:
# Show the extracted response; `response_text` must already have been assigned
# by the local-inference cell above.
print(response_text)

We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.


## Evaluation

We will evaluate your module with a closed test set.
The sentence returned by your function will be compared with a gold reference.
The evaluation metric is `ROUGE-L`, which measures the overlap ratio between a predicted output and a reference. The details will be introduced in class.

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=a664c2f3688d32b86a88e092af40778c94d450e224b70513b03bd9733f60391b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer
# Scorer restricted to the ROUGE-L metric.
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [None]:
# Reference answer for the sample abstract.
reference = """We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection. Furthermore, we apply our model to prune the self-labeled training data."""

# ROUGE-L F-measure of the extracted response against the reference.
print(scorer.score(reference, response_text)['rougeL'].fmeasure)

0.8723404255319149


In [None]:
def evaluate(foo):
    """Score `foo` on the course test set and return the mean ROUGE-L F-measure.

    The test inputs and gold references are fetched line by line from the two
    course URLs. Each input line is decoded as UTF-8 and passed to `foo`
    unchanged (including its trailing newline); the returned string is scored
    against the matching gold line. Per-case scores and the overall mean are
    printed.

    Args:
        foo: callable taking one input string and returning the predicted string.

    Returns:
        The mean F-measure over all test cases, or 0.0 when no case was read.
    """
    import urllib.request
    from rouge_score import rouge_scorer

    test_url = "https://www.cs.nccu.edu.tw/~hhhuang/courses/nlp2024/test2024.in"
    gold_url = "https://www.cs.nccu.edu.tw/~hhhuang/courses/nlp2024/test2024.gold"

    scorer = rouge_scorer.RougeScorer(['rougeL'])

    total = 0
    cnt = 0
    # Distinct names for the URLs and the streams, and no shadowing of the
    # builtin `input`.
    with urllib.request.urlopen(test_url) as test_stream, \
         urllib.request.urlopen(gold_url) as gold_stream:
        # zip stops at the shorter stream, so surplus lines in either file are ignored.
        for raw_input, raw_ref in zip(test_stream, gold_stream):
            text = raw_input.decode("utf-8")
            ref = raw_ref.decode("utf-8")
            output = foo(text)
            score = scorer.score(ref, output)['rougeL'].fmeasure
            cnt += 1
            total += score
            print("Test case %d: %g" % (cnt, score))

    # Guard against an empty test set instead of dividing by zero.
    if cnt == 0:
        print("Overall: no test cases found")
        return 0.0

    overall = total / cnt
    print("Overall: %g" % overall)
    return overall

# `extract_sentence` is the function under evaluation; pass it to `evaluate`.
def extract_sentence(context):
  """Run the local model on `context` and return the extracted response text.

  Uses the module-level `question` as the instruction.
  """
  return extract_first_response(prompt_local(question, context))

# Run the evaluation: prints one score per test case and the overall mean,
# which is also the returned value.
evaluate(extract_sentence)

Test case 1: 0.820961
Test case 2: 0.807018
Test case 3: 0.694444
Test case 4: 0.802548
Test case 5: 0.598425
Test case 6: 0.530612
Test case 7: 0.576
Test case 8: 0.171875
Test case 9: 0.137931
Test case 10: 0.918728
Test case 11: 0.904762
Test case 12: 0.159827
Test case 13: 0.157895
Test case 14: 0.757576
Test case 15: 0.745223
Test case 16: 0.248
Test case 17: 0.716667
Test case 18: 0.861635
Test case 19: 0.86631
Test case 20: 0.53211
Test case 21: 0.792453
Test case 22: 0.4625
Test case 23: 0.649123
Test case 24: 0.299401
Test case 25: 0.409836
Test case 26: 0.921811
Test case 27: 0.773723
Test case 28: 0.852459
Test case 29: 0.675439
Test case 30: 0.744792
Test case 31: 0.724359
Test case 32: 0.354839
Test case 33: 0.629442
Test case 34: 0.639535
Test case 35: 0.776316
Test case 36: 0.658537
Test case 37: 0.10929
Test case 38: 0.568807
Test case 39: 0.12766
Test case 40: 0.784314
Test case 41: 0.121212
Test case 42: 0.752809
Test case 43: 0.691525
Test case 44: 0.859107
Test case

0.5868431420219477