## This Notebook was presented at NLP Lecture for MGT 6785 (Fall 2023)

### Author: Agam Shah

### Installation and basic imports

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate

import numpy as np
import pandas as pd
import os

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.1 MB/s[0m eta [36m0:00:0

# Fine-Tune the model

## Data import from HuggingFace

In [None]:
from datasets import load_dataset

# Map each split name to the CSV file that provides it.
data_files = dict(train="train.csv", test="test.csv")

# Load the FOMC example dataset from the Hugging Face Hub.
dataset = load_dataset(
    path="gtfintechlab/fomc-example-dataset",
    data_files=data_files,
)
print(dataset)

Downloading readme:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/423k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'sentence', 'year', 'label', 'orig_index'],
        num_rows: 1984
    })
    test: Dataset({
        features: ['index', 'sentence', 'year', 'label', 'orig_index'],
        num_rows: 496
    })
})


# Fine-Tune RoBERTa model

## Data processing and tokenization

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def tokenize_data(example):
    """Tokenize the ``sentence`` column of a batch of examples.

    Args:
        example: a batch from the dataset; only ``example['sentence']`` is read.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated so that no sequence exceeds it.
    """
    # truncation=True guards against sentences longer than the padded length.
    return tokenizer(example['sentence'], padding='max_length', truncation=True)

# Columns that are not needed once the text has been tokenized.
remove_columns = ['index', 'sentence', 'year', 'orig_index']

# Tokenize and drop the unused columns in a single pass over each split.
dataset = dataset.map(tokenize_data, batched=True, remove_columns=remove_columns)

print(dataset)

train_dataset = dataset['train']
# Evaluate on the held-out test split, not on the data the model is trained on.
eval_dataset = dataset['test']

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1984
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 496
    })
})


## Train the model

### Set training arguments

In [None]:
from transformers import TrainingArguments

# The local output directory and the Hub model id use the same identifier.
hub_repo = "shahagam4/trial-model"

training_args = TrainingArguments(
    output_dir=hub_repo,
    hub_model_id=hub_repo,
    push_to_hub=False,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=1e-6,
)

### Load Pre-trained Language Model (PLM)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained RoBERTa checkpoint with a three-label
# sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load and create function to compute metric

In [None]:
# NOTE: `datasets.load_metric` is deprecated and has been removed from recent
# `datasets` releases, so the weighted F1 score is computed directly with numpy.
def compute_metrics(eval_pred):
    """Compute the support-weighted F1 score of the model predictions.

    Args:
        eval_pred: a ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` along the last axis.

    Returns:
        A dict with the single key ``"f1"``: the per-class F1 scores averaged
        with weights proportional to the number of true examples of each
        class. Returns ``{"f1": 0.0}`` when there are no labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    if labels.size == 0:
        return {"f1": 0.0}

    # Only classes present in the references carry weight in the average.
    classes, support = np.unique(labels, return_counts=True)
    weighted_sum = 0.0
    for cls, count in zip(classes, support):
        true_positives = np.sum((predictions == cls) & (labels == cls))
        predicted_count = np.sum(predictions == cls)
        # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (#predicted + #actual);
        # the denominator is non-zero because count >= 1.
        class_f1 = 2.0 * true_positives / (predicted_count + count)
        weighted_sum += count * class_f1

    return {"f1": float(weighted_sum / labels.size)}



  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

### Create trainer object

In [None]:
from transformers import Trainer

# Use only the first of ten shards of each dataset
# (presumably to keep the demo short; confirm before a full run).
train_subset = train_dataset.shard(num_shards=10, index=0)
eval_subset = eval_dataset.shard(num_shards=10, index=0)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train (Fine-tune) the model

In [None]:
# Run fine-tuning with the trainer configured above; the returned training
# summary is shown as the cell output.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=50, training_loss=1.1059552764892577, metrics={'train_runtime': 22.7019, 'train_samples_per_second': 8.766, 'train_steps_per_second': 2.202, 'total_flos': 52359570127872.0, 'train_loss': 1.1059552764892577, 'epoch': 1.0})

### Evaluate the model

In [None]:
# Evaluate the fine-tuned model on the trainer's evaluation dataset and print
# the resulting metrics dictionary.
evaluate_output = trainer.evaluate()
print(evaluate_output)

{'eval_loss': 1.1078120470046997, 'eval_f1': 0.17301738730670807, 'eval_runtime': 5.6878, 'eval_samples_per_second': 34.987, 'eval_steps_per_second': 4.395, 'epoch': 1.0}


# Deploy model on HuggingFace

### Login to HuggingFace

In [None]:
from huggingface_hub import login, logout

# Start the Hugging Face login prompt. `logout` is imported here and called in
# the last cell of the notebook.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Push tokenizer and trained model

In [None]:
# Push the tokenizer to the Hub repository, then push the trained model through
# the trainer.
tokenizer.push_to_hub("shahagam4/trial-model")
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/shahagam4/trial-model/tree/main/'

### Modify and push additional files

In [None]:
import json
from pathlib import Path

from huggingface_hub import HfApi

# Locate the tokenizer config inside the trainer's output directory rather than
# hard-coding a Colab-specific absolute path.
tokenizer_config_path = Path(training_args.output_dir) / "tokenizer_config.json"

# Open the tokenizer config file.
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Make the necessary changes to the config file.
config["name_or_path"] = "roberta-base"

with open(tokenizer_config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

# Upload the modified file to the same Hub repository the trainer pushes to.
api = HfApi()
api.upload_file(
    path_or_fileobj=str(tokenizer_config_path),
    path_in_repo="tokenizer_config.json",
    repo_id=training_args.hub_model_id,
    repo_type="model",
)

'https://huggingface.co/shahagam4/trial-model/blob/main/tokenizer_config.json'

# Use already deployed model on HuggingFace

In [None]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# Load the tokenizer, model and config of the already fine-tuned FOMC model.
tokenizer = AutoTokenizer.from_pretrained("gtfintechlab/FOMC-RoBERTa", do_lower_case=True, do_basic_tokenize=True)

model = AutoModelForSequenceClassification.from_pretrained("gtfintechlab/FOMC-RoBERTa", num_labels=3)

config = AutoConfig.from_pretrained("gtfintechlab/FOMC-RoBERTa")

# Use the first GPU when one is available; otherwise run on CPU (-1) so the
# cell also works on a runtime without a GPU.
device = 0 if torch.cuda.is_available() else -1

classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, config=config, device=device, framework="pt")
results = classifier(["Such a directive would imply that any tightening should be implemented promptly if developments were perceived as pointing to rising inflation.",
                      "The International Monetary Fund projects that global economic growth in 2019 will be the slowest since the financial crisis."],
                      batch_size=4, truncation="only_first")

print(results)

Downloading (…)okenizer_config.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/891 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.999393105506897}, {'label': 'LABEL_0', 'score': 0.9979877471923828}]


# Zero-shot LLaMA-2-7B

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub identifier of the chat model. `model` holds this string (not a loaded
# model object) and is passed to `pipeline(...)` in a later cell.

model = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer that belongs to this model id.
tokenizer = AutoTokenizer.from_pretrained(model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### Set pipeline for text generation

In [None]:
import torch

# Build a text-generation pipeline from the model id and tokenizer defined in
# the previous cell. Weights are loaded in bfloat16 and placed automatically
# across the available devices.
# `trust_remote_code=True` was dropped: it permits executing code shipped in
# the model repository and is not needed for a natively supported Llama model.
pipeline_obj = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### Create prompt

In [None]:
# Zero-shot classification prompt: instruction text followed by the sentence
# to classify.
classification_instruction = "Behave like you are an expert sentence classifier. Classify the following sentence from FOMC into 'HAWKISH', 'DOVISH', or 'NEUTRAL' class. Label 'HAWKISH' if it is corresponding to tightening of the monetary policy, 'DOVISH' if it is corresponding to easing of the monetary policy, or 'NEUTRAL' if the stance is neutral. Provide the label in the first line and provide a short explanation in the second line. The sentence: "
example_sentence = "Such a directive would imply that any tightening should be implemented promptly if developments were perceived as pointing to rising inflation."
classification_prompt = classification_instruction + example_sentence

# Free-form prompt unrelated to the classification task.
demo_prompt = "Tell me something interesting about Georgia Institute of Technology."

# The free-form prompt is the one actually sent to the model; assign
# `classification_prompt` instead to run the zero-shot classification.
prompt = demo_prompt

prompts_list = [prompt]

### Chat with model with prompt

In [None]:
# Generation settings: sample up to 512 new tokens, return one sequence per
# prompt, and stop at the tokenizer's end-of-sequence token.
generation_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

res = pipeline_obj(prompts_list, **generation_kwargs)

### Print output

In [None]:
# Show the raw pipeline output for the prompts in `prompts_list`.
print(res)

[[{'generated_text': "Tell me something interesting about Georgia Institute of Technology.\nGeorgia Institute of Technology, also known as Georgia Tech, is a public research university located in Atlanta, Georgia, United States. Here are a few interesting facts about Georgia Tech:\n1. Georgia Tech is one of the top-ranked public universities in the United States, according to U.S. News & World Report. It is consistently ranked among the top 10 public universities in the country.\n2. Georgia Tech was founded in 1885 as the Georgia School of Technology, with the mission of providing technical education to the people of Georgia. Today, it is a comprehensive research university with over 30,000 students from all 50 states and more than 100 countries.\n3. Georgia Tech is known for its strong programs in engineering, computer science, and business, as well as its interdisciplinary research in fields such as biotechnology, nanotechnology, and renewable energy.\n4. The university has a strong 

### Logout from HuggingFace

In [None]:
# End the Hugging Face session started by `login()` earlier in the notebook.
logout() # logout completely

Successfully logged out.
