In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GPT2Tokenizer, GPT2Model

from PEFT_For_Summarization import loaded_peft_model

In [32]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

In [33]:
# encode context the generation is condition on
model_inputs = tokenizer('I enjoy walking with my cute dog', return_tensors='pt')

# generate 40 new tokens
greedy_output = model.generate(**model_inputs, max_new_tokens=40)

print('Output: \n' + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output: 
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure


In [34]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output)

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.1629, -0.2166, -0.1410,  ..., -0.2619, -0.0819,  0.0092],
         [ 0.4628,  0.0248, -0.0785,  ..., -0.0859,  0.5122, -0.3939],
         [-0.0644,  0.1551, -0.6306,  ...,  0.2488,  0.3691,  0.0833],
         ...,
         [-0.5591, -0.4490, -1.4540,  ...,  0.1650, -0.1302, -0.3740],
         [ 0.1400, -0.3875, -0.7916,  ..., -0.1780,  0.1824,  0.2185],
         [ 0.1721, -0.2420, -0.1124,  ..., -0.1068,  0.1205, -0.3213]]],
       grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.0719,  2.4170,  0.9660,  ..., -0.4787, -0.3316,  1.7925],
          [-2.2897,  2.5424,  0.8317,  ..., -0.5299, -2.4828,  1.3537],
          [-2.2856,  2.7125,  2.4725,  ..., -1.4911, -1.8427,  1.6493],
          ...,
          [-3.3203,  2.3325,  2.7061,  ..., -1.1569, -1.5586,  2.4076],
          [-2.9917,  2.2701,  2.1742,  ..., -0.8670, -1.6410,  1.9237],
          [-2.5066,  2.6139,  2.1347,  ..., -0.0627, -2.0542,  1.6568]],

In [35]:
pipe = pipeline('text-generation', model='gpt2')

pipe('I enjoy walking with my cute dog', max_length=30, num_return_sequences=5)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I enjoy walking with my cute dog. I love helping my daughter and she loves me sometimes.\n\nMy dog loves me sometimes when I play with'},
 {'generated_text': 'I enjoy walking with my cute dog. I can tell that she is a little confused, but she can tell from the way I walk her that it'},
 {'generated_text': "I enjoy walking with my cute dog. I don't need to explain why, because I love to be left alone with my puppies, so I thought"},
 {'generated_text': "I enjoy walking with my cute dog.\n\nIt's a simple and safe walk in the city. I'm currently running around looking for a spot"},
 {'generated_text': 'I enjoy walking with my cute dog and he is a very special, and we get along really well," he told the local media.'}]

## Tokenizers

In [36]:
tokenizer = AutoTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')

In [37]:
raw_inputs = [
    'I love deep learning',
    'I hate this so much!'
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[   0,  100,  657, 1844, 2239,    2,    1,    1],
        [   0,  100, 4157,   42,   98,  203,  328,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [38]:
print('Tokenizer output for "I love deep learning!"')
print(f'Input ids: {inputs["input_ids"][0]}')
print(f'Attention Mask: {inputs["attention_mask"][0]}')
print('-' * 100)
print('Tokenizer output for "I hate this so much!"')
print(f'Input ids: {inputs['input_ids'][1]}')
print(f'Attention Mask: {inputs["attention_mask"][1]}')

Tokenizer output for "I love deep learning!"
Input ids: tensor([   0,  100,  657, 1844, 2239,    2,    1,    1])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 0, 0])
----------------------------------------------------------------------------------------------------
Tokenizer output for "I hate this so much!"
Input ids: tensor([   0,  100, 4157,   42,   98,  203,  328,    2])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1])


In [39]:
tokens = tokenizer.tokenize('I love deep learning!')
tokens

['I', 'Ġlove', 'Ġdeep', 'Ġlearning', '!']

In [40]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[100, 657, 1844, 2239, 328]

In [41]:
decoded_tokens = tokenizer.decode(token_ids)
decoded_tokens

'I love deep learning!'

In [42]:
model_prepped_ids = tokenizer.prepare_for_model(token_ids)
model_prepped_ids

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [0, 100, 657, 1844, 2239, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

## Models

In [43]:
classifier = pipeline('sentiment-analysis')
classifier(
    [
        'I love deep learning!',
        'I hate this so much!'
    ]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9998645782470703},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [44]:
text_generator = pipeline('text-generation')
text_generator(
    [
        'I went to the store to buy',
        'When two objects in space get close to each other'
    ]
)

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[[{'generated_text': 'I went to the store to buy a bike which cost me only $25. My boyfriend sent me a card on my card. He said he was going to purchase 2 packs (2 of this bike and 1 from JVC for $8 each)'}],
 [{'generated_text': 'When two objects in space get close to each other, they merge into one, resulting in the collision occurring. When two objects in proximity are merged together, their collision is performed from the beginning. [This description is part of the Unicode-'}]]

In [45]:
summarizer = pipeline('summarization')
summarizer(
    [
        """A Fibonacci heap is a collection of trees satisfying the min-heap property. It allows faster amortized time for many operations than binary or binomial heaps.
        Trees in a Fibonacci heap can have any shape, which facilitates efficient operations. Lazy strategies are employed: node removals and consolidations are delayed until
        absolutely necessary (like during an extract-min operation). The main advantage lies in decreasing a key and merging two heaps, which are constant and amortized
        constant time, respectively. Nodes have a "mark" indicating if they've lost a child since the last time they were made a child of another node, assisting in
        restructuring during operations."""
    ]
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'summary_text': ' A Fibonacci heap is a collection of trees satisfying the min-heap property . It allows faster amortized time for many operations than binary or binomial heaps . Nodes have a "mark" indicating if they\'ve lost a child since the last time they were made a child of another node .'}]

## Accessing Pretrained Models

In [46]:
# Load model directly
from transformers import AutoModelForSequenceClassification

In [47]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

In [48]:
inputs = tokenizer('I love deep learning', return_tensors='pt')
inputs

{'input_ids': tensor([[ 101, 1045, 2293, 2784, 4083,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [49]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-4.1975,  4.4937]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## Model Embeddings

In [50]:
from transformers import AutoModel

In [51]:
model = AutoModel.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

In [52]:
inputs = tokenizer('I love deep learning!', padding=True, truncation=True, return_tensors='pt')
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([1, 7, 768])


In [53]:
# to get the full context vector for the sequence
context_vectors = outputs.last_hidden_state.mean(dim=1)
context_vectors.shape

torch.Size([1, 768])

## Accessing Model Config and Creating Custom Models

In [54]:
from transformers import GPT2Config, GPT2Model

In [55]:
# building config
config = GPT2Config()

In [56]:
print(config)

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 50257
}



In [57]:
# Building the model from the config
gpt_model = GPT2Model(config)

# Saving Models

In [58]:
gpt_model.save_pretrained('models/gpt2_model')

## Loading Datasets From HuggingFace

In [59]:
from datasets import load_dataset

dataset = load_dataset("fka/awesome-chatgpt-prompts")
dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 203
    })
})

In [60]:
# print an example
dataset['train'][0]

{'act': 'An Ethereum Developer',
 'prompt': 'Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.'}

In [61]:
# shuffle & sample
dataset = dataset['train'].shuffle(seed=37).select(range(100))
print(dataset)

Dataset({
    features: ['act', 'prompt'],
    num_rows: 100
})


In [62]:
# Create test Dataset
dataset = dataset.train_test_split(train_size=0.8, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 80
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 20
    })
})


## Creating Your Own Dataset

In [63]:
import os
import tarfile
import wget

# Check if 'data' folder exists, if not, create it
if not os.path.exists('data'):
    os.makedirs('data')

# Check if the file exists in the 'data' folder, if not, download it
file_path = os.path.join('data', 'reuters21578.tar.gz')
if not os.path.exists(file_path):
    wget.download('https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz', file_path)

# Extract the tar.gz file into the 'data' folder
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path='data')


  tar.extractall(path='data')


In [64]:
# the sgm files are what contains the articles
from bs4 import BeautifulSoup

# Open the file and parse its content with BeautifulSoup
reuters_articles = []
for i in range(22):
  if i < 10:
    i = f"0{i}"

  # load file data
  with open(f"data/reut2-0{i}.sgm", 'r', encoding='latin-1') as file:
      soup = BeautifulSoup(file, "html.parser")

  # Extract articles' titles and bodies
  articles = []
  for reuters in soup.find_all('reuters'):
      title = reuters.title.string if reuters.title else ""
      body = reuters.body.string if reuters.body else ""
      articles.append({
            'title': title,
            'body': body
        })

  reuters_articles.extend(articles)

In [65]:
# Print out the first few articles for inspection
for i, article in enumerate(reuters_articles[:5]):
  print(article)
  print("-"*100)

{'title': 'BAHIA COCOA REVIEW', 'body': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are do

In [66]:
import json

TRAIN_PCT, VALID_PCT = 0.8, 0.1

# Split the data
train_articles = reuters_articles[:int(len(reuters_articles)*TRAIN_PCT)]
valid_articles = reuters_articles[int(len(reuters_articles)*TRAIN_PCT): int(len(reuters_articles)*(TRAIN_PCT + VALID_PCT))]
test_articles = reuters_articles[int(len(reuters_articles)*(TRAIN_PCT + VALID_PCT)):]

# Function to save articles as JSON
def save_as_json(data, filename):
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)

# Save them into JSON files
save_as_json(train_articles, "data/train.json")
save_as_json(valid_articles, "data/valid.json")
save_as_json(test_articles, "data/test.json")

In [67]:
data_files = {"train": "data/train.json", "validation": "data/valid.json", "test": "data/test.json"}
dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [68]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [69]:
dataset['train'][0]

{'title': 'BAHIA COCOA REVIEW',
 'body': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are d

In [70]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [71]:
dataset.push_to_hub('reuters_articles')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/3NTRPY-13/reuters_articles/commit/f9c78be8633b287a021417ce156fd18e95dad973', commit_message='Upload dataset', commit_description='', oid='f9c78be8633b287a021417ce156fd18e95dad973', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/3NTRPY-13/reuters_articles', endpoint='https://huggingface.co', repo_type='dataset', repo_id='3NTRPY-13/reuters_articles'), pr_revision=None, pr_num=None)

## Creating A Tokenizer

In [72]:
dataset = load_dataset('3NTRPY-13/reuters_articles')

In [73]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [74]:
def create_full_article_col(example):
    return {'full_article': f'TITLE:{example["title"]}\n\nBODY:{example["body"]}'}

dataset = dataset.map(create_full_article_col)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
})

In [75]:
dataset['train'][0]['full_article']

'TITLE:BAHIA COCOA REVIEW\n\nBODY:Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as

## Training Our Own Tokenizer

In [76]:
# Create a batched dataset for training, creates an interator object for later usage when training tokenizer

training_corpus = (
    dataset['train'][i:i+1000]['full_article'] for i in range(0, len(dataset['train']), 1000)
)

In [77]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [78]:
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=52000)






In [79]:
example = dataset['test'][2]['full_article']
example

"TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"

In [80]:
old_tokenizer.tokenize(example)

['TIT',
 'LE',
 ':',
 'CH',
 'E',
 'FS',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLE',
 'T',
 'ES',
 'ĠPR',
 'IV',
 'ATE',
 'ĠS',
 'ALE',
 'Ċ',
 'Ċ',
 'B',
 'ODY',
 ':',
 'Che',
 'fs',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġm',
 'ln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'sec',
 'urities',
 'Ġfor',
 'Ġ20',
 'Ġc',
 'ts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dl',
 'rs',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChef',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexercise',
 'able',
 'Ġto',
 'Ċ',
 'p',
 'urchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChef',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBren

In [81]:
tokenizer.tokenize(example)

['TITLE',
 ':',
 'CH',
 'EF',
 'S',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLETES',
 'ĠPRIVATE',
 'ĠSALE',
 'Ċ',
 'Ċ',
 'BODY',
 ':',
 'Che',
 'f',
 's',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġmln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'securities',
 'Ġfor',
 'Ġ20',
 'Ġcts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dlrs',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexerciseable',
 'Ġto',
 'Ċ',
 'purchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBrennan',
 'Ġpurchased',
 'Ġ8',
 ',',
 '250',
 ',',
 '000',
 'Ċ',
 'of',
 'Ġthe',
 '

In [82]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [83]:
tokenizer.push_to_hub('gpt2-reuters_tokenizer')

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/3NTRPY-13/gpt2-reuters_tokenizer/commit/2e4946bbdf5792f8c9060ee767abea10070f46c6', commit_message='Upload tokenizer', commit_description='', oid='2e4946bbdf5792f8c9060ee767abea10070f46c6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/3NTRPY-13/gpt2-reuters_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='3NTRPY-13/gpt2-reuters_tokenizer'), pr_revision=None, pr_num=None)

In [84]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('3NTRPY-13/gpt2-reuters_tokenizer')

In [85]:
example = dataset['test'][2]
example

{'title': 'CHEFS <CHEF.O> COMPLETES PRIVATE SALE',
 'body': "Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03",
 'full_article': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"}

In [86]:
tokenizer.tokenize(example['full_article'])

['TITLE',
 ':',
 'CH',
 'EF',
 'S',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLETES',
 'ĠPRIVATE',
 'ĠSALE',
 'Ċ',
 'Ċ',
 'BODY',
 ':',
 'Che',
 'f',
 's',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġmln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'securities',
 'Ġfor',
 'Ġ20',
 'Ġcts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dlrs',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexerciseable',
 'Ġto',
 'Ċ',
 'purchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBrennan',
 'Ġpurchased',
 'Ġ8',
 ',',
 '250',
 ',',
 '000',
 'Ċ',
 'of',
 'Ġthe',
 '

# Fine-tuning & PEFT

## Full Fine-tuning BART For Summarization

In [87]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch


"""
BART HAS 400M PARAMS: https://github.com/facebookresearch/fairseq/tree/main/examples/bart
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn').to(device)

In [88]:
from datasets import load_dataset

dataset = load_dataset("knkarthick/samsum")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

In [119]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

def generate_summary(input, llm, device):
    input_prompt = f"""
                    Summarize the following conversation:

                    {input}

                    Summary:
                    """
    input_ids = tokenizer(sample, return_tensors='pt').to(device)
    tokenized_output = llm.generate(**input_ids, min_length=30, max_length=200)
    output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)
    return output

output = generate_summary(sample, model, device)
print('Sample')
print(sample)
print("-" * 100)
print("Model Generated Summary:")
print(output)
print('Correct Summary:')
print(label)




Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
----------------------------------------------------------------------------------------------------
Model Generated Summary:
. Hannah and Amanda are at the park together. Hannah doesn't have Betty's number. Amanda asks Hannah to text Larry.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [90]:
def tokenize_inputs(example):
    start_prompt = "Summarize the following conversation. \n\n"
    end_prompt = "\n\nSummary: "
    prompt = [
        start_prompt + (dialogue if dialogue is not None else "") + end_prompt
        for dialogue in example['dialogue']
    ]
    tokenized_prompt = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    tokenized_summary = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt', max_length=512)

    example['input_ids'] = tokenized_prompt['input_ids']
    example['attention_mask'] = tokenized_prompt['attention_mask']
    example['labels'] = tokenized_summary['input_ids']

    return example

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

In [91]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 3)
(9, 3)
(9, 3)


In [92]:
tokenized_datasets['train'][0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [93]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [94]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='models/bart-cnn-samsum-finetuned',
    hub_model_id='3NTRPY-13/bart-cnn-samsum-finetuned',
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    auto_find_batch_size=True,
    eval_strategy='epoch',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [95]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.088,0.131138




TrainOutput(global_step=37, training_loss=0.1297052502632141, metrics={'train_runtime': 16.3692, 'train_samples_per_second': 9.041, 'train_steps_per_second': 2.26, 'total_flos': 169034158964736.0, 'train_loss': 0.1297052502632141, 'epoch': 1.0})

In [96]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/3NTRPY-13/bart-cnn-samsum-finetuned/commit/5989c2a0c32800d9f9d75e161c6ec579750b0f3f', commit_message='End of training', commit_description='', oid='5989c2a0c32800d9f9d75e161c6ec579750b0f3f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/3NTRPY-13/bart-cnn-samsum-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='3NTRPY-13/bart-cnn-samsum-finetuned'), pr_revision=None, pr_num=None)

In [97]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained('3NTRPY-13/bart-cnn-samsum-finetuned').to(device)
output = generate_summary(sample, model, device)

print('Sample')
print(sample)
print("-" * 100)
print("Model Generated Summary:")
print(output)
print('Correct Summary:')
print(label)




model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
----------------------------------------------------------------------------------------------------
Model Generated Summary:
Hannah doesn't have Betty's number. Amanda asks Larry to text her. Hannah doesn't know Larry well. Amanda will text him.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


## PEFT

In [142]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained('3NTRPY-13/bart-cnn-samsum-finetuned')
model = AutoModelForSeq2SeqLM.from_pretrained('3NTRPY-13/bart-cnn-samsum-finetuned').to(device)

In [143]:
from datasets import load_dataset

dataset = load_dataset('knkarthick/samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

In [152]:
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [145]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 3)
(9, 3)
(9, 3)


## Create PEFT Model using LoRA

In [146]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [147]:
peft_model = get_peft_model(model, lora_config)

In [148]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [149]:
from transformers import TrainingArguments, Trainer

peft_training_args = TrainingArguments(
    output_dir='models/bart-cnn-samsum-finetuned-lora',
    hub_model_id='3NTRPY-13/bart-cnn-samsum-finetuned-lora',
    learning_rate=1e-5,
    num_train_epochs=15,
    weight_decay=0.01,
    auto_find_batch_size=True,
    eval_strategy='epoch',
    logging_steps=10
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [150]:
peft_model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 411,009,024 || trainable%: 1.1481


In [153]:
peft_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1181,0.131306
2,0.0938,0.131025
3,0.0863,0.130831
4,0.0969,0.1305
5,0.0995,0.130323
6,0.0874,0.130187
7,0.0869,0.130065
8,0.0817,0.129762
9,0.092,0.12915
10,0.0785,0.129326




TrainOutput(global_step=1110, training_loss=0.08322439714595005, metrics={'train_runtime': 144.8242, 'train_samples_per_second': 15.329, 'train_steps_per_second': 7.664, 'total_flos': 2446450533335040.0, 'train_loss': 0.08322439714595005, 'epoch': 15.0})

In [154]:
peft_trainer.push_to_hub()



training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/3NTRPY-13/bart-cnn-samsum-finetuned-lora/commit/c7eb5a54dbc5ebd2173503111b34350fff05dd17', commit_message='End of training', commit_description='', oid='c7eb5a54dbc5ebd2173503111b34350fff05dd17', pr_url=None, repo_url=RepoUrl('https://huggingface.co/3NTRPY-13/bart-cnn-samsum-finetuned-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='3NTRPY-13/bart-cnn-samsum-finetuned-lora'), pr_revision=None, pr_num=None)

In [155]:
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained('3NTRPY-13/bart-cnn-samsum-finetuned')
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained('3NTRPY-13/bart-cnn-samsum-finetuned').to(device)

loaded_peft_model = PeftModel.from_pretrained(
    peft_model_base,
    '3NTRPY-13/bart-cnn-samsum-finetuned-lora',
    is_trainable=False
)

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

In [139]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

output = generate_summary(sample, loaded_peft_model, device)

print("Sample")
print(sample)
print("-------------------")
print("Summary:")
print(output)
print("Ground Truth Summary:")
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Summary:

Ground Truth Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
