In [None]:
# Install the notebook's dependencies into the kernel's own environment
# (%pip targets the running kernel, unlike a bare `!pip`).
# Versions are pinned to the ones this notebook was last run with so that a
# fresh "Restart & Run All" reproduces the same setup.
# adapter-transformers ships the `transformers` package together with the
# `transformers.adapters` module (LoRAConfig, AdapterTrainer) imported below.
%pip install -q adapter-transformers==3.2.1 sentencepiece==0.1.99
%pip install -q datasets
%pip install -q evaluate==0.4.0
%pip install -q rouge_score

Collecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from adapter-transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from adapter-transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.9 MB/s[0m eta [

In [None]:
from transformers import AutoTokenizer

def model_and_tokenizer(model_name):
    """Resolve a short model alias to its checkpoint id and load its tokenizer.

    Args:
        model_name: Short alias of the model. Only ``"flan-t5"`` is supported.

    Returns:
        A ``(base_model, tokenizer)`` tuple: the checkpoint id string and the
        tokenizer returned by ``AutoTokenizer.from_pretrained`` for it.

    Raises:
        ValueError: If ``model_name`` is not a supported alias. Without this
            the function would fall through and return ``None``, which only
            shows up later as an unpacking ``TypeError`` at the call site.
    """
    if model_name == "flan-t5":
        base_model = "google/flan-t5-base"
        tokenizer = AutoTokenizer.from_pretrained(base_model)
        return base_model, tokenizer

    raise ValueError(
        f"Unsupported model_name {model_name!r}; supported values: 'flan-t5'"
    )


# tokenize the dataset
def encode_batch(examples, text_column = 'document', summary_column ='summary', padding = "max_length",
                 max_source_length = 512, max_target_length = 512, ignore_pad_token_for_loss = False):
    """Tokenize a batch of (document, summary) pairs for seq2seq training.

    Relies on two module-level names that must be defined before this is
    called: ``tokenizer`` (the tokenizer to apply) and ``prefix`` (the task
    prefix prepended to every input text).

    Args:
        examples: Batch mapping column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.
        text_column: Name of the column holding the source documents.
        summary_column: Name of the column holding the target summaries.
        padding: Padding strategy forwarded to the tokenizer.
        max_source_length: Truncation length for the inputs.
        max_target_length: Truncation length for the targets.
        ignore_pad_token_for_loss: If True, padding ids in the labels are
            replaced by -100 (the index the loss conventionally ignores) so
            padded positions do not contribute to the loss. Off by default
            because labels containing -100 cannot be passed straight to
            ``tokenizer.decode`` -- callers that decode the labels must map
            -100 back to the pad id first.

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry
        holding the tokenized targets. Rows whose document or summary is
        empty/None are dropped, so the output can have fewer rows than the
        input batch.
    """
    # Keep only rows where both the document and the summary are non-empty.
    pairs = [
        (document, summary)
        for document, summary in zip(examples[text_column], examples[summary_column])
        if document and summary
    ]

    # Prepend the task prefix to every input text.
    inputs = [prefix + document for document, _ in pairs]
    targets = [summary for _, summary in pairs]

    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    label_ids = labels["input_ids"]
    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [(token if token != pad_id else -100) for token in sequence]
            for sequence in label_ids
        ]

    # The trainer expects the target ids under the "labels" key.
    model_inputs["labels"] = label_ids

    return model_inputs

def load_split(dataset, split_name, max_items):
    """Select, tokenize and torch-format the first rows of one dataset split.

    Args:
        dataset: Mapping of split name -> dataset (e.g. the result of
            ``load_dataset``).
        split_name: Key of the split to use, e.g. ``"train"``.
        max_items: Maximum number of leading rows to keep. ``None`` keeps the
            whole split.

    Returns:
        The tokenized split, formatted as torch tensors with the columns
        ``input_ids``, ``attention_mask`` and ``labels``. It can hold fewer
        than ``max_items`` rows when the split is shorter or when
        ``encode_batch`` drops rows.
    """
    split = dataset[split_name]

    # Take the leading rows directly instead of filtering every row by index.
    if max_items is not None:
        split = split.select(range(min(max_items, len(split))))

    tokenized = split.map(
        encode_batch,
        batched=True,
        remove_columns=split.column_names,
        desc="Running tokenizer on " + split_name + " dataset",
    )

    # attention_mask must be exposed as well: encode_batch pads by default, and
    # without the mask the model would attend to the padding tokens.
    # (assumes the tokenizer used by encode_batch emits an attention_mask)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return tokenized

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers.adapters import LoRAConfig
import numpy as np
from datasets import load_dataset

# Experiment configuration: model alias and the task prefix that
# encode_batch prepends to every input document.
model_name = 'flan-t5'
prefix = 'summarize: '

# Train and test splits are read from local CSV files.
dataset = load_dataset(
    'csv',
    data_files={'train': 'df_train.csv', 'test': 'df_test.csv'},
)

# Resolve the alias to a checkpoint id and load the matching tokenizer.
base_model, tokenizer = model_and_tokenizer(model_name)

# Pretrained seq2seq base model that the adapter is attached to.
model = AutoModelForSeq2SeqLM.from_pretrained(base_model)

# LoRA hyper-parameters; the two *_lora flags extend LoRA to the
# intermediate and output layers as well.
config = LoRAConfig(r=8, alpha=16, intermediate_lora=True, output_lora=True)

# Register a fresh adapter for the summarization task, ...
model.add_adapter("summarization", config=config)
# ... enable it for training, ...
model.train_adapter("summarization")
# ... and make it the active adapter.
model.set_active_adapters(["summarization"])

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, AdapterTrainer, TrainerCallback
from datasets import load_dataset

# A batch size of 1 keeps the memory footprint small.
batch_size = 1

training_args = TrainingArguments(
    output_dir="./training_output",
    overwrite_output_dir=True,
    learning_rate=3e-4,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=200,
    remove_unused_columns=False,
)

# Tokenize a small sample of each split: 10 training and 2 test rows.
train_dataset = load_split(dataset, "train", 10)
eval_dataset = load_split(dataset, "test", 2)

# Trainer for the adapter set up on `model`.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

Running tokenizer on train dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Running tokenizer on test dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# Run the training loop configured by `training_args` on the trainer's
# train_dataset; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  Number of trainable parameters = 1966080
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=30, training_loss=45.22047526041667, metrics={'train_runtime': 11.7227, 'train_samples_per_second': 2.559, 'train_steps_per_second': 2.559, 'total_flos': 20723914506240.0, 'train_loss': 45.22047526041667, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset given to the trainer; the returned metrics
# dict (eval_loss, runtime, throughput, epoch) is displayed as the cell result.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2
  Batch size = 1


{'eval_loss': 43.63471221923828,
 'eval_runtime': 0.3206,
 'eval_samples_per_second': 6.238,
 'eval_steps_per_second': 6.238,
 'epoch': 3.0}

In [None]:
# Number of validation rows used for the qualitative check and ROUGE below.
num_validation = 10

# The validation split lives in its own CSV file.
val = load_dataset(
    'csv',
    data_files={'validation': 'df_validation.csv'},
)

# Tokenize the first `num_validation` validation rows.
validation_dataset = load_split(val, 'validation', num_validation)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

Running tokenizer on validation dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
predictions, references = [], []

# Run generation on whichever device the model currently lives on instead of
# assuming CUDA device 0, so this cell also works on CPU-only machines.
device = model.device

# encode_batch drops rows with an empty document or summary, so the tokenized
# split can hold fewer than num_validation rows; never index past its end.
for i in range(min(num_validation, len(validation_dataset))):
    # load the input and label (read the row once)
    example = validation_dataset[i]
    input_ids = example['input_ids'].unsqueeze(0).to(device)
    label_ids = example['labels'].unsqueeze(0).to(device)
    # the inputs are padded by encode_batch; mask the padding explicitly so
    # the encoder does not attend to it
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    # use the model to generate the output
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=1024)
    # convert the tokens to text
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    label_text = tokenizer.decode(label_ids[0], skip_special_tokens=True)
    predictions.append(output_text)
    references.append(label_text)

    print('Input:', input_text)
    print('Output:', output_text)
    print('Label:', label_text)
    print('---')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: The University and College Union says the 1.1% rise offered by the universities is "an insult". But the Universities and Colleges Employers Association said the walkout was "disappointing given the very good pay offer". Unions representing university support staff are balloting on the offer, with strike action possible in the autumn. UCU says its members have suffered a real-terms pay cut of 14.% since 2009 and complains the squeeze on staff salaries has come as university leaders enjoyed hefty increases. "A 1.1% pay offer is an insult to hardworking staff, especially in light of the 5% pay rise vice-chancellors have enjoyed while holding down staff pay," said general secretary Sally Hunt. "Industrial action which impacts on students is never taken lightly, but members feel that they have been left with no alternative. "If the employers wish to see a swift end to this dispute, and avoid further disruption, they need to come back to the table with a much-improved offer

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: The government has "called in" proposals to shut St Joseph's Primary in Milngavie near Glasgow. The council plans to build a new denominational primary in nearby Bearsden - replacing both St Joseph's and the school on the site at present. Parents at St Joseph's and the Catholic Church have been fighting the plan. East Dunbartonshire Council plans to merge St Joseph's Primary with St Andrew's Primary in neighbouring Bearsden. The merged school would be sited in a new building on the current St Andrew's Primary School site. When the Scottish government calls in any proposal to close a school, it examines the process followed by the council and the information used to reach the decision. But it cannot simply overturn a decision because it disagrees with it. A letter informing the council of the government's decision said ministers were concerned by allegations the council's consultation document contained inaccurate information. It also said concerns raised by Education 

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: The man known as Kazu, or 'King Kazu' by some fans, will stay with the second division side past his 49th birthday. Kazuyoshi first played for Brazilian side Santos in 1986, so his deal will see his career span over 30 years. "I'm thankful to the club staff and supporters who always offer me support," said Miura, who scored 55 goals in 89 appearances for Japan. "I'll continue to give everything I have and strive," added the former Genoa and Dinamo Zagreb striker. Perhaps unsurprisingly, Miura holds the record as the oldest scorer in Japanese football - a winner in a second division match four months after his 48th birthday. He was particularly prolific in guiding Japan to the 1998 World Cup, scoring 14 goals in qualifying, and last played for the national side in 2000. His career, which started when he moved to Brazil to play youth football aged 15, is one of the longest in football history. Last week ex-England striker Teddy Sheringham registered himself as a player 

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: The security forces are reported to have used tear gas against stone-throwing protesters. They also surrounded the hometown of Burhan Wani, 22, who was killed fighting Indian troops last year. Separately seven people are reported to have been killed in shelling across the Line of Control that divides Indian and Pakistani-administered Kashmir. Officials on the Pakistani side told Reuters that five people died in Indian shelling, while Indian officials say two people were killed by Pakistani fire. There has been an armed revolt in the Muslim-majority region against rule by India since 1989, although violence has waned in recent years. The disputed region is claimed by both India and Pakistan in its entirety. India blames Pakistan for fuelling the unrest, a claim denied by Islamabad. Burhan Wani is credited with reviving the image of militancy in Muslim-majority Indian-administered Kashmir, becoming a figurehead for young people. Saturday's violence started as people tri

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: A Leave vote was always seen as more likely to generate significant disturbances in the financial world. For that reason it is also the result which was thought more likely to pose a headache for the world's central banks. There have already been statements from some that they are prepared to act to maintain financial stability. There are two potential issues they might want to respond to in the short term: the stability of the (commercial) banks and swings in exchange rates. With the banks there is a possibility of a general rise in risk aversion, and some might have difficulty borrowing in the inter-bank market. The ultimate danger from that is that they might be unable to make debt payments as they come due. Central banks can address that by lending them extra money (liquidity). In the UK the Bank of England has already been doing this ahead of the referendum, hoping to ensure that there is sufficient liquidity already in place. The Bank has said it "will continue 

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: The 33-year-old, who is the younger brother of Harlequins number eight Nick, joined Sale in 2011 after six years at Northampton. Easter won the European Challenge Cup and reached the Champions Cup final at Saints after joining from Nottingham. "I've enjoyed my time at Sale, I didn't really want to go anywhere else," said the back-row forward. "It was the right time to leave on my own terms, which not many people do. "I've had the chance for these last few games just to enjoy and savour them and I've been really lucky." He will now take up his teaching position at Wrekin College in Shropshire after the final game of the season at Newcastle on Saturday. "The last few years, I've been tailoring my career towards teaching, so taking a few courses, getting into schools, teaching and coaching," he added. "The opportunity just came up at the right time. It felt like the right time for me and my family."
Output: Sale midfielder Nick Easter has left the club after a successful

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: Tawel Fan ward at Glan Clwyd Hospital, Denbighshire, was closed more than three years ago and a report found some patients were treated like animals. It has emerged that at least seven patients' families were told treatment may have contributed to their deaths. Betsi Cadwaladr health board said an investigation was under way. It acknowledged the quality of care provided could have been a contributory factor to the deaths of some patients. A review of mortality rates on the ward has never been published although it is understood it has been completed. Relatives of one patient told BBC Wales Today they were told medical care on the ward was inadequate. Correspondence seen by the programme included an apology from the health board to the family, who do not want to be identified. One letter said: "Experts found that there were problems in the health care which may have contributed to the death." It added that "the board is very much engaged in a thorough search for the tr

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: Declan Paul Butcher from Roe Park Court committed the'single punch' assaults on Market Street in Limavady on 2 October 2014. CCTV footage showed his first victim being knocked unconscious. A second man suffered a double jaw fracture. Butcher appeared at Londonderry Crown Court on Thursday. He was sentenced to 14 months in prison. The court heard that the defendant had 51 previous convictions including eight assaults. He had also been assessed as presenting a high likelihood of reoffending. The judge told Butcher he had inflicted "gratuitous violence" on two" completely innocent" members of the public. The second man was punched and knocked out when he came across the first victim lying unconscious on Market Street. Both men later regained consciousness in hospital.
Output: A man has been jailed for a series of assaults on two people in Londonderry.
Label: A 25-year-old man from Limavady has been jailed for knocking out two men in separate attacks on the same night.
--

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Input: summarize: "Hope made a poor decision that has resulted in a negative impact on US Soccer and her team-mates," head coach Jill Ellis said. Solo, 33, will now miss matches on 8 February against France and 13 February against England. This month, domestic violence charges against Solo were dropped. In 2012, Solo, tested positive for a banned substance a month before winning a second Olympic gold medal. She maintained she was not aware the pre-menstrual medication she had been prescribed contained a banned substance and she was cleared of any wrongdoing. The US are preparing for the Women's World Cup in Canada in June when they will aim to win their third title following victories in 1991 and 1999. The latest incident surrounding Solo concerns a training camp being held by the team in Carson, California. Solo's husband, former Seattle Seahawks NFL player Jerramy Stevens, was arrested on suspicion of driving under the influence in Los Angeles in the early hours of Monday morning. Me

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Score the generated summaries against the reference summaries with ROUGE.
metric = evaluate.load("rouge")

rouge_scores = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Report each score as a percentage with two decimals. The previous `:2f`
# spec set a minimum *width* of 2 and therefore printed six decimals.
for rouge_name in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(f"{rouge_name}: {rouge_scores[rouge_name] * 100:.2f}%")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Rogue1: 39.653424%
rouge2: 11.581853%
rougeL: 29.455628%
rougeLsum: 29.246060%


In [None]:
# Persist the model to the local "model" directory. The captured output shows
# this writes the config, generation config and the full pytorch_model.bin.
# NOTE(review): if only the trained adapter weights are needed, the adapter
# library's adapter-saving API may be the better fit -- confirm.
model.save_pretrained("model")

Configuration saved in model/config.json
Configuration saved in model/generation_config.json
Model weights saved in model/pytorch_model.bin


In [None]:
# #Training data
# import pandas as pd
# from datasets import Dataset, concatenate_datasets
# df_train = pd.read_csv('/content/df_train.csv')
# df_test = pd.read_csv('/content/df_test.csv')
# df_train = Dataset.from_pandas(df_train)
# df_test = Dataset.from_pandas(df_test)
# my_dict = {'train':  df_train,
#            'test': df_test
#            }

# # #dataset = concatenate_datasets([df_train, df_test])
# # dataset = Dataset.from_dict(my_dict)

In [None]:
# from datasets import load_dataset
# dataset = load_dataset('csv', data_files={'train': 'df_train.csv',
#                                               'test': 'df_test.csv'})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:

# # df_test = pd.DataFrame(dataset)
# # df_test = df_test.dropna()
# # print(df_test.head())

In [None]:
# from datasets import load_dataset
# import pandas as pd
# dataset = load_dataset("xsum")
# cols = ['document','summary']
# df_test = pd.DataFrame(dataset['test'])
# df_test = df_test.dropna()
# df_test = df_test.reset_index()
# df_test = df_test[0:100]
# df_test = df_test[cols]
# df_train = pd.DataFrame(dataset['train'])
# df_train = df_train.dropna()
# df_train = df_train[0:1000]
# df_train = df_train[cols]
# df_train = df_train.reset_index()

# df_train.to_csv('df_train.csv', index= False)
# df_test.to_csv('df_test.csv', index= False)


# print(df_test.head())