In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [2]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch
from datasets import load_dataset

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_ckpt = "google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [4]:
#Get data
df = pd.read_csv('./drive/MyDrive/output.csv', sep=',')
df = df.dropna().reset_index()

#Select part of data we want to keep
df = df[['text','summary']]

#Clean text
df['text'] = df['text'].apply(lambda x: x.replace('\n',' '))
df['summary'] = df['summary'].apply(lambda x: x.replace('\n',' '))

#Select only part of it (makes testing faster)
writeups = df
writeups.head()

Unnamed: 0,text,summary
0,"When you visit the website, you get redirected...",Visit website and notice the `/?file=wc.php` r...
1,"Description: ""You can steal a car if you steal...",Examine the website source to find routes `/lo...
2,"Description: ""This is my file library. I don't...",Analyze the JavaScript code of the Express app...
3,"Description: ""People who get violent get that ...",Google the challenge description and discover ...
4,"Description: ""My nephew is a fussy eater and i...",Change the cookie value to the base64 value of...


In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Split the DataFrame into training and test sets
train_writeups, test_writeups = train_test_split(writeups, test_size=0.2, random_state=42)

# Convert the split DataFrames into datasets.Dataset objects
train_dataset = Dataset.from_pandas(train_writeups)
test_dataset = Dataset.from_pandas(test_writeups)

# Create a datasets.DatasetDict object with the train and test datasets
writeups_dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

print(writeups_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 47
    })
    test: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 12
    })
})


In [6]:
split_lengths = [len(writeups_dataset[split]) for split in writeups_dataset]

print(f"Split lengths: {split_lengths}")
print(f"Features: {writeups_dataset['train'].column_names}")
print("\nText:")

print(writeups_dataset["test"][0]["text"])

print("\nSummary:")

print(writeups_dataset["test"][0]["summary"])

Split lengths: [47, 12]
Features: ['text', 'summary', '__index_level_0__']

Text:
When you visit the website, you get redirected to `/?file=wc.php`. This might indicate that you can include files from the server, such as `/?file=/etc/passwd`. You can see in this file that there's a user called `ctf`, but that's not useful yet.  Moving on, you can find out that there's a `robots.txt` file at `?file=robots.txt`. In it there is the `/?file=checkpass.php` file disallowed.  Visiting that URL, you get redirected back to `/wc.php`. However, it maybe that there's some code in `checkpass.php` that might be important. If you request it in `python` using `requests`, you can see that the response text says: "IMPORTANT!!! The page is still under development. This has a secret, do not push this page."  We can try to view the source of this page with the help of `php://filter`. Visit the website a base64 encoded string.  When you base64 decode this, you get some PHP code: [This code shows the passwor

In [7]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the <n> token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:
pipe = pipeline('summarization', model = model_ckpt )

pipe_out = pipe(writeups_dataset['test'][0]['text'] )

print(pipe_out)

In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(writeups_dataset['test'], rouge_metric, model_pegasus, tokenizer, column_text = 'text', column_summary='summary', batch_size=8)

  rouge_metric = load_metric('rouge')
100%|██████████| 2/2 [00:21<00:00, 10.62s/it]


In [None]:
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = ['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.378362,0.19574,0.288945,0.292511


In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['text'] , max_length = 1024, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    
writeups_dataset_pt = writeups_dataset.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/47 [00:00<?, ? examples/s]



Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="pegasus-writeups", num_train_epochs=3, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

In [None]:
trainer = Trainer(model=model_pegasus, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=writeups_dataset_pt["train"], 
                  eval_dataset=writeups_dataset_pt["test"]) # Change to "validation"

In [None]:
trainer.train()

score = calculate_metric_on_test_ds(
    writeups_dataset['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


100%|██████████| 6/6 [00:31<00:00,  5.26s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.382163,0.18549,0.288396,0.289011


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

saved_dir = "fine_tuned_model"

model_pegasus.save_pretrained(saved_dir)
tokenizer.save_pretrained(saved_dir)

fine_tuned_model = PegasusForConditionalGeneration.from_pretrained(saved_dir)
fine_tuned_tokenizer = PegasusTokenizer.from_pretrained(saved_dir)

In [None]:
gen_kwargs = {"length_penalty": 1.2, "num_beams": 16, "max_length": 512}

sample_text = writeups_dataset["test"][0]["text"]
reference = writeups_dataset["test"][0]["summary"]

# Use your fine-tuned model and tokenizer in the pipeline
pipe = pipeline("summarization", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

print("Text:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])