In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.6 MB/s[0m eta

In [None]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

from datasets import load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the "punkt" tokenizer data; presumably this is what sent_tokenize
# (imported above) relies on for sentence splitting.
# NOTE(review): newer NLTK releases may require a different resource name
# for sent_tokenize — confirm against the installed version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the write-ups and drop every row that has a missing value
df = (
    pd.read_csv('./drive/MyDrive/output.csv', sep=',')
    .dropna()
    .reset_index()
)

# Keep only the two columns used below (this also discards the extra
# column that reset_index() inserted)
df = df[['text', 'summary']]

# Flatten each entry onto a single line by turning newlines into spaces
df = df.assign(
    text=df['text'].apply(lambda raw: raw.replace('\n', ' ')),
    summary=df['summary'].apply(lambda raw: raw.replace('\n', ' ')),
)

# No subsetting is applied here: the whole frame is used
# (slice `df` at this point to make experiments faster)
writeups = df
writeups.head()

Unnamed: 0,text,summary
0,"When you visit the website, you get redirected...",Visit website and notice the `/?file=wc.php` r...
1,"Description: ""You can steal a car if you steal...",Examine the website source to find routes `/lo...
2,"Description: ""This is my file library. I don't...",Analyze the JavaScript code of the Express app...
3,"Description: ""People who get violent get that ...",Google the challenge description and discover ...
4,"Description: ""My nephew is a fussy eater and i...",Change the cookie value to the base64 value of...


In [None]:
# Example write-up (a picoCTF SQL-injection walkthrough) used as the common
# input for the baseline and for every model in the cells below.
sample_text = """
Description: "Can you find the flag on this website."

Additional details will be available after launching your challenge instance.
 
After start of the instance picoCTF will provide you a link to running instance.

If you type `user` as the username and `user` as the password, you will get a message that reveals the query for the login request. Since we know the query, we can easily get in by entering `'or 1=1;--` in the `pass` field. This modifies the query to be something like:
[SQL query modified with 'or 1=1;--' for bypassing authentication]

We can test some queries to find out what database is used. By entering this query: `123' UNION SELECT 1, sqlite_version(), 3;--`, we now know that the site is using SQLite.

Now we can list all tables with this query: `123' UNION SELECT name, sql, null from sqlite_master;--`. We find the flag in the table named `more_table`.

To get the flag, we use this query:
[SQL query to retrieve flag from the 'more_table']
"""

In [None]:
# Maps each model's name to the summary it generated for the sample text
summaries = dict()

In [None]:
def baseline_remove_empty_lines(text):
    """Return ``text`` with every empty or whitespace-only line removed.

    The input is split on newline characters. Lines that contain at least one
    non-whitespace character are kept unchanged (including their own leading
    and trailing whitespace) and are re-joined with single newlines.
    """
    return '\n'.join(row for row in text.split('\n') if row.strip())

In [None]:
# Baseline "summary": the full sample text with only its blank lines dropped
summaries['baseline'] = baseline_remove_empty_lines(sample_text)

# Display the stored baseline
summaries['baseline']

'Description: "Can you find the flag on this website."\nAdditional details will be available after launching your challenge instance.\nAfter start of the instance picoCTF will provide you a link to running instance.\nIf you type `user` as the username and `user` as the password, you will get a message that reveals the query for the login request. Since we know the query, we can easily get in by entering `\'or 1=1;--` in the `pass` field. This modifies the query to be something like:\n[SQL query modified with \'or 1=1;--\' for bypassing authentication]\nWe can test some queries to find out what database is used. By entering this query: `123\' UNION SELECT 1, sqlite_version(), 3;--`, we now know that the site is using SQLite.\nNow we can list all tables with this query: `123\' UNION SELECT name, sql, null from sqlite_master;--`. We find the flag in the table named `more_table`.\nTo get the flag, we use this query:\n[SQL query to retrieve flag from the \'more_table\']'

# GPT-2

In [None]:
from transformers import pipeline, set_seed

# Fix the RNG seed before generating
set_seed(42)

# The prompt is the sample text followed by a "TL;DR:" marker; presumably
# this nudges the language model to continue with a summary.
pipe = pipeline(task='text-generation', model='gpt2-medium')
gpt2_query = sample_text + "\nTL;DR:\n"

# NOTE(review): per the transformers generation docs, max_length counts the
# prompt tokens as well as the newly generated ones — confirm 512 leaves
# enough room for longer inputs.
pipe_out = pipe(
    gpt2_query,
    max_length=512,
    clean_up_tokenization_spaces=True,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
pipe_out

[{'generated_text': '\nDescription: "Can you find the flag on this website."\n\nAdditional details will be available after launching your challenge instance.\n \nAfter start of the instance picoCTF will provide you a link to running instance.\n\nIf you type `user` as the username and `user` as the password, you will get a message that reveals the query for the login request. Since we know the query, we can easily get in by entering `\'or 1=1;--` in the `pass` field. This modifies the query to be something like:\n[SQL query modified with \'or 1=1;--\' for bypassing authentication]\n\nWe can test some queries to find out what database is used. By entering this query: `123\' UNION SELECT 1, sqlite_version(), 3;--`, we now know that the site is using SQLite.\n\nNow we can list all tables with this query: `123\' UNION SELECT name, sql, null from sqlite_master;--`. We find the flag in the table named `more_table`.\n\nTo get the flag, we use this query:\n[SQL query to retrieve flag from the \

In [None]:
pipe_out[0]["generated_text"][len(gpt2_query) :]

"\nIn a website and user are already using SQLite, all we have to do is type the password and we will bypass the login attempt.\n\nIf you have been studying the Python, then this method is easy and easy-to-use. The login process is a simple script to do a request, set up a database, and then set up a test if an error occurs. Our login page is easy to understand, but as we just ran, some things just don't make sense."

In [None]:
# Keep only the text generated after the prompt (the output starts with the
# prompt itself), then put each sentence on its own line
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# T5

In [None]:
# Run the sample text through the t5-small checkpoint using the
# summarization pipeline with its default generation settings.
# Note: this rebinds `pipe` and `pipe_out`, replacing the previous model's objects.
pipe = pipeline(task='summarization', model='t5-small')
pipe_out = pipe(sample_text)

In [None]:
pipe_out

[{'summary_text': "picoCTF will provide you a link to running instance . 'or 1=1;-- in the pass field ."}]

In [None]:
# Put each sentence of the T5 summary on its own line. The separator must be
# a newline ('\n'); a bare 'n' would glue the sentences together with a
# literal letter "n" between them.
summaries['t5'] = '\n'.join(sent_tokenize(pipe_out[0]['summary_text']))

# BART

In [None]:
# Run the sample text through the facebook/bart-large-cnn checkpoint using
# the summarization pipeline with its default generation settings.
# Note: this rebinds `pipe` and `pipe_out`, replacing the previous model's objects.
pipe = pipeline(task="summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)

In [None]:
pipe_out

[{'summary_text': "After start of the instance picoCTF will provide you a link to running instance. If you type `user` as the username and `user' as the password, you will get a message that reveals the query for the login request. We can easily get in by entering `'or 1=1;--` in the `pass' field."}]

In [None]:
# Store the BART summary with one sentence per line
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [None]:
summaries["bart"]

"After start of the instance picoCTF will provide you a link to running instance.\nIf you type `user` as the username and `user' as the password, you will get a message that reveals the query for the login request.\nWe can easily get in by entering `'or 1=1;--` in the `pass' field."

# PEGASUS

In [None]:
# Run the sample text through the google/pegasus-cnn_dailymail checkpoint
# using the summarization pipeline with its default generation settings.
# Note: this rebinds `pipe` and `pipe_out`, replacing the previous model's objects.
pipe = pipeline(task='summarization', model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)

In [None]:
pipe_out

[{'summary_text': "picoCTF will provide you a link to running instance .<n>To get the flag, we use this query: [ query to retrieve flag from the 'more_table']"}]

In [None]:
# The PEGASUS output separates sentences with " .<n>"; turn each of those
# into a period followed by a real newline.
# NOTE(review): a "<n>" marker not preceded by " ." would be left in the
# text as-is — confirm this is acceptable for other inputs.
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

## Comparison

In [None]:
# Print every collected summary under its upper-cased model name
for model_name in summaries:
    print(model_name.upper(), summaries[model_name], sep="\n")

BASELINE
Description: "Can you find the flag on this website."
Additional details will be available after launching your challenge instance.
After start of the instance picoCTF will provide you a link to running instance.
If you type `user` as the username and `user` as the password, you will get a message that reveals the query for the login request. Since we know the query, we can easily get in by entering `'or 1=1;--` in the `pass` field. This modifies the query to be something like:
[SQL query modified with 'or 1=1;--' for bypassing authentication]
We can test some queries to find out what database is used. By entering this query: `123' UNION SELECT 1, sqlite_version(), 3;--`, we now know that the site is using SQLite.
Now we can list all tables with this query: `123' UNION SELECT name, sql, null from sqlite_master;--`. We find the flag in the table named `more_table`.
To get the flag, we use this query:
[SQL query to retrieve flag from the 'more_table']
GPT2

In a website and user