In [3]:
from simpletransformers.t5 import T5Model, T5Args
import json
import pandas as pd
import nltk
from rouge import Rouge 
from T5_functions import load, load_data, sort_data, set_seed

In [4]:
nltk.download("punkt")
set_seed()
data = load_data("/home/ritvik/QuestionGeneration/T5/data/all_en.json")
train, test = sort_data(data, data_fraction=0.005)

model_args = T5Args()
model_args.num_train_epochs = 1
model_args.train_batch_size = 16
model_args.overwrite_output_dir = True
model_args.max_seq_length = 1024
model_args.reprocess_input_data = True
model_args.evaluate_during_training = True
model_args.manual_seed = 42
model_args.use_multiprocessing = True
model_args.eval_batch_size = 8
model_args.wandb_project = "Simple Sweep"

[nltk_data] Downloading package punkt to /home/ritvik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
def bleu_score(labels, preds):
    sum = []
    for label, pred in zip(labels, preds):
        ref = nltk.word_tokenize(label)
        hyp = nltk.word_tokenize(pred)
        sum.append(nltk.translate.bleu_score.sentence_bleu([ref], hyp))
    return sum

def rouge_score(labels, preds):
    rouge = Rouge()
    final_score = []
    for pred,label in zip(preds,labels):
        try:
            score = rouge.get_scores(pred, label)
            final_score.append(score[0]['rouge-l']['f'])
        except ValueError:
            final_score.append(0)
    return final_score

def combined_metric(labels, preds):
    rouge_vals = rouge_score(labels, preds)
    bleu_vals = bleu_score(labels, preds)
    return [(rouge_val+bleu_val)/2 for rouge_val,bleu_val in zip(rouge_vals,bleu_vals)]
    

In [34]:
model = T5Model(model_type="t5", model_name="t5-small", args=model_args, use_cuda=True) #, args={'wandb_project': 'project-name'}
input_text = test['input_text'].values.tolist()
target_text = test['target_text'].values.tolist()
result = model.predict(['generate question: ' + it for it in input_text])

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Generating outputs: 100%|██████████| 12/12 [00:03<00:00,  3.85it/s]
Decoding outputs: 100%|██████████| 95/95 [00:00<00:00, 112.41it/s]


In [55]:
# print(result)
# print(target_text)
combined_metric(target_text, result)
# bleu_rouge_scores = combined_metric(target_text,result)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[0.04999999772500011,
 0.0,
 0.29999999752500006,
 0.09999999760000006,
 0.07692307514792904,
 0.18181817983471074,
 4.135552217993328e-236,
 0.04545454297520674,
 0.0,
 5.773235161021675e-232,
 0.1111111101234568,
 0.0,
 0.11111110861111116,
 0.0,
 0.18181817933884298,
 0.23097566248938356,
 0.0,
 0.0,
 0.08333333111111117,
 0.0,
 0.0,
 0.39583776213356364,
 0.0,
 0.0,
 0.13636363394628104,
 0.28875137376872584,
 0.38152506920126716,
 0.0,
 0.0,
 0.15384615136094676,
 0.11111110864197536,
 0.16666666423611112,
 0.0,
 0.0,
 0.0,
 0.0,
 0.041666664236111255,
 0.0,
 0.06896551481569568,
 0.0,
 0.0,
 0.0769230756213018,
 0.0,
 0.0,
 0.0,
 2.1945866176905448e-232,
 0.0,
 0.14285714163265306,
 0.49410672099782293,
 0.0,
 0.0,
 0.10526315595567869,
 0.0,
 0.13043478015122878,
 0.1363636340495868,
 0.1428571405612245,
 0.0,
 0.0,
 0.0,
 0.045454542954545596,
 0.06666666426666674,
 0.11764705633217998,
 0.0,
 0.0,
 0.0,
 0.24999999812500004,
 0.06249999812500006,
 0.3636363611570248,
 0.0,
 0.

In [13]:
# model = T5Model(model_type="t5", model_name="t5-small", args=model_args, use_cuda=True) #, args={'wandb_project': 'project-name'}

print(len())
# result = model.predict(test['input_text'])

# print(type(result), result[0])
# print(test['target_text'])


859


In [12]:
from rouge import Rouge
import nltk

def bleu_score(labels, preds):
    sum = 0
    for label, pred in zip(labels, preds):
        ref = nltk.word_tokenize(label)
        hyp = nltk.word_tokenize(pred)
        sum += nltk.translate.bleu_score.sentence_bleu([ref], hyp)
    return sum

def rouge_score(labels, preds):
    rouge = Rouge()
    scores = rouge.get_scores(preds, labels)
    f = scores[0]['rouge-l']['f']
    return (f)

def combined_metric(labels, preds):
    print(labels, preds)
    rouge_val = rouge_score(labels, preds)
    bleu_val = bleu_score(labels, preds)
    return (rouge_val+bleu_val)/2

print(combined_metric(["hi bob", "yo bob"], ["hello bob", "bumb dog"]))
# nltk.word_tokenize(["hello"])

['hi bob', 'yo bob'] ['hello bob', 'bumb dog']
0.24999999750000004


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [3]:
with open("all_en.json", "r") as f:
    data = json.load(f)

new_data = []
for i in range(len(data)):
    for j in range(len(data[i]['gpt3_queries'])):
        new_data.append({"target_text": data[i]['gpt3_queries'][j], "input_text": data[i]['claim']})
    # data[i]['gpt3_queries'] = [data[i]['gpt3_queries']]
new_data

[{'target_text': 'Which countries reduced emissions between 2016 and 2020?',
  'input_text': 'The United States was the only country to reduce emissions between 2016 and 2020.'},
 {'target_text': 'What is the data on greenhouse gas emissions from the United States between 2016 and 2020?',
  'input_text': 'The United States was the only country to reduce emissions between 2016 and 2020.'},
 {'target_text': 'Did almost every other OECD nation reduce its emissions between 2016 and 2020?',
  'input_text': 'The United States was the only country to reduce emissions between 2016 and 2020.'},
 {'target_text': 'Oslo sea level data',
  'input_text': 'The sea level in Oslo is falling.'},
 {'target_text': 'Glacial isostatic adjustment in Oslo',
  'input_text': 'The sea level in Oslo is falling.'},
 {'target_text': 'Global sea level rise',
  'input_text': 'The sea level in Oslo is falling.'},
 {'target_text': 'Protests in Spain, Israel and Portugal on February 11 and 12, 2023',
  'input_text': 'Th

In [4]:
df = pd.DataFrame(new_data)
df.head()

Unnamed: 0,target_text,input_text
0,Which countries reduced emissions between 2016...,The United States was the only country to redu...
1,What is the data on greenhouse gas emissions f...,The United States was the only country to redu...
2,Did almost every other OECD nation reduce its ...,The United States was the only country to redu...
3,Oslo sea level data,The sea level in Oslo is falling.
4,Glacial isostatic adjustment in Oslo,The sea level in Oslo is falling.


In [5]:
model_args = T5Args()
model_args.num_train_epochs = 3

model = T5Model(
    model_type="t5",
    model_name="t5-base",
    args=model_args,
    use_cuda=True,
    cuda_device=1
)


Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 190kB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:04<00:00, 202MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 24.0kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 9.11MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
df["prefix"] = "question"
# df = df.rename(columns={"claim": "input_text", "gpt3_queries": "target_text"})
# df["target_text"]


In [31]:
df

Unnamed: 0,target_text,input_text,prefix
0,Which countries reduced emissions between 2016...,The United States was the only country to redu...,question
1,What is the data on greenhouse gas emissions f...,The United States was the only country to redu...,question
2,Did almost every other OECD nation reduce its ...,The United States was the only country to redu...,question
3,Oslo sea level data,The sea level in Oslo is falling.,question
4,Glacial isostatic adjustment in Oslo,The sea level in Oslo is falling.,question
...,...,...,...
190894,What is the Biological Threat Reduction Progra...,The American TV channel confirmed the existen...,question
190895,What organizations work with Ukraine towards l...,The American TV channel confirmed the existen...,question
190896,Chornomorsk radioactive shipment fake news,Radioactive substances were brought to Chorno...,question
190897,Evidence refuting radioactive shipment in Chor...,Radioactive substances were brought to Chorno...,question


In [18]:
# #Pre-train

# to_predict = [
#     "binary classification: Luke blew up the first Death Star",
#     "generate question: In 1971, George Lucas wanted to film an adaptation of the Flash Gordon serial, but could not obtain the rights, so he began developing his own space opera."
# ]
# #Model responded: ['False', 'False']
# predictions = model.predict(to_predict)
# predictions


Generating outputs: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
Decoding outputs: 100%|██████████| 2/2 [00:03<00:00,  1.72s/it]


['False', 'False']

In [7]:
def train_test_split(data,train_num):
    shuffled = data.sample(frac=1, random_state=1).reset_index()
    train = shuffled[:train_num]
    test = shuffled[train_num:]
    return train, test

train, test = train_test_split(df, int(0.95*df.__len__()))
train = train.drop(columns=["index"])
test = test.drop(columns=["index"])



In [50]:
train

Unnamed: 0,target_text,input_text,prefix
0,Pirbright Institute patent coronavirus,The Coronavirus is patented by the US,question
1,Leni Robredo speech laughing politicians Bongb...,"Mayor Sara Duterte, former senator Bongbong Ma...",question
2,Tom Cotton social media post insulting Merrick...,Did Tom Cotton ‘Humiliate’ Merrick Garland And...,question
3,Rema Wikipedia page edit history,"No, Nigerian music star Rema not dead – rumour...",question
4,Madhya Pradesh government liquor store female ...,MP government making women sell liquor,question
...,...,...,...
181349,Pubs that recently reopened beer gardens in En...,Does This Video Show A Fight In An English Pub...,question
181350,Joe Biden Detroit Athletic Club event in March...,A photograph captures Joe Biden meeting with s...,question
181351,Opinion polls conducted on voting by mail in 2020,“80% of the people” want the option of vote by...,question
181352,What is the ranking of other states in terms o...,"""Texas has the highest rate of uninsured in th...",question


In [51]:
test

Unnamed: 0,target_text,input_text,prefix
181354,Ajit Doval fan pages on Facebook,"Ajit Kumar Doval, National Security Advisor to...",question
181355,Did the DNC collude with Ukrainian officials?,"""The allegation that the DNC colluded with Ukr...",question
181356,UNICEF statement on Coronavirus (COVID-19) siz...,Coronavirus (COVID-19) is large in size where ...,question
181357,Fintan O'Toole column Irish Times pity US lead...,The Irish Times newspaper published a column b...,question
181358,Are there at least six people in the world tha...,There are at least six people in the world tha...,question
...,...,...,...
190894,Did Coronavirus originate from bats?,People eating bat soup will contract the Coron...,question
190895,How much contraband was collected at the US bo...,"""The Coast Guard confiscated 200 tons of contr...",question
190896,Fact check on Trump claims about his Electoral...,Video: Trump’s News Conference,question
190897,Fuel price hike Bangladesh protest August 2022,Video shows protest against fuel price hike in...,question


In [8]:
model.train_model(train)

100%|██████████| 181354/181354 [00:07<00:00, 23371.80it/s]
Epochs 0/3. Running Loss:    4.6120:  13%|█▎        | 2875/22670 [09:49<1:07:35,  4.88it/s]
Epoch 1 of 3:   0%|          | 0/3 [09:49<?, ?it/s]


KeyboardInterrupt: 

: 

In [None]:
result = model.eval_model(test)
with open("model_results_t5.txt", 'w') as file:
    file.write(json.dumps(result, indent=4))

In [None]:
to_predict = [
#Add predictions here!
]

predictions = model.predict(to_predict)