#### Generate headlines from the article text using a T5-base model, for the train and test sets of clickbait and no-clickbait samples

In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import os

In [2]:
# Inference runs on the CPU. The tokenizer and the headline-generation
# checkpoint are both loaded by the same pretrained name; the model is moved
# to `device` as soon as it is created.
device = 'cpu'
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
model = T5ForConditionalGeneration.from_pretrained(
    "Michau/t5-base-en-generate-headline"
).to(device)

In [3]:
# Value passed to `model.generate` as `max_length` for every generated headline.
max_len = 128

def get_generated_headline(text, i):
    """Generate a headline for one article text with the loaded T5 model.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``max_len`` objects.

    Args:
        text: Article text to turn into a headline.
        i: Position of the article in the current run; it is only printed as
            a progress indicator.

    Returns:
        The best beam decoded to a string, with special tokens removed and
        surrounding whitespace stripped.
    """
    print(i)
    # NOTE(review): the model card for this checkpoint appears to prepend
    # "headline: " to the input text - confirm whether that prefix is needed.
    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=max_len,
        num_beams=3,
        early_stopping=True,
    )

    # Let the tokenizer drop the special tokens itself. Cutting the raw
    # decoded string on '>' and '</' raised IndexError whenever no '>' was
    # present and truncated any headline that contained those characters.
    return tokenizer.decode(beam_outputs[0], skip_special_tokens=True).strip()

In [4]:
# Directory that holds the input CSV files listed and read in the cells below.
path = "./TitleGenData/"

In [5]:
# Print every entry in the data directory to see which CSV files are available.
for f in os.listdir(path):
    print(f)

clickbait_test_data.csv
no_clickbait_test_data.csv
clickbait_train_data.csv
.ipynb_checkpoints
no_clickbait_train_data.csv


In [6]:
# Run the script on each of these CSVs.

# Load the no-clickbait training split. The file path is built with
# os.path.join rather than string concatenation so the read still works if
# `path` is ever written without its trailing slash.
readp = os.path.join(path, 'no_clickbait_train_data.csv')
no_clickbait_train_data = pd.read_csv(readp)
no_clickbait_train_data.head()

Unnamed: 0,postText,targetParagraphs,truthClass
0,UK’s response to modern slavery leaving victim...,Thousands of modern slavery victims have not c...,no-clickbait
1,"The ""forgotten"" Trump roast: Relive his brutal...",When the White House correspondents’ dinner is...,no-clickbait
2,Tokyo's subway is shut down amid fears over an...,One of Tokyo's major subways systems says it s...,no-clickbait
3,Ban lifted on Madrid doping laboratory,Share this with Madrid's Anti-Doping Laborator...,no-clickbait
4,"Despite the ‘Yuck Factor,’ Leeches Are Big in ...",MOSCOW — They are small as physician assistant...,no-clickbait


In [7]:
# Load the clickbait training split. The file path is built with os.path.join
# rather than string concatenation so the read still works if `path` is ever
# written without its trailing slash.
readp = os.path.join(path, 'clickbait_train_data.csv')
clickbait_train_data = pd.read_csv(readp)
clickbait_train_data.head()

Unnamed: 0,postText,targetParagraphs,truthClass
0,this is good,President Donald Trump has appointed the pro-l...,clickbait
1,Meet the happiest #dog in the world!,Adorable is probably an understatement. This a...,clickbait
2,The top 10 best selling comic books of all time,More By Brian Prowse-Gany In celebration of Na...,clickbait
3,"Players to know, QBs to watch and more 👀nnGuid...",The first round of the 2017 NFL draft was nota...,clickbait
4,White House reporter @alivitali on her first 1...,The most consistent thing about my life for th...,clickbait


In [8]:
# Number of leading characters of each `targetParagraphs` value that is kept
# as model input.
chop = 512

# Accumulators filled by the generation cell: the truncated article texts and
# the headline generated for each of them.
all_targetParagraphs = [] 
result = []

In [9]:
# Build the model inputs: the first `chop` characters of every article body.
# Both lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot duplicate entries and break the row alignment
# with `clickbait_train_data`. Missing article bodies become empty strings
# rather than raising on the slice.
all_targetParagraphs = (
    clickbait_train_data['targetParagraphs']
    .fillna('')
    .astype(str)
    .str[:chop]
    .tolist()
)
print('all_targetParagraphs:', len(all_targetParagraphs))

# To smoke-test on a subset, slice `all_targetParagraphs` here before the loop.
# An explicit append loop (not a comprehension) is kept on purpose: if the
# long run is interrupted, the headlines produced so far remain in `result`.
result = []
for i, text in enumerate(all_targetParagraphs):
    result.append(get_generated_headline(text, i))

print('done', len(result))

In [16]:
# Sanity check: how many headlines have been generated.
print(len(result))

5437


In [17]:
# NOTE(review): pandas is already imported at the top of the notebook; this
# repeated import is redundant but harmless.
import pandas as pd

# Attach the generated headlines as a new column, then write the augmented
# table to a CSV in the working directory without the DataFrame index.
clickbait_train_data['generatedPostText'] = result
clickbait_train_data.to_csv('./gen_clickbait_train_data.csv', index=False)

In [14]:
# Display the DataFrame to inspect the `generatedPostText` column.
clickbait_train_data

Unnamed: 0,postText,targetParagraphs,truthClass,generatedPostText
0,this is good,President Donald Trump has appointed the pro-l...,clickbait,Trump Appoints Pro-Life Advocate Dr. Charmaine...
1,Meet the happiest #dog in the world!,Adorable is probably an understatement. This a...,clickbait,"Maru the Husky - Part Husky, Part Panda, Part ..."
2,The top 10 best selling comic books of all time,More By Brian Prowse-Gany In celebration of Na...,clickbait,Top 10 Comic Books of All Time
3,"Players to know, QBs to watch and more 👀nnGuid...",The first round of the 2017 NFL draft was nota...,clickbait,The 2017 NFL Draft was notable for a number of...
4,White House reporter @alivitali on her first 1...,The most consistent thing about my life for th...,clickbait,The Most Consistent Thing About My Life
...,...,...,...,...
5432,"RT @capitalweather: The Derecho, from D to O: ...","This post was originally published on June 11,...",clickbait,Derecho Season in the Northwest
5433,Restaurant won't apologize for tasteless Caitl...,Whoever comes up with the menus for Nacho Mama...,clickbait,How the Kardashians and Jenners Supported Cait...
5434,"As a BASE jumper leaped, his girlfriend snappe...",Dean Potter jumped. Graham Hunt followed. Pott...,clickbait,"YOSEMITE NATIONAL PARK, Calif."
5435,"Women scientists are tweeting ""sexy” photos of...","Marine biologists, archeologists, and many oth...",clickbait,#DistractinglySexy
