In [2]:
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
#from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
from transformers import pipeline

In [3]:
df = pd.read_json('foxnews_news_cleaned.json')
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
#model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed")

# decoder attention type can't be changed & will be "original_full"
# you can change `attention_type` (encoder only) to full attention like this:
#model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed", attention_type="original_full")

# you can change `block_size` & `num_random_blocks` like this:
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed", block_size=16, num_random_blocks=2)

text = df.body.loc[0]
inputs = tokenizer(text, return_tensors='pt', padding=True)
prediction = model.generate(**inputs)
prediction = tokenizer.batch_decode(prediction)
print(prediction)

['<s> tesla has a new on-screen driving assistance system called " advanced driver assistance system ( ass ) " that can select forward and reverse. in the video below,<n> a tesla driver demonstrates the system.<n> the video shows the system s ability to guess which way the owner wants to go and select the direction of travel without any input from the car.</s>']


In [13]:
df = pd.read_json('foxnews_news_cleaned.json')
print(df.sentiment.value_counts(normalize=True))
summarizer = pipeline('summarization', model="google/bigbird-pegasus-large-pubmed", device=0)
abstract = summarizer(df.body.loc[0])
print(abstract)

neutral     0.572464
negative    0.236715
positive    0.185990
NA          0.004831
Name: sentiment, dtype: float64


Attention type 'block_sparse' is not possible if sequence_length: 285 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


[{'summary_text': 'the first video of tesla new on-screen gear selector system in operation has appeared on twitter . elon musk previously announced the feature , which is debuting on the updated model s and model x vehicles that were revealed in january . twitter user michael hsu posted the clip which shows a driver swiping a car icon in the top left corner of the screen to choose forward and reverse .'}]


In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer
model = AutoModelWithLMHead.from_pretrained("google/pegasus-cnn_dailymail")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
# T5 uses a max_length of 512 so we cut the article to 512 tokens.
inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

In [8]:
df = pd.read_json('foxnews_news_cleaned.json')
print(df.sentiment.value_counts(normalize=True))
summarizer = pipeline('summarization', model="google/bigbird-pegasus-large-arxiv", device=0)
abstract = summarizer(df.body.loc[0:2].tolist(), truncation='longest_first')
print(abstract)

neutral     0.572464
negative    0.236715
positive    0.185990
NA          0.004831
Name: sentiment, dtype: float64


Attention type 'block_sparse' is not possible if sequence_length: 285 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


[{'summary_text': 'the first demonstrating video of the new on-screen gear selector feature of the tesla has appeared on twitter . in the video , a driver swipes a car icon in the top left corner of the screen to choose forward and reverse , and neutral can also be selected using a knob on the steering wheel , but there apparently a third method that truly next-level . according to musk , the cars can guess which way you want to go and select the direction of travel without any input at all the model s and x are also being advertised with yoke-style steering wheel , but according to a redditor who claims to have been in the new model s , they are currently being shipped with a traditional round wheel until nhtsa approves the yoke , at which time owners will be able to get it retrofitted into their new cars .'}]


In [6]:
df.body.loc[0:2].tolist()

['tesla bizarre swipe-to-drive gear selector revealed in video. the first video of tesla new on-screen gear selector system in operation has appeared on twitter. elon musk previously announced the feature, which is debuting on the updated model s and model x vehicles that were revealed in january. twitter user michael hsu posted the clip which shows a driver swiping a car icon in the top left corner of the screen to choose forward and reverse. hsu would not confirm to fox news autos the source of the video, or whether it was a customer or tesla employee demonstrating the system. a separate image posted by @nickhoward shows that neutral is engaged by a separate icon on the screen. drive, reverse and neutral can also be selected using a knob on the steering wheel, but there apparently a third method that truly next-level. according to musk, the cars can guess which way you want to go and select the direction of travel without any input at all the model s and x are also being advertised w

In [None]:
df = pd.read_json('foxnews_news_cleaned.json')
model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Generate in small batches else running out of memory on GTX 1080Ti :(
summary = []
chunkSize = 5
with torch.no_grad():
    for x in tqdm(range(0,df.shape[0], chunkSize), ncols=100):
        textChunk = df.body.loc[x:x+chunkSize-1]
        batch = tokenizer([f'\"{t}\"' for t in textChunk], truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        summary.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
df['summary'] = summary

In [None]:
# df = pd.read_json('foxnews_news_cleaned.json')
# model_name = 'google/pegasus-xsum'
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
# batch = tokenizer([df.body.loc[20]], truncation=True, padding='longest', return_tensors="pt").to(device)
# translated = model.generate(**batch)
# tokenizer.batch_decode(translated, skip_special_tokens=True)

In [None]:
df.summary.value_counts().nlargest(2)

In [None]:
# Save the cleaned data
parsed = json.loads(df.to_json(orient='records', force_ascii=False, indent=4))
with open('foxnews_news_cleaned_summary.json', 'w', encoding='utf-8') as f:
    json.dump(parsed, f, indent=4)

In [None]:
df = pd.read_json('foxnews_news_cleaned_summary.json')
df

In [None]:
a = df[df.summary == 'All images are copyrighted.']
print(a)
a.loc[20].body

In [None]:
# Split to training/test set
df = pd.read_json('foxnews_news_cleaned_summary.json')
df = df.loc[df['sentiment'] != 'NA', ['summary', 'sentiment']]
label = {'neutral':0,'positive':1,'negative':2}
df['labels'] = df['sentiment'].map(label)

# Split the train/test set
train, test = train_test_split(df[['summary', 'labels']], test_size=0.2, random_state=12, shuffle=True)
train.columns = ['text','labels']
test.columns = ['text','labels']
train.to_csv('data/train.csv', sep=',', index=False)
test.to_csv('data/test.csv', sep=',', index=False)