In [1]:
import gc
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
# Summarise every article body with the pretrained Pegasus checkpoint named in
# `model_name` and store the generated text in a new `summary` column of `df`.
df = pd.read_json('foxnews_news_cleaned.json')
model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Generate in small batches else running out of memory on GTX 1080Ti :(
# Batches are cut by explicit slicing so that no batch ever holds more than
# `chunkSize` texts and inputs with fewer than `chunkSize` rows (or none) work.
chunkSize = 5
texts = df['body'].tolist()
summary = []
with torch.no_grad():  # inference only: no autograd state is recorded
    for start in range(0, len(texts), chunkSize):
        textChunk = texts[start:start + chunkSize]
        batch = tokenizer([f'\"{t}\"' for t in textChunk], truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        summary.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
        # Drop the references to this batch's tensors before asking the
        # allocator to release cached memory for the next iteration.
        del batch, translated
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
df['summary'] = summary


In [17]:
# Save the summarised frame as an indented JSON array with one object per row.
df.to_json(
    path_or_buf='foxnews_news_cleaned_summary.json',
    indent=4,
    orient='records',
)

In [22]:
# Reload the saved summaries from disk; the bare name on the last line makes
# the frame this cell's displayed output.
df = pd.read_json(path_or_buf='foxnews_news_cleaned_summary.json')
df

Unnamed: 0,url,body,sentiment,authorUrl,authorName,summary
0,https://www.foxnews.com/auto/teslas-bizarre-sw...,tesla bizarre swipe-to-drive gear selector rev...,neutral,/person/g/gary-gastelu,Gary Gastelu,Tesla's new on-screen gear selector system app...
1,https://www.foxnews.com/auto/tesla-cancels-ful...,tesla cancels full self-driving expansion ahea...,negative,/person/g/gary-gastelu,Gary Gastelu,Tesla has cancelled a planned expansion of its...
2,https://www.foxnews.com/auto/tesla-autopilot-m...,tesla set on autopilot slams into michigan sta...,negative,/person/g/gary-gastelu,Gary Gastelu,A tesla driver with a suspended license using ...
3,https://www.foxnews.com/auto/detroit-police-te...,detroit police don think tesla was on autopilo...,neutral,/person/g/gary-gastelu,Gary Gastelu,A tesla model y getting lodged under a semi-tr...
4,https://www.foxnews.com/auto/nhtsa-investigati...,nhtsa investigating violent tesla crash into s...,negative,http://www.ap.org/,Associated Press,The National Highway Traffic Safety Administra...
...,...,...,...,...,...,...
407,https://www.foxnews.com/auto/federal-governmen...,federal government decides not to investigate ...,neutral,Associated Press,http://www.ap.org/,The federal government says it will not invest...
408,https://www.foxnews.com/tech/mit-wins-design-c...,mit wins design competition for elon musk hype...,neutral,Associated Press,http://www.ap.org/,The winning design for elon musk's hyperloop h...
409,https://www.foxnews.com/auto/5-cars-you-might-...,5 vehicles you might not want to buy new . gmc...,positive,Gary Gastelu,/person/g/gary-gastelu,The best and worst models to buy in several ci...
410,https://www.foxnews.com/auto/tesla-testing-ram...,tesla spotted testing ram 1500 trx high perfor...,positive,Gary Gastelu,/person/g/gary-gastelu,tesla spotted testing ram 1500 trx high perfor...


In [25]:
# Build the sentiment-classification dataset: the generated summary is the
# input text and the sentiment string is encoded as an integer label.
df = pd.read_json('foxnews_news_cleaned_summary.json')
label = {'neutral':0,'positive':1,'negative':2}

# Keep only rows whose sentiment has an entry in `label`. This still excludes
# the 'NA' rows, and additionally excludes null/unexpected values that would
# otherwise map to NaN labels. `.copy()` gives an independent frame so the
# column assignment below does not write into a slice of the loaded data.
df = df.loc[df['sentiment'].isin(list(label)), ['summary', 'sentiment']].copy()
df['labels'] = df['sentiment'].map(label).astype(int)

# Split the train/test set (fixed random_state keeps the split reproducible)
train, test = train_test_split(df[['summary', 'labels']], test_size=0.2, random_state=12, shuffle=True)
train = train.rename(columns={'summary': 'text'})
test = test.rename(columns={'summary': 'text'})

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('data', exist_ok=True)
train.to_csv('data/train.csv', sep=',', index=False)
test.to_csv('data/test.csv', sep=',', index=False)