In [1]:
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Run on the GPU when torch reports a usable CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# BART summarisation model and its matching tokenizer, both loaded from the
# 'facebook/bart-large-cnn' checkpoint.
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Move the weights onto the device selected in the first cell.
model = model.to(device)

In [3]:
df = pd.read_json('foxnews_news_cleaned.json')

# Summarise one article per forward pass; larger batches ran out of memory
# on a GTX 1080Ti.
summary = []
with torch.no_grad():  # inference only, no gradients needed
    # Iterate over the column values themselves so the loop does not depend on
    # the index labels being exactly 0..n-1; the results are assigned back
    # positionally below, so positional iteration keeps rows aligned.
    for body in tqdm(df['body'], ncols=100):
        # truncation=True clips articles longer than the tokenizer's maximum length.
        batch = tokenizer(body, truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        # generate() returns a batch of id sequences (one row per input) even
        # for a single article, while decode() expects one sequence of ids:
        # take row 0 instead of passing the whole 2-D tensor.
        summary.append(tokenizer.decode(translated[0], skip_special_tokens=True))
df['summary'] = summary

100%|█████████████████████████████████████████████████████████████| 414/414 [36:15<00:00,  5.25s/it]


In [20]:
# Show the two most frequent summary strings together with their counts.
df['summary'].value_counts().nlargest(2)

The tesla driver told officers he had used the autopilot. The system could not have prevented the crash because the bus swerved into the tesLA driver lane while the tedla was next to the bus, the company said. teslas updated the. autopilot software this month following a deadly crash in may.                               1
Some tesla model 3 owners and reservation holders are blowing up reddit threads accusing the automaker of pulling a bait and switch with pricey trim package. The $5,000 premium upgrade option is being delivered with a woven textile material in place of the suede-like alcantara headliner depicted in promotional images.    1
Name: summary, dtype: int64

In [21]:
# Save the articles together with their generated summaries.
# pandas serialises the frame to a JSON string of records; it is parsed back
# into Python objects so the standard json module does the actual writing.
# (No indent is requested from to_json because the string is re-parsed immediately.)
parsed = json.loads(df.to_json(orient='records', force_ascii=False))
with open('foxnews_news_cleaned_summary.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII characters as real UTF-8 text.
    # json.dump's default would \u-escape them, undoing force_ascii=False above.
    json.dump(parsed, f, indent=4, ensure_ascii=False)

In [22]:
# Read 'foxnews_news_cleaned_summary.json' back into a DataFrame and display it
# as the cell output.
df = pd.read_json('foxnews_news_cleaned_summary.json')
df

Unnamed: 0,url,body,sentiment,authorUrl,authorName,title,summary
0,https://www.foxnews.com/auto/teslas-bizarre-sw...,the first video of tesla new on-screen gear se...,neutral,/person/g/gary-gastelu,Gary Gastelu,tesla's bizarre swipe-to-drive gear selector r...,Twitter user michael hsu posted the clip which...
1,https://www.foxnews.com/auto/tesla-cancels-ful...,tesla has canceled a planned expansion of its ...,negative,/person/g/gary-gastelu,Gary Gastelu,tesla cancels full self-driving expansion ahea...,tesla has canceled a planned expansion of its ...
2,https://www.foxnews.com/auto/tesla-autopilot-m...,a tesla driver with a suspended license using ...,negative,/person/g/gary-gastelu,Gary Gastelu,tesla set on autopilot slams into michigan sta...,The 22-year-old driver was cited with violatin...
3,https://www.foxnews.com/auto/detroit-police-te...,detroit police investigating an accident invol...,neutral,/person/g/gary-gastelu,Gary Gastelu,detroit police don't think tesla was on autopi...,A tesla model y got lodged under a semi-truck ...
4,https://www.foxnews.com/auto/nhtsa-investigati...,the u.s. government highway safety agency is s...,negative,http://www.ap.org/,Associated Press,nhtsa investigating 'violent' tesla crash into...,Two people were critically injured in the cras...
...,...,...,...,...,...,...,...
409,https://www.foxnews.com/auto/5-cars-you-might-...,gmc hummer ev chief engineer al oppenheiser j...,positive,Gary Gastelu,/person/g/gary-gastelu,5 vehicles you might not want to buy new (and ...,"The bmw 5 series saw a reduction of 38.4%, or ..."
410,https://www.foxnews.com/auto/tesla-testing-ram...,the 2021 ram 1500 trx is the most powerful pi...,positive,Gary Gastelu,/person/g/gary-gastelu,tesla spotted testing ram 1500 trx high perfor...,Video of tesla fremont factory complex shot by...
411,https://www.foxnews.com/auto/consumer-reports-...,the 2021 cadillac escalade is available with ...,negative,Gary Gastelu,/person/g/gary-gastelu,consumer reports test shows how tesla's autopi...,Consumer reports senior director of auto testi...
412,https://www.foxnews.com/entertainment/elon-mus...,"reaction and analysis from trace gallagher, t...",neutral,Brie Stimson,/person/s/brie-stimson,"elon musk hosts 'snl,' jokes about space, cars...",The 49-year-old south africa-born billionaire ...


In [27]:
# Build the train/test CSV files from the summarised articles.
df = pd.read_json('foxnews_news_cleaned_summary.json')
# Keep only the columns needed downstream and drop rows whose sentiment is the
# literal string 'NA'. .copy() makes the result an independent frame so the
# column added below is not written into a slice of the loaded data.
df = df.loc[df['sentiment'] != 'NA', ['title', 'summary', 'sentiment']].copy()
# Integer class id for each sentiment string; any sentiment not listed here
# maps to NaN.
label = {'neutral':0,'positive':1,'negative':2}
df['labels'] = df['sentiment'].map(label)

# Split the train/test set: 20% held out, shuffled with a fixed seed so the
# split is reproducible.
train, test = train_test_split(df[['title', 'summary', 'sentiment', 'labels']], test_size=0.2, random_state=12, shuffle=True)
# Both frames already carry the columns title/summary/sentiment/labels in that
# order, so they are written as-is. Re-assigning the headers by hand was
# redundant and had mislabelled the test file's 'summary' column.
train.to_csv('data/train.csv', sep=',', index=False)
test.to_csv('data/test.csv', sep=',', index=False)