In [12]:
import requests
import time
import json, csv
import unicodedata
import re
import pandas as pd
import streamlit as st
import bs4
import typing
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from tqdm import tqdm
from os import path

In [4]:
# Some global variables
# JSON file the article catalog is loaded from and saved back to
catalog_filename = 'foxnews_catalog.json'
# JSON file the downloaded news entries (url, body, author, sentiment) are cached in
news_filename = 'foxnews_news.json'
# Prefix prepended to each catalog entry's 'url' value to form the full article address
url_foxnews_base = 'https://www.foxnews.com'

In [None]:
# Utility functions
def FilterCatalogs(current: typing.Set[str], new: typing.List[dict], base_url: typing.Optional[str] = None) -> typing.List[dict]:
    """Return the entries of ``new`` whose full URL is not already known.

    Args:
        current: full URLs (base URL + entry 'url') that are already in the catalog.
        new: candidate catalog entries; each must provide a 'url' key.
        base_url: prefix prepended to each entry's 'url' before the lookup.
            Defaults to the notebook-level ``url_foxnews_base``.

    Returns:
        A new list holding the unseen entries, in their original order.

    Raises:
        KeyError: if an entry has no 'url' key.
    """
    if base_url is None:
        base_url = url_foxnews_base
    return [entry for entry in new if base_url + entry['url'] not in current]

### Crawl the Fox News site for Tesla-specific news

In [None]:
# Load the existing catalog; start from an empty one when no catalog file exists yet
# (mirrors how the news cache is loaded further down).
catalog = []
if path.exists(catalog_filename):
    with open(catalog_filename, 'r') as infile:
        catalog = json.load(infile)
catalogUrls = set(url_foxnews_base + c['url'] for c in catalog)

url_foxnews = 'https://www.foxnews.com/api/article-search'
query = {
    "isTag" : 'true',
    "searchSelected" : "fox-news/auto/make/tesla",
    "size": 30,
    "offset" : 0
}

# Page through the search API until a page is empty, brings nothing new, or fails.
prevCatalogCount = len(catalog)
for x in tqdm(range(0,1000), ncols=45):
    query['offset'] = x * query['size']
    # timeout so a stalled connection cannot hang the notebook indefinitely
    r = requests.get(url_foxnews, params=query, timeout=30)
    if r.status_code != 200:
        # Stop instead of silently skipping the page and hammering a failing endpoint.
        print(f"Request failed with HTTP status {r.status_code} at iteration {x} (query: {query})")
        break
    response = r.json()
    if not response:
        print(f"Got blank response at iteration {x} (offset {query['offset']})")
        break
    newCatalogs = FilterCatalogs(catalogUrls, response)
    if not newCatalogs:
        print("There is no more news to add, coz latest downloaded news are already in the catalogs")
        print(f"Last iteration: {x} (query: {query})")
        break
    catalog.extend(newCatalogs)
    # Keep the lookup set in sync so an entry repeated on a later page is not added twice.
    catalogUrls.update(url_foxnews_base + c['url'] for c in newCatalogs)
    time.sleep(10)  # be polite to the API between pages

print(f"Total new articles: {len(catalog) - prevCatalogCount}")

In [None]:
# Write the in-memory catalog back to its JSON file (4-space indented)
with open(catalog_filename, mode='w') as catalog_file:
    json.dump(catalog, catalog_file, indent=4)

### Retrieve the text content from the foxnews catalog

In [None]:
# Utility functions
def SanitizerAdWords(content: str, words: typing.Optional[typing.Iterable[str]] = None) -> str:
    """Lower-case ``content`` and delete every ad phrase from it.

    Args:
        content: text to clean.
        words: phrases to remove. Defaults to the notebook-level ``ad_words`` list.

    Returns:
        The lower-cased text with all occurrences of each phrase removed.
    """
    if words is None:
        words = ad_words
    content = content.lower()
    for phrase in words:
        # The text was lower-cased above, so the phrase must be too or it can never match.
        phrase = phrase.lower()
        if phrase:  # blank lines in the word list are no-ops
            content = content.replace(phrase, '')
    return content

def SanitizerHtml(content: str) -> str:
    """Normalise a text fragment and make it end with exactly one period.

    Surrounding whitespace and trailing periods are removed, the text is
    NFKD-normalised, a single '.' is appended, and typographic quotes/dashes
    are replaced by their ASCII counterparts; zero-width spaces are dropped.

    Args:
        content: raw text, e.g. the ``.text`` of an HTML element.

    Returns:
        The cleaned text, always terminated by a single '.'.
    """
    # Strip whitespace BEFORE the periods: otherwise "text. " keeps its period
    # (it is hidden behind the trailing space) and ends up as "text..".
    trimmed = content.strip().rstrip('.').rstrip()
    temp = unicodedata.normalize("NFKD", trimmed) + '.'
    output = re.sub("[\u2018\u2019]", "'", temp)      # curly single quotes
    output = re.sub("[\u2013\u2014]", "-", output)    # en/em dashes
    output = re.sub("[\u201c\u201d]", '"', output)    # curly double quotes
    output = re.sub("\u200b", '', output)             # zero-width space
    return output
    
# Remove unrelated content - specific to fox news
def SanitizerNonArticleTags(tag: "bs4.element.Tag"):
    """Remove known non-article elements from ``tag`` in place.

    For each listed class value, the first descendant whose ``class`` attribute
    matches is looked up with ``tag.find`` and removed with ``decompose()``.

    Args:
        tag: parsed element to clean; ``None`` is accepted and ignored.
    """
    if tag is None:
        return
    for classPath in (
        "featured featured-video video-ct",
        "speakable",
    ):
        node = tag.find(attrs={"class": classPath})
        # The removal must happen inside the loop; otherwise only the last
        # class in the list is ever removed.
        if node is not None:
            node.decompose()

def GetContent(url: str) -> typing.Dict[str,str]:
    """Download one article page and extract its text and author details.

    Args:
        url: full address of the article page.

    Returns:
        A dict with the keys 'body', 'authorName' and 'authorUrl'. Values stay
        empty strings when the download fails, the response status is not 200,
        or the corresponding element is missing from the page. An empty 'body'
        lets the news-cache loader drop the entry so it is retried later.
    """
    output = {"body":'', "authorName":'', "authorUrl":''}
    try:
        # timeout so one stalled connection cannot hang the whole download loop
        r = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print(f"Failed to download {url}: {err}")
        return output
    if r.status_code != 200:
        return output

    html = BeautifulSoup(r.text, 'html.parser')

    # Not every page carries a byline, so guard each lookup.
    byline = html.find(attrs={"class": "author-byline"})
    article_author = byline.find("a") if byline is not None else None
    if article_author is not None:
        # name is the link text, URL is the link target
        output['authorName'] = article_author.text
        output['authorUrl'] = article_author.get('href', '')

    article_body = html.find(attrs={"class": "article-body"})
    if article_body is None:
        return output

    headline = html.find(attrs={"class": "headline"})
    article_title = SanitizerHtml(headline.text) if headline is not None else ''

    SanitizerNonArticleTags(article_body)
    paragraphs = [SanitizerHtml(x.text) for x in article_body.find_all('p')]
    # SanitizerHtml already terminates every piece with '.', so a plain space is
    # the right separator (adding '. ' after the title produced a doubled period).
    content = ' '.join(piece for piece in [article_title] + paragraphs if piece)
    content = SanitizerAdWords(content)
    content = unicodedata.normalize("NFKD",content)
    output['body'] = content
    return output

In [5]:
# Read the article catalog that drives the content download
catalog = None
with open(catalog_filename, 'r') as catalog_file:
    catalog = json.load(catalog_file)

# Read the previously downloaded news, when a cache file exists
news = []
if path.exists(news_filename):
    with open(news_filename, 'r') as news_file:
        # entries without a body are left out of the cache so their download is attempted again
        news = [item for item in json.load(news_file) if item['body']]

# Ad phrases (one per line) that get stripped from the news content
ad_words = None
with open('ad_words.txt', 'r') as words_file:
    ad_words = [line.rstrip('\n') for line in words_file]

In [None]:
# url = 'https://www.foxnews.com/auto/teslas-semitruck-%e2%94%80-what-to-expect'
# r = requests.get(url)
# html = BeautifulSoup(r.text, 'html.parser')
# article_author = html.find(attrs={"class": "author-byline"})

In [None]:
# url = 'https://www.foxnews.com/auto/teslas-new-roadster-will-deliver-hardcore-smackdown-to-gasoline-cars-musk'
# aaa = print(GetContent(url))

In [None]:
# Download the content of every catalog article that is not in the news cache yet
news_urlcache = set(n['url'] for n in news)
for entry in tqdm(catalog):
    url = url_foxnews_base + entry['url']
    if url in news_urlcache:
        continue
    print(f'new url not in cache {url}')
    content = GetContent(url)
    news.append({'url': url, 'body':content['body'], 'authorUrl':content['authorUrl'], 'authorName':content['authorName'], 'sentiment': ''})
    # Record the URL so a catalog entry listed twice is fetched and stored only once.
    news_urlcache.add(url)
    time.sleep(5)  # pause between page requests

In [None]:
# Write the collected news entries to their JSON file (4-space indented)
with open(news_filename, mode='w') as news_file:
    json.dump(news, news_file, indent=4)

In [None]:
# Check the data: label distribution, then the rows that still have no sentiment.
# Read through news_filename so this check always looks at the file the save cell wrote.
data = pd.read_json(news_filename)
print(data.sentiment.value_counts())
print(data[data.sentiment == ''])

In [10]:
# Save the news to csv
# to_csv needs a file path; 'data/' is a directory and cannot be opened for writing.
# NOTE(review): the file name below is a guess - adjust it to the intended target.
df = pd.DataFrame(news)
df.to_csv('data/foxnews_news.csv', index=False)

In [47]:
# Save to training/test set
df = pd.DataFrame(news)
label={'neutral':0,'positive':1,'negative':2}
# Keep only rows whose sentiment has a label. This drops 'NA' as before and also the
# not-yet-annotated rows (sentiment ''), which would otherwise map to NaN labels.
# .copy() makes the column assignment below act on an independent frame.
df = df.loc[df['sentiment'].isin(list(label)), ['body','sentiment']].copy()
df['labels']=df['sentiment'].map(label).astype(int)

# Split the train/test set (fixed seed keeps the split reproducible)
train, test = train_test_split(df[['body','labels']], test_size=0.2, random_state=18, shuffle=True)
train.columns = ['text','labels']
test.columns = ['text','labels']
train.to_csv('data/train.csv', sep=',', index=False)
test.to_csv('data/test.csv', sep=',', index=False)

In [48]:
# Preview the first rows of the training split, then of the test split
for split in (train, test):
    print(split.head())

                                                  text  labels
224  police report says tesla on autopilot sped up,...       2
246  will your smartphone replace your car key?. th...       2
251  tesla's new electric semi truck makes its debu...       1
397  tesla's autopilot 8.0 protects kids and pets l...       1
57   chuck devore: elon musk's tesla to texas? cali...       0
                                                  text  labels
109  is this what tesla's 'cybertruck' pickup will ...       0
370  tesla pays off government loan nine years earl...       1
51   elon musk lists 5 more homes for $97 million a...       2
371  consumer reports gives tesla model s highest r...       1
221  elon musk and malala are twitter bffs. (ap/spa...       0


In [41]:
# Spot-check a single row of df, selected by its index label
df.loc[51]

body         elon musk lists 5 more homes for $97 million a...
sentiment                                             negative
y                                                            2
Name: 51, dtype: object