In [92]:
import requests
import time
import json
import unicodedata
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from os import path

In [3]:
# Notebook-wide constants: on-disk file names and the root URL of the site.
# NOTE(review): "foxnes" looks like a misspelling of "foxnews"; the name is kept
# unchanged because it is the path the notebook reads from and writes to.
catalog_filename = "foxnews_catalog.json"
news_filename = "foxnes_news.json"
url_foxnews_base = "https://www.foxnews.com"

In [3]:
# Utility functions
def FilterCatalogs(current: set, new: list, base_url: str = None) -> list:
    """Return the entries of ``new`` whose absolute URL is not in ``current``.

    Args:
        current: Absolute article URLs that are already known.
        new: Catalog entries; each one is indexed with ``'url'`` and that value
            is appended to ``base_url`` to build the absolute URL.
        base_url: Prefix put in front of each entry's ``'url'`` before the
            lookup. When omitted, the notebook-level ``url_foxnews_base`` is
            used.

    Returns:
        A new list holding the unseen entries in their original order.
    """
    if base_url is None:
        # Resolved at call time so existing two-argument callers keep working.
        base_url = url_foxnews_base
    return [entry for entry in new if base_url + entry['url'] not in current]

### Crawl the Fox News site for Tesla-specific news

In [7]:
# Load the existing catalog. On the very first run the catalog file has not
# been written yet, so start from an empty catalog instead of failing.
catalog = []
if path.exists(catalog_filename):
    with open(catalog_filename, 'r') as infile:
        catalog = json.load(infile)

# Absolute URLs of every known entry, used to spot items already in the catalog
catalogUrls = {url_foxnews_base + c['url'] for c in catalog}

url_foxnews = 'https://www.foxnews.com/api/article-search'
page_size = 30  # number of results requested per API call
query = {
    "isTag" : 'true',
    "searchSelected" : "fox-news/auto/make/tesla",
    "size": page_size,
    "offset" : 0
}

# Walk the search API page by page until a page is empty or brings nothing new
for x in tqdm(range(0,1000), ncols=45):
    query['offset'] = x * page_size
    # A timeout keeps one stalled connection from hanging the whole crawl
    r = requests.get(url_foxnews, params=query, timeout=30)
    if r.status_code == 200:
        response = r.json()
        if not response:
            print(f"Got blank response at offset {query['offset']}")
            break
        newCatalogs = FilterCatalogs(catalogUrls, response)
        if not newCatalogs:
            print("There is no more news to add, coz latest downloaded news are already in the catalogs")
            print(f"Last iteration: {x} (query: {query})")
            break
        # Add only the unseen entries; extending with the whole response would
        # duplicate entries that are already in the catalog
        catalog.extend(newCatalogs)
        catalogUrls.update(url_foxnews_base + c['url'] for c in newCatalogs)
    else:
        # Keep going with the next page, but make the failure visible
        print(f"Request failed with HTTP status {r.status_code} at offset {query['offset']}")
    # Pause between calls to go easy on the API
    time.sleep(10)

  0%|               | 0/1000 [00:00<?, ?it/s]

There is no more news to add, coz latest downloaded news are already in the catalogs
Last iteration: 0 (query: {'isTag': 'true', 'searchSelected': 'fox-news/auto/make/tesla', 'size': 30, 'offset': 0})





In [19]:
# Write the in-memory catalog back to disk as indented JSON
with open(catalog_filename, mode='w') as catalog_file:
    json.dump(catalog, catalog_file, indent=4)

### Retrieve the text content from the foxnews catalog

In [89]:
# Utility functions
def SanitizerAdWords(content: str, words=None) -> str:
    """Lower-case ``content`` and delete every advertising phrase from it.

    Args:
        content: Text to clean.
        words: Iterable of phrases to remove. When omitted, the notebook-level
            ``ad_words`` list is used.

    Returns:
        The lower-cased text with every occurrence of each phrase removed.
    """
    if words is None:
        # Resolved at call time so existing one-argument callers keep working.
        words = ad_words
    content = content.lower()
    for phrase in words:
        # The text was lower-cased above, so the phrase has to be lower-cased
        # too; otherwise a phrase containing capitals could never match.
        content = content.replace(phrase.lower(), '')
    return content

# Typographic quotes and dashes mapped to their plain ASCII counterparts
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2018": "'", "\u2019": "'",
    "\u2013": "-", "\u2014": "-",
    "\u201c": '"', "\u201d": '"',
})

def SanitizerHtml(content: str) -> str:
    """Normalise one paragraph of text and make it end with a single period.

    Surrounding whitespace and trailing periods are removed, the text is
    NFKD-normalised, a period is appended, and curly quotes and en/em dashes
    are replaced with ``'``, ``"`` and ``-``.

    Args:
        content: Raw paragraph text.

    Returns:
        The cleaned paragraph. An empty input yields ``"."``.
    """
    # Strip whitespace before the periods: with the opposite order a paragraph
    # such as "text. " kept its period and ended up as "text..". The final
    # rstrip() removes any space that sat in front of the trailing period.
    trimmed = content.strip().rstrip('.').rstrip()
    normalized = unicodedata.normalize("NFKD", trimmed) + '.'
    return normalized.translate(_TYPOGRAPHIC_TO_ASCII)
    
# Remove unrelated content - specific to fox news
def SanitizerNonArticleTags(tag: "bs4.element.Tag") -> None:
    """Remove the featured-video element from ``tag`` in place.

    The annotation is written as a string because only ``BeautifulSoup`` is
    imported from ``bs4``; an unquoted ``bs4.element.Tag`` is evaluated when
    the function is defined and raises ``NameError``.

    Args:
        tag: Parsed element to clean. ``None`` is accepted and ignored.
    """
    if tag is None:
        return
    featured = tag.find(attrs={"class": "featured featured-video video-ct"})
    if featured is not None:
        featured.decompose()

def GetContent(url: str) -> str:
    """Download an article page and return its cleaned text.

    The result is the headline followed by the article's paragraphs. Each
    paragraph goes through ``SanitizerHtml``, the combined text goes through
    ``SanitizerAdWords``, and the outcome is NFKD-normalised.

    Args:
        url: Absolute URL of the article.

    Returns:
        The cleaned text, or ``None`` when the response status is not 200 or
        the page has no ``headline`` / ``article-body`` element.
    """
    # A timeout keeps one stalled connection from hanging a long download run
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return None

    html = BeautifulSoup(r.text, 'html.parser')
    headline = html.find(attrs={"class": "headline"})
    article_body = html.find(attrs={"class": "article-body"})
    if headline is None or article_body is None:
        # The page lacks the expected article elements; report "no content"
        # the same way a failed request does instead of raising AttributeError
        return None

    SanitizerNonArticleTags(article_body)
    paragraphs = ' '.join(SanitizerHtml(p.text) for p in article_body.find_all('p'))
    content = SanitizerAdWords(headline.text + '. ' + paragraphs)
    return unicodedata.normalize("NFKD", content)

In [94]:
# Catalog of the articles whose text should be downloaded
catalog = None
with open(catalog_filename, mode='r') as catalog_file:
    catalog = json.load(catalog_file)

# News downloaded so far; stays empty when the news file does not exist
news = []
if path.exists(news_filename):
    with open(news_filename, mode='r') as news_file:
        news = json.load(news_file)

# Advertising phrases to strip from the article text, one phrase per line
ad_words = None
with open('ad_words.txt', mode='r') as ad_words_file:
    ad_words = [line.rstrip('\n') for line in ad_words_file]

In [97]:
# news = []
# news_urlcache = set()

0

In [91]:
# Smoke test: download one article and print the text GetContent returns
url = 'https://www.foxnews.com/science/spacexs-first-astronaut-launch-breaking-ground'
print(GetContent(url))

spacex’s first astronaut launch breaking ground with new look: ‘it is really neat’. the first astronauts launched by spacex are breaking new ground for style by unveiling hip spacesuits, gull-wing teslas and even a sleek rocketship with a black and white trim. the color coordination is credited to elon musk, the driving force behind spacex and tesla who is also a science fiction fan. nasa astronauts doug hurley and bob behnken approved the "fresh new look," the associated press reported on monday. the pair will catch a ride to the launch pad in a tesla model x electric car. "it is really neat, and i think the biggest testament to that is my 10-year-old son telling me how cool i am now," hurley told the outlet. the 53-year-old noted "spacex has gone all out" on the capsule's appearance. tulsa unveils gigantic elon musk statue to help lure tesla to town. this undated photo made available by spacex shows nasa astronaut bob behnken in his spacesuit at spacex headquarters in hawthorne, cali

In [98]:
# URLs whose text is already stored in `news`
news_urlcache = {n['url'] for n in news}
for entry in tqdm(catalog):
    url = url_foxnews_base + entry['url']
    if url in news_urlcache:
        continue
    content = GetContent(url)
    if content is None:
        # Nothing was retrieved: do not store an empty record, so the URL is
        # not treated as done and a later run can try it again
        continue
    news.append({'url': url, 'body': content, 'sentiment': ''})
    # Remember the URL so a duplicated catalog entry is not downloaded twice
    news_urlcache.add(url)

100%|████████████████████████████████████████████████████████████████████████████████| 409/409 [03:40<00:00,  1.85it/s]


In [99]:
# Write the collected news records to disk as indented JSON
with open(news_filename, mode='w') as news_file:
    json.dump(news, news_file, indent=4)

[{'url': 'https://www.foxnews.com/auto/teslas-bizarre-swipe-to-drive-gear-selector',
  'body': 'tesla\'s bizarre swipe-to-drive gear selector revealed in video. with a bulletproof body and 500-mile battery-powered range, tesla\'s electric pickup isn\'t like every other truck. the first video of tesla\'s new on-screen "gear" selector system in operation has appeared on twitter. elon musk previously announced the feature, which is debuting on the updated model s and model x vehicles that were revealed in january. twitter user michael hsu posted the clip which shows a driver swiping a car icon in the top left corner of the screen to choose forward and reverse. hsu would not confirm to fox news autos the source of the video, or whether it was a customer or tesla employee demonstrating the system. a separate image posted by @nickhoward shows that neutral is engaged by a separate icon on the screen. drive, reverse and neutral can also be selected using a knob on the steering wheel, but there