In [None]:
import requests
import time
import json
import unicodedata
import re
import pandas as pd
import streamlit as st
import bs4
import typing
from bs4 import BeautifulSoup
from tqdm import tqdm
from os import path

In [None]:
# Some global variables
# Local JSON file holding the catalog: the list of article entries collected
# from the search API (each entry carries a relative 'url').
catalog_filename = 'foxnews_catalog.json'
# Local JSON file holding the downloaded article records.
# NOTE(review): the name is spelled 'foxnes' (not 'foxnews'). Left unchanged because
# previously saved data may already live at this path - confirm before renaming.
news_filename = 'foxnes_news.json'
# Site root that the catalog's relative 'url' values are appended to.
url_foxnews_base = 'https://www.foxnews.com'

In [None]:
# Utility functions
def FilterCatalogs(current: typing.Set[str], new: typing.List[dict]) -> typing.List[dict]:
    """Return the entries of ``new`` that are not in the catalog yet.

    Args:
        current: Absolute URLs (``url_foxnews_base`` + relative url) that are
            already known.
        new: Candidate entries; each must have a relative ``'url'`` key.

    Returns:
        A new list with the entries whose absolute URL is not in ``current``,
        in their original order. ``new`` itself is not modified.

    Raises:
        KeyError: If an entry has no ``'url'`` key.
    """
    return [entry for entry in new if url_foxnews_base + entry['url'] not in current]

### Crawl the Fox News site for Tesla-specific news

In [None]:
# Load the existing catalog. On the very first run there is no catalog file
# yet, so start from an empty list instead of failing.
catalog = []
if path.exists(catalog_filename):
    with open(catalog_filename, 'r') as infile:
        catalog = json.load(infile)
# Absolute URLs of everything already in the catalog, for duplicate detection.
catalogUrls = set(url_foxnews_base + c['url'] for c in catalog)

url_foxnews = 'https://www.foxnews.com/api/article-search'
page_size = 30  # number of results requested per API call
query = {
    "isTag" : 'true',
    "searchSelected" : "fox-news/auto/make/tesla",
    "size": page_size,
    "offset" : 0
}

prevCatalogCount = len(catalog)
# Walk the search results page by page until a page is empty or brings nothing new.
for x in tqdm(range(0,1000), ncols=45):
    query['offset'] = x * page_size
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url_foxnews, params=query, timeout=30)
    if r.status_code == 200:
        response = r.json()
        if response:
            newCatalogs = FilterCatalogs(catalogUrls, response)
            if len(newCatalogs) > 0:
                catalog.extend(newCatalogs)
                # Remember what was just added so that an article which shifts
                # onto a later page during the crawl is not added a second time.
                catalogUrls.update(url_foxnews_base + c['url'] for c in newCatalogs)
            else:
                print("There is no more news to add, coz latest downloaded news are already in the catalogs")
                print(f"Last iteration: {x} (query: {query})")
                break
        else:
            print(f"Got blank response at iteration {x*30}")
            break
    else:
        # The page is skipped (not retried); report it instead of staying silent.
        print(f"Got HTTP status {r.status_code} at offset {query['offset']}, skipping this page")
    # Be polite to the server between requests.
    time.sleep(10)

print(f"Total new articles: {len(catalog) - prevCatalogCount}")

In [None]:
# Write the in-memory catalog back to its JSON file (4-space indented).
with open(catalog_filename, mode='w') as catalog_file:
    json.dump(catalog, catalog_file, indent=4)

### Retrieve the text content from the foxnews catalog

In [None]:
# Utility functions
def SanitizerAdWords(content: str) -> str:
    """Lower-case ``content`` and delete every phrase listed in ``ad_words``.

    ``ad_words`` is a module-level list that must be defined before this
    function is called. Phrases are removed one after another, in list order,
    from the lower-cased text.

    Returns:
        The lower-cased text with all listed phrases removed.
    """
    cleaned = content.lower()
    for phrase in ad_words:
        cleaned = cleaned.replace(phrase, '')
    return cleaned

def SanitizerHtml(content: str) -> str:
    """Normalise a text fragment and make it end with exactly one period.

    Steps:
      1. Apply Unicode NFKD normalisation.
      2. Replace curly quotes and en/em dashes with their ASCII counterparts
         and drop zero-width spaces.
      3. Trim surrounding whitespace and any trailing periods, then append a
         single ``'.'``.

    Normalisation and replacement run before trimming so that trailing
    whitespace, a zero-width space, or an ellipsis character after the last
    period cannot leave a doubled period (e.g. ``"Hello. "`` -> ``"Hello."``).

    Args:
        content: Raw text, e.g. the ``.text`` of an HTML element.

    Returns:
        The cleaned text; an empty input yields ``"."``.
    """
    replacements = str.maketrans({
        "\u2018": "'", "\u2019": "'",   # single curly quotes
        "\u2013": "-", "\u2014": "-",   # en dash / em dash
        "\u201c": '"', "\u201d": '"',   # double curly quotes
        "\u200b": None,                 # zero-width space
    })
    text = unicodedata.normalize("NFKD", content).translate(replacements)
    # strip() first so a period followed by whitespace is still recognised as
    # trailing; the final rstrip() removes a space that preceded that period.
    text = text.strip().rstrip('.').rstrip()
    return text + '.'
    
# Remove unrelated content - specific to fox news
def SanitizerNonArticleTags(tag: bs4.element.Tag):
    """Remove the featured-video element from ``tag``, in place.

    Looks up the first descendant whose class attribute is
    ``"featured featured-video video-ct"`` and decomposes it. Nothing happens
    when no such element is found. Returns ``None``.
    """
    video_block = tag.find(attrs={"class":"featured featured-video video-ct"})
    if not video_block:
        return
    video_block.decompose()

def GetContent(url: str) -> typing.Dict[str,str]:
    """Download one article page and extract its text and author.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with these keys:
          * ``"body"``       - sanitised headline followed by the sanitised
            paragraphs of the ``article-body`` element, lower-cased with ad
            phrases removed; ``''`` if the page could not be fetched or has
            no article body.
          * ``"authorName"`` / ``"authorUrl"`` - text and ``href`` of the first
            link inside the ``author-byline`` element; ``''`` if absent.
          * ``"author_name"`` / ``"author_url"`` - same values, only present
            when an author link was found. Earlier versions of this function
            wrote the author under these keys, so they are kept for records
            and readers that rely on them.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            server does not answer within the timeout.
    """
    output = {"body":'', "authorName":'', "authorUrl":''}
    # A timeout keeps one stalled connection from blocking the whole download run.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return output

    html = BeautifulSoup(r.text, 'html.parser')

    # Author: not every page has a byline, and a byline may contain no link.
    byline = html.find(attrs={"class": "author-byline"})
    article_author = byline.find("a") if byline is not None else None
    if article_author is not None:
        author_url = article_author.get('href', '')
        author_name = article_author.text
        output['authorUrl'] = author_url
        output['authorName'] = author_name
        # Legacy key names (see docstring).
        output['author_url'] = author_url
        output['author_name'] = author_name

    article_body = html.find(attrs={"class": "article-body"})
    if article_body is None:
        return output
    SanitizerNonArticleTags(article_body)

    parts = []
    headline = html.find(attrs={"class": "headline"})
    if headline is not None:
        parts.append(SanitizerHtml(headline.text))
    parts.extend(SanitizerHtml(p.text) for p in article_body.find_all('p'))

    # SanitizerHtml already terminates every part with a period, so a plain
    # space is the right separator (adding '. ' would double the period).
    content = SanitizerAdWords(' '.join(parts))
    output['body'] = unicodedata.normalize("NFKD", content)
    return output

In [None]:
# Load the catalog for retrieving the news content
with open(catalog_filename, 'r') as infile:
    catalog = json.load(infile)

# Load the news downloaded so far (empty on the first run, when no file exists)
news = []
if path.exists(news_filename):
    with open(news_filename, 'r') as infile:
        news = json.load(infile)

# Ad words that we want to remove from the news content, one phrase per line.
# SanitizerAdWords lower-cases the article text before matching, so the phrases
# are lower-cased here too; otherwise an entry containing a capital letter could
# never match. Blank / whitespace-only lines are skipped so they cannot become
# phrases themselves.
with open('ad_words.txt', 'r') as infile:
    ad_words = [line.rstrip('\n').lower() for line in infile if line.strip()]

In [None]:
# Scratch (disabled): fetch a single article page and look up its author-byline element.
# url = 'https://www.foxnews.com/auto/teslas-semitruck-%e2%94%80-what-to-expect'
# r = requests.get(url)
# html = BeautifulSoup(r.text, 'html.parser')
# article_author = html.find(attrs={"class": "author-byline"})

In [None]:
# Scratch (disabled): run GetContent on a single article URL and print the result.
# NOTE(review): print() returns None, so `aaa` would always be None if this were re-enabled.
# url = 'https://www.foxnews.com/auto/teslas-semitruck-%e2%94%80-what-to-expect'
# aaa = print(GetContent(url))

In [None]:
# Download the content of every catalog entry that is not in `news` yet.
news_urlcache = set(n['url'] for n in news)
for entry in tqdm(catalog):
    url = url_foxnews_base + entry['url']
    if url in news_urlcache:
        continue
    content = GetContent(url)
    news.append({'url': url, 'body':content, 'sentiment': ''})
    # Record the URL right away so a duplicate catalog entry is not fetched
    # and stored a second time within the same run.
    news_urlcache.add(url)
    # Short pause between requests to go easy on the server.
    time.sleep(1)

In [None]:
# Write the collected news records back to their JSON file (4-space indented).
with open(news_filename, mode='w') as news_file:
    json.dump(news, news_file, indent=4)