In [75]:
import requests
import time
import json
import unicodedata
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

In [76]:
# Some global variables
catalog_filename = 'foxnews_catalog.json'
news_filename = 'foxnes_news.json'
url_foxnews_base = 'https://www.foxnews.com'

In [77]:
# Utility functions
def FilterCatalogs(current: set(), new: []) -> []:
    output = []
    for n in new:
        if url_foxnews_base + n['url'] not in current:
            output.append(n)
    return output

### Crawl the Foxnews side for Tesla specific news

In [78]:
# Load Existing Catalogs
catalog = None
catalogUrls = set()
with open(catalog_filename, 'r') as infile:
    catalog = json.load(infile)
    catalogUrls = set([url_foxnews_base+c['url'] for c in catalog])

url_foxnews = 'https://www.foxnews.com/api/article-search'
query = {
    "isTag" : 'true',
    "searchSelected" : "fox-news/auto/make/tesla",
    "size": 30,
    "offset" : 0
}

for x in tqdm(range(0,1000), ncols=45):
    query['offset'] = x * 30
    r = requests.get(url_foxnews, params=query)
    if r.status_code == 200:
        if r.text != '[]':
            response = r.json()
            newCatalogs = FilterCatalogs(catalogUrls, response)
            if len(newCatalogs) > 0:
                catalog.extend(r.json())
            else:
                print("There is no more news to add, coz latest downloaded news are already in the catalogs")
                break
        else:
            print(f"Got blank response at iteration {x*30}")
            break
    time.sleep(10)

  0%|               | 0/1000 [00:00<?, ?it/s]

There is no more news to add, coz latest downloaded news are already in the catalogs





In [19]:
with open(catalog_filename, 'w') as outfile:
    json.dump(catalog, outfile, indent=4)

### Retrieve the text content from the foxnews catalog

In [16]:
with open(catalog_filename, 'r') as infile:
    catalog = json.load(infile)
    
# Ads Words
ad_words = None
with open('ad_words.txt', 'r') as infile:
    ad_words = [line.rstrip('\n') for line in infile.readlines()]

In [72]:
# Utility functions
def SanitizerAdWords(content: str) -> str:
    content = content.lower()
    for a in ad_words:
        content = content.replace(a,'')
    return content

def SanitizerHtml(content: str) -> str:
        temp = unicodedata.normalize("NFKD",content.rstrip('.').strip()) + '.'
        output = re.sub(u"(\u2018|\u2019)", "'", temp)
        return output

def GetContent(url: str) -> str:
    content = None
    r = requests.get(url)
    if r.status_code == 200:
        html = BeautifulSoup(r.text, 'html.parser')
        article_title = html.find(attrs={"class": "headline"}).text
        article_body = html.find(attrs={"class": "article-body"})
        #print(article_body)
        content = article_title + '.' + ' '.join([SanitizerHtml(x.text) for x in article_body.find_all('p')])
        #print(content)
        content = SanitizerAdWords(content)
        content = unicodedata.normalize("NFKD",content)
    return content

In [73]:
news = []
for inx in tqdm(range(len(catalog))):
    url = url_foxnews_base + catalog[inx]['url']
    content = GetContent(url)
    news.append({'url': url, 'body':content, 'sentiment': ''}) 
    break

  0%|          | 0/409 [00:00<?, ?it/s]


In [163]:
with open(news_filename, 'w') as outfile:
    json.dump(news, outfile, indent=4)

In [36]:
print(news[0]['body'])

Tesla's bizarre swipe-to-drive gear selector revealed in video.‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’ ‘Hi’


In [96]:
max = 5
for c in catalog:
    print(url_foxnews_base + c['url'])
    if max == 0:
        break
    max -= 1

https://www.foxnews.com/auto/teslas-bizarre-swipe-to-drive-gear-selector
https://www.foxnews.com/auto/tesla-cancels-full-self-driving-expansion-update
https://www.foxnews.com/auto/tesla-autopilot-michigan-state-police-car
https://www.foxnews.com/auto/detroit-police-tesla-autopilot-semi-crash
https://www.foxnews.com/auto/nhtsa-investigating-violent-tesla-crash-semi
https://www.foxnews.com/auto/tesla-fires-self-driving-testers-camera
