In [24]:
from bs4 import BeautifulSoup
import requests
import json

In [88]:
def get_content(url):
    """Extracts content from a news article URL.

    Makes a GET request to the article URL to retrieve 
    the HTML. Parses the HTML with BeautifulSoup and 
    extracts the article title, text, author and source 
    from structured data.

    Args:
        url (str): News article URL

    Returns:
        tuple: 
            (title, text, author, source) extracted 
            from article metadata.
    """    
    r = requests.get(url)

    # Create a BeautifulSoup object and specify the parser
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Main text of the article
    article = soup.find(type="application/ld+json")
    content = article.string
    
    # website, title, author, content
    source = json.loads(content)['publisher']['name']
    title = json.loads(content)['headline']
    print(json.loads(content).keys())
    
    if source in ['Daily Mail', 'The Sun']:
        # Title
        title = soup.find('title').text
        
        # Main text of the article
        main_text = soup.find_all('p')
        main_text = [p.text for p in main_text]
        main_text = ' '.join(main_text)
    else:
        main_text = json.loads(content)['articleBody']
    
    author = json.loads(content)['author']
    if type(author) == dict:
        author = author['name']
    elif type(author) == list:
        author = author[0]['name']
    
    
    
    return (title, main_text, author, source)

- Fox News
- The Onion
- Daily Mail
- The Sun

In [89]:
get_content("https://www.foxnews.com/opinion/must-stop-mask-mandates-before-biden-democrats-force-them-again-heres-how")

dict_keys(['@context', '@type', 'mainEntityOfPage', 'headline', 'articleBody', 'datePublished', 'dateModified', 'description', 'author', 'publisher', 'image'])


('We must stop mask mandates before Biden and Democrats force them on us again. Here’s how we do it',
 'Dr. Anthony Fauci, the former chief medical adviser to the president,&nbsp;took to CNN this past weekend to sound the alarm. According to him, COVID cases are spiking, and the American people urgently need to mask up. Is anyone surprised?&nbsp;&nbsp; Fauci and his cabal crave power over everything else. But if I have anything to say about it, they won’t ever have the authority to impose mask mandates or trample our freedoms again.&nbsp; As we learned in the pandemic, public health bureaucrats will force compliance when the people won’t abide their recommendations. With the latest seasonal spike, they are back to their old ways.&nbsp; BIDEN JOKES ABOUT BEING FORCED TO WEAR A MASK: \'DON\'T TELL THEM I DIDN\'T HAVE IT ON\' Already public health experts and university officials are calling for masks to be reimposed. Businesses and universities are reinstating mask mandates. Even an elem

In [90]:
get_content("https://www.theonion.com/tesla-cybertruck-torn-to-pieces-by-hose-1850990479")

dict_keys(['@type', '@context', 'url', 'author', 'headline', 'description', 'dateline', 'datePublished', 'dateModified', 'mainEntityOfPage', 'image', 'articleBody', 'articleSection', 'keywords', 'publisher', 'video'])


('Tesla Cybertruck Torn To Pieces By Hose',
 'HOLLISTER, CA—Its stainless steel panels immediately crumpling from the pressure of the stream of water, a Tesla Cybertruck was reportedly torn to pieces Friday after getting sprayed by a hose. As seen in the now-viral video shared across social media platforms, the 7,000-pound electric vehicle splintered into dozens of pieces under the onslaught of the $39.99 hose purchased from Lowe’s, each bead of water from the gardening implement puncturing holes in the truck’s body until the entire vehicle shuddered and collapsed into a pile of broken glass and shrapnel. In a statement, Tesla Motors CEO Elon Musk defended the build quality of the Cybertruck, blaming the damage on the hose being set to “jet” at the time of the incident. At press time, sources confirmed that the remaining, dripping-wet pieces of the vehicle pieces  had burst into flames. \n\n\n',
 'The Ellen Show',
 'The Onion')

In [91]:
get_content("https://www.dailymail.co.uk/health/article-12702971/GP-unblemished-40-year-career-gets-6-month-suspension-vitriolic-anti-vaxx-comments-including-one-kids-lined-Covid-vaccine-kill-them.html")

dict_keys(['@context', 'dateModified', 'image', 'author', 'video', 'id', 'mainEntityOfPage', 'name', 'articleBody', 'publisher', 'datePublished', '@type', 'description', 'headline'])


("GP with unblemished 40-year career gets 6-month suspension for 'vitriolic' anti-vaxx comments - including one that kids were being 'lined up' to get a Covid vaccine that could 'kill them' | Daily Mail Online",
 "By John Ely Senior Health Reporter For Mailonline    Published:  08:04, 3 November 2023   |  Updated:  10:06, 3 November 2023      130 View  comments  A GP with an unblemished career has been suspended for six months over 'vitriolic' comments she made about Covid and vaccines.\xa0 Londonderry-based Dr Mary McCloskey claimed the pandemic was a 'figment' of the media and the Government and that jabs don't work and\xa0were killing people.\xa0 She also claimed tests and face masks were being used as a psychological weapons to spread fear and experts were 'laughing' at the public via how they named variants of the virus.\xa0 In one of her most inflammatory statements on Covid vaccines, she claimed parents were being 'told to line up our children to get something that might kill th

In [92]:
get_content("https://www.thesun.co.uk/news/24602546/meat-bad-as-smoking-colin-robertson/")

dict_keys(['@context', '@type', 'url', 'publisher', 'headline', 'image', 'datePublished', 'dateModified', 'keywords', 'articleSection', 'mainEntityOfPage', 'author'])


('Telling shoppers that eating meat is as a bad as smoking and will bring on the apocalypse is banger out of order | The Sun',
 'Colin Robertson',
 'The Sun')