In [1]:
#scrape the website The Onion
#https://www.theonion.com/

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json
import datetime
from tqdm import tqdm
import sys
import os

#async 
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [144]:
#First, find all article links using the monthly sitemap pages.
#One CSV of links is written per year into the "the_onion" folder.

url = "https://www.theonion.com/sitemap/"
months = ["january","february","march","april","may","june","july","august","september","october","november","december"]

#create the output folder if needed; exist_ok makes re-running this cell safe
os.makedirs("the_onion", exist_ok=True)


for year in range(2003,2024):
    liste = []
    for month in months:
        url_request = url + str(year) + "/" + month
        #a timeout keeps one stalled connection from hanging the whole crawl
        page = requests.get(url_request, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        links = soup.find_all('h4')
        for link in links:
            anchor = link.find('a')
            #some headings may not wrap a link; skip them instead of crashing
            if anchor is not None and anchor.get('href'):
                liste.append(anchor['href'])
    df = pd.DataFrame(liste)
    #header=False: the cells that read these files use header=None, so a
    #header line (pandas would write "0") would be read back as a bogus link
    df.to_csv("the_onion/links_" + str(year) + ".csv", index=False, header=False)


In [3]:
#scrape a single article from its URL (the links come from the yearly CSV files)

def scrap_the_onion(url):
    """Download one article from The Onion and extract its main fields.

    The article metadata is read from the JSON payload of the first
    ``<script>`` tag found inside ``<main>``; the theme is the text of the
    first ``div`` carrying the ``sc-fek4t4-1 fKyolL`` class.

    Parameters
    ----------
    url : str
        URL of the article page.

    Returns
    -------
    tuple
        ``(headline, date, theme, article_body)``. ``date`` is a
        ``datetime.datetime`` built from the day part of ``datePublished``
        (the time of day is dropped). ``theme`` is ``np.nan`` when the page
        has no theme element, matching the asynchronous variant.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP status.
    AttributeError, KeyError, ValueError
        When the page lacks the expected ``<main>``/``<script>`` structure,
        the JSON is invalid, or an expected key/date format is missing.
    """
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    #fail loudly on error pages instead of trying to parse them as articles
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    #the script tag holds a dictionary in string format, so json.loads turns it into a dict
    content = json.loads(soup.find('main').find("script").get_text())

    #get theme; a page without one should not discard the whole article
    theme_nodes = soup.find_all('div', {'class' : "sc-fek4t4-1 fKyolL"})
    theme = theme_nodes[0].get_text() if theme_nodes else np.nan

    #convert date to datetime format, removing the hours
    published_day = content["datePublished"].split("T")[0]
    date = datetime.datetime.strptime(published_day, '%Y-%m-%d')

    return content["headline"], date, theme, content["articleBody"]


#The scraping loop calls this function as scrap_the_onion; the previous name is
#kept as an alias for backward compatibility. The async function defined later
#in the notebook reuses the scrap_the_onion_asyn name and replaces this alias
#once its cell runs.
scrap_the_onion_asyn = scrap_the_onion

In [135]:
#df["headline"], df["date"], df["article"] = zip(*df["link"].apply(scrap_the_onion))
#simple way to scrape all the articles in one line, but it runs sequentially and takes a long time

In [4]:
#read each yearly CSV of links and scrape the corresponding articles

#store in a dict all the errors that occurred, keyed by (year, row index)
errors = {}

for year in range(2012,2024):
    print("scraping year " + str(year) + "...")
    df = pd.read_csv("the_onion/links_" + str(year) + ".csv", header=None)
    df.columns = ["link"]
    #keep only real URLs: link files written with a pandas header line contain
    #a leading "0" row that would otherwise be scraped as if it were a link
    df = df[df["link"].astype(str).str.startswith("http")].reset_index(drop=True)
    df["headline"] = ""
    df["date"] = ""
    df["article"] = ""
    df["theme"] = ""
    for i in tqdm(range(len(df))):
        try:
            headline, date, theme, article = scrap_the_onion(df.at[i, "link"])
        except Exception as e:
            #best effort: record the failure and leave the row empty
            headline, date, theme, article = np.nan, np.nan, np.nan, np.nan
            errors[(year, i)] = e
        #write with .at rather than chained indexing (df["col"][i] = ...), which
        #may assign to a temporary copy and leave df unchanged
        df.at[i, "headline"] = headline
        df.at[i, "date"] = date
        df.at[i, "theme"] = theme
        df.at[i, "article"] = article

    df.to_csv("the_onion/articles_" + str(year) + ".csv", index=False)
    print("year " + str(year) + " done")

scraping year 2012...


100%|██████████| 2952/2952 [18:25<00:00,  2.67it/s]


year 2012 done
scraping year 2013...


100%|██████████| 3199/3199 [20:33<00:00,  2.59it/s]


year 2013 done
scraping year 2014...


100%|██████████| 2759/2759 [19:21<00:00,  2.38it/s]


year 2014 done
scraping year 2015...


100%|██████████| 2336/2336 [15:26<00:00,  2.52it/s]


year 2015 done
scraping year 2016...


100%|██████████| 2224/2224 [15:51<00:00,  2.34it/s] 


year 2016 done
scraping year 2017...


100%|██████████| 2685/2685 [26:03<00:00,  1.72it/s]   


year 2017 done
scraping year 2018...


100%|██████████| 3002/3002 [18:30<00:00,  2.70it/s]


year 2018 done
scraping year 2019...


 10%|▉         | 278/2823 [01:45<16:08,  2.63it/s]


KeyboardInterrupt: 

In [None]:
#We are at year 2019: the sequential run above was interrupted partway through 2019

In [3]:
async def fetch(session, url):
    """Return the body of ``url`` as text, or ``False`` if the request fails.

    ``session`` must provide ``get(url, headers=...)`` usable as an async
    context manager (an ``aiohttp.ClientSession`` in this notebook). Any
    exception raised while requesting or reading the response is swallowed and
    reported through the ``False`` sentinel.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        async with session.get(url, headers=request_headers) as response:
            body = await response.text()
    except Exception:
        return False
    return body
async def scrap_the_onion_asyn(session, url):
    """Asynchronously download and parse one article from The Onion.

    Parameters
    ----------
    session
        Session object handed to ``fetch`` (an ``aiohttp.ClientSession`` in
        this notebook).
    url : str
        URL of the article page.

    Returns
    -------
    tuple
        ``(headline, date, theme, article_body)``. ``date`` is a
        ``datetime.datetime`` built from the day part of ``datePublished``;
        ``theme`` is ``np.nan`` when the page has no theme element. When the
        download fails or the page cannot be parsed, all four values are
        ``np.nan`` so that one bad article never aborts a whole batch.
    """
    failed = (np.nan, np.nan, np.nan, np.nan)

    html = await fetch(session, url)
    # fetch() reports a failed download with the sentinel False; an identity
    # check cannot be confused with any other falsy value.
    if html is False:
        return failed

    try:
        soup = BeautifulSoup(html, 'html.parser')
        # The first <script> inside <main> carries the article metadata as JSON.
        content = json.loads(soup.find('main').find("script").get_text())

        theme_nodes = soup.find_all('div', {'class': "sc-fek4t4-1 fKyolL"})
        theme = theme_nodes[0].get_text() if theme_nodes else np.nan

        # Keep only the day part of the timestamp (drop the hours).
        published_day = content["datePublished"].split("T")[0]
        date = datetime.datetime.strptime(published_day, '%Y-%m-%d')

        return content["headline"], date, theme, content["articleBody"]
    except Exception:
        # Best effort on malformed pages. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt/SystemExit still stop the run.
        return failed



async def main_search(year, max_concurrency=20):
    """Scrape every article listed for ``year`` and save the result as CSV.

    Reads ``the_onion/links_{year}.csv``, scrapes each link with
    ``scrap_the_onion_asyn`` and writes ``the_onion/articles_{year}.csv`` with
    the link column plus ``headline``, ``date``, ``theme`` and ``article``.

    Parameters
    ----------
    year : int
        Year whose link file should be processed.
    max_concurrency : int, optional
        Maximum number of articles scraped at the same time. Bounding this
        avoids launching every request of the year at once.
    """
    df = pd.read_csv(f"the_onion/links_{year}.csv", header=None)
    # Keep only real URLs: link files written with a pandas header line contain
    # a leading "0" row that would otherwise be requested as if it were a link.
    df = df[df[0].astype(str).str.startswith("http")].reset_index(drop=True)
    urls = df[0].tolist()

    limiter = asyncio.Semaphore(max_concurrency)

    async def scrape_with_limit(session, url):
        # A request only starts once a slot is free, so time spent waiting in
        # the queue is not counted against the request itself.
        async with limiter:
            return await scrap_the_onion_asyn(session, url)

    async with aiohttp.ClientSession() as session:
        # gather() returns results in the same order as urls.
        results = await asyncio.gather(*(scrape_with_limit(session, url) for url in urls))

    if results:
        df["headline"], df["date"], df["theme"], df["article"] = zip(*results)
    else:
        # zip(*[]) yields nothing to unpack; still emit the expected columns.
        for column in ("headline", "date", "theme", "article"):
            df[column] = pd.Series(dtype=object)
    df.to_csv(f"the_onion/articles_{year}.csv", index=False)

if __name__ == '__main__':
    # Scrape the years 2020 through 2023 one after another on the current event loop.
    # NOTE(review): the sequential run earlier in this notebook was interrupted
    # during 2019 and this loop starts at 2020 - confirm 2019 was scraped elsewhere.
    loop = asyncio.get_event_loop()
    for year in (2020, 2021, 2022, 2023):
        loop.run_until_complete(main_search(year))
