In [1]:
'''
A HTML scraper that gets the transcription for all 
EU Parliament verbatim reports from 01/2004 to 05/2024.
'''

'\nA HTML scraper that gets the transcription for all \nEU Parliament verbatim reports from 01/2004 to 05/2024.\n'

In [2]:
from bs4 import BeautifulSoup
import csv
import requests
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import os

In [3]:
def get_term(date):
    '''
    Returns the number of the parliamentary legislature,
    needed for retrieving the transcripts.
    '''
    
    # Start and end of each Parliament term, with the corresponding code
    date_ranges = {
        ("2019-07-02", "2024-06-01"): 9,
        ("2014-07-01", "2019-04-18"): 8,
        ("2009-07-14", "2014-04-17"): 7,  # Interval from 14 July 2009 to 17 April 2014
        ("2004-07-20", "2009-05-07"): 6
    }

    # Iterate through each date interval and corresponding code
    for (start_date, end_date), code in date_ranges.items():
        
        # Convert the start and end dates of the interval to pandas Timestamps
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        
        # Check if the input date falls within the interval
        if start <= date <= end:
            return code
    
    # Return None if no matching interval was found
    return np.nan


In [4]:
def build_url(term, date):
    '''
    Using the term and date, we build the structure of the
    URL that contains the speeches given in one particular date.
    '''
    
    # Formats the date object
    date_str = date.strftime('%Y-%m-%d')
    
    # Base URL for all verbatim reports
    url = f"https://www.europarl.europa.eu/doceo/document/CRE-{term}-{date_str}_EN.html"
    
    return url

In [5]:
def read_existing_log(filepath):
    '''
    Keeps trace of the URLs that were already saved,
    so we don't need to start over in case something
    goes wrong.
    '''
    
    try:
        # Read existing log file if it exists
        return pd.read_csv(filepath)
    except FileNotFoundError:
        # Return empty DataFrame if no log file exists
        return pd.DataFrame(columns=['Date', 'Status', 'Details'])

In [12]:
def main():
    
    # Directory for saving the log
    log_path = '../log.csv'
    
    # Load the existing log as dataframe, return a empty dataframe if there is no log
    existing_log = read_existing_log(log_path)
    
    # Save the processed and not found dates in a list for later control
    processed_dates = set(existing_log.loc[existing_log['Status'] == 'Success', 'Date'])
    not_found_dates = set(existing_log.loc[existing_log['Status'] == 'Not Found', 'Date'])

    # Open log file for appending
    with open(log_path, 'a', newline='') as log_file:
        
        log_writer = csv.writer(log_file)
        
        # Write header only if the file is empty
        if os.stat(log_path).st_size == 0:
            log_writer.writerow(['Date', 'Status', 'Details'])
        
        # Generate date range (first speech in website to end of April)
        dates = pd.date_range(start="2004-07-20", end="2024-05-01", freq='D')
        
        # Iterate through each date in the range
        for date in tqdm(dates):
            
            print(date, end= '\r')
            date_str = date.strftime('%Y-%m-%d')
            
            if date_str in processed_dates or date_str in not_found_dates:
                continue  # Skip dates that have been successfully processed or were not found before

            try:
                
                # Build the URL
                term = get_term(date)
                url = build_url(term, date)
                
                # Send the request
                r = requests.get(url)
                time.sleep(0.1)
                
                # Handle response
                
                # If the responde is good, write the file
                if r.status_code == 200:
                    fname = f"CRE-{term}-{date_str}"
                    with open(f"../../output/html/{fname}.txt", "w+") as f:
                        f.write(r.text)
                    log_writer.writerow([date_str, 'Success', 'File saved'])
                
                # If there was a 404 error, log it as not found
                elif r.status_code == 404:
                    log_writer.writerow([date_str, 'Not Found', 'URL not found'])
                
                # Else, save the error code
                else:
                    log_writer.writerow([date_str, r.status_code, 'Other HTTP error'])
            
            # If there was a RequestException, do the same
            except requests.exceptions.RequestException as e:
                # Handle requests exceptions
                log_writer.writerow([date_str, 'Request Error', str(e)])
            
            # Any other exception, save it as well
            except Exception as e:
                # Handle other exceptions
                log_writer.writerow([date_str, 'Error', str(e)])


In [13]:
main()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7226.0), HTML(value='')))

2004-07-20 00:00:002004-07-21 00:00:002004-07-22 00:00:002004-07-23 00:00:002004-07-24 00:00:002004-07-25 00:00:002004-07-26 00:00:002004-07-27 00:00:002004-07-28 00:00:002004-07-29 00:00:002004-07-30 00:00:002004-07-31 00:00:002004-08-01 00:00:002004-08-02 00:00:002004-08-03 00:00:002004-08-04 00:00:002004-08-05 00:00:002004-08-06 00:00:002004-08-07 00:00:002004-08-08 00:00:002004-08-09 00:00:002004-08-10 00:00:002004-08-11 00:00:002004-08-12 00:00:002004-08-13 00:00:002004-08-14 00:00:002004-08-15 00:00:002004-08-16 00:00:002004-08-17 00:00:002004-08-18 00:00:002004-08-19 00:00:002004-08-20 00:00:002004-08-21 00:00:002004-08-22 00:00:002004-08-23 00:00:002004-08-24 00:00:002004-08-25 00:00:002004-08-26 00:00:002004-08-27 00:00:002004-08-28 00:00:002004-08-29 00:00:002004-08-30 00:00:002004-08-31 00:00:002004-09-01 00:00:002004-09-02 00:00:002004-09-03 00:00:002004-09-04 00:00:002004-09-05 00:00:002004-09-06 00:00:002004-09-07 00:00:00

2024-05-01 00:00:00
