## 1. Set-Up

### 1.1 Import Libraries

In [2]:
import json
import os
import pickle
import re
import time
from collections import defaultdict
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from wayback import WaybackClient

### 1.2 Set-Up Cache

In [3]:
# Cache management
CACHE_FILE = 'wayback_cache_TEST.pkl'


def load_cache():
    """Load the scrape cache from ``CACHE_FILE``.

    ``CACHE_FILE`` is read at call time, so rebinding the module-level name
    and calling this function again loads a different cache file.

    Returns:
        The unpickled cache object, or an empty dict when the file is
        missing, empty, or cannot be unpickled (e.g. a truncated write).
    """
    # NOTE: pickle can execute arbitrary code while loading; only point
    # CACHE_FILE at files this notebook wrote itself.
    if not (os.path.exists(CACHE_FILE) and os.path.getsize(CACHE_FILE) > 0):
        return {}
    with open(CACHE_FILE, 'rb') as f:
        try:
            return pickle.load(f)
        except (EOFError, pickle.UnpicklingError) as e:
            # A partially written file may raise either exception; start
            # over with an empty cache instead of aborting the run.
            print(f"Ignoring unreadable cache file {CACHE_FILE}: {e}")
            return {}


def save_cache(cache):
    """Save the cache to ``CACHE_FILE``.

    The data is written to a temporary sibling file and then moved into
    place, so an interrupted save does not leave a half-written cache.
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)


cache = load_cache()

## 2. MEP Class

In [4]:
# MEP data classes
class ArchivedMEP:
    """Data scraped from one archived MEP assistants page.

    Attributes:
        url: The URL this record was created for.
        name: MEP name as shown on the page; ``None`` until
            ``get_mep_data`` has run, ``''`` if the tag was not found.
        mep_party: Political group name, same conventions as ``name``.
        assistants: Mapping ``assistant type -> {assistant name ->
            [snapshot URLs the assistant was seen in]}``.
    """

    # Seconds to wait for the archive before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str):
        self.url = url
        self.name = None
        self.mep_party = None
        self.assistants = defaultdict(dict)

    @staticmethod
    def _tag_text(tag) -> str:
        """Return the stripped text of ``tag``, or ``''`` when it is missing."""
        return tag.get_text().strip() if tag is not None else ''

    def get_mep_data(self, snapshot_url: str):
        """Download ``snapshot_url`` and fill in name, party and assistants.

        Args:
            snapshot_url: URL of the archived assistants page to scrape.

        Raises:
            requests.RequestException: If the request fails, times out, or
                returns an HTTP error status.
        """
        response = requests.get(snapshot_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on error pages so callers do not store empty records.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Tag.get() reads HTML *attributes*, so the element text has to be
        # taken with get_text(); a missing tag yields an empty string.
        self.name = self._tag_text(soup.find('span', class_='sln-member-name'))
        self.mep_party = self._tag_text(
            soup.find('h3', class_='erpl_title-h3 mt-1 sln-political-group-name')
        )

        for section in soup.find_all('div', class_='erpl_type-assistants'):
            heading = section.find('h4', class_='erpl_title-h4')
            if heading is None:
                # Without a heading the assistants cannot be classified.
                continue
            assistant_type = heading.text.strip()
            for assistant_tag in section.find_all('div', class_='erpl_type-assistants-item'):
                name_tag = assistant_tag.find('span', class_='erpl_assistant')
                if name_tag is None:
                    continue
                assistant_name = name_tag.text.strip()
                self.assistants[assistant_type].setdefault(assistant_name, []).append(snapshot_url)

    def to_dict(self) -> dict:
        """Return the scraped fields as a plain dict (keys: name, party, assistants)."""
        # Hand out a regular dict rather than the defaultdict so consumers
        # do not create keys by accident when reading.
        return {"name": self.name, "party": self.mep_party, "assistants": dict(self.assistants)}


In [5]:
def construct_archived_mep_url(mep_name: str, mep_id: str, snapshot_url: str) -> str:
    """Construct a Wayback Machine URL for an MEP's archived assistants page.

    Args:
        mep_name: Name as listed, e.g. ``"Jane Mary DOE SMITH"``. Tokens that
            are entirely upper-case are treated as last names, all others as
            first names.
        mep_id: Numeric MEP identifier used in the Europarl URL.
        snapshot_url: Either a bare archive prefix
            (``https://web.archive.org/web/<timestamp>/``) or a full snapshot
            URL that still has the archived page's own address after the
            prefix. Only the prefix is kept in the latter case.

    Returns:
        The archive prefix followed by the MEP's assistants page URL.
    """
    # Split the name to create the correct path format
    names = mep_name.split()
    first_names = [name for name in names if not name.isupper()]
    last_names = [name for name in names if name.isupper()]

    # "First_names_LAST_NAMES"; joining the combined list avoids a stray
    # leading underscore when there are no first-name tokens.
    name_path = '_'.join(first_names + last_names)

    # A full snapshot URL already ends with the archived page's address;
    # keep only "https://web.archive.org/web/<timestamp>/" so the MEP URL
    # replaces that address instead of being glued onto it.
    prefix_match = re.match(r"https?://web\.archive\.org/web/[^/]+/", snapshot_url)
    archive_prefix = prefix_match.group(0) if prefix_match else snapshot_url

    # Append the MEP path to the archive prefix
    mep_path = f"http://www.europarl.europa.eu/meps/en/{mep_id}/{name_path}/assistants#mep-card-content"
    return f"{archive_prefix}{mep_path}"

## 3. Extract Snapshots from Wayback Machine

This function retrieves one snapshot per day from the Wayback Machine for the period between the two dates supplied.

In [6]:
# Snapshot retrieval
def get_wayback_snapshots(base_url: str, from_date: str, to_date: str) -> list:
    """Return one Wayback Machine snapshot URL per calendar day for ``base_url``.

    Args:
        base_url: The original (non-archived) URL to look up.
        from_date: Start of the search range, formatted ``YYYYMMDD``.
        to_date: End of the search range, formatted ``YYYYMMDD``.
            NOTE(review): this is parsed to midnight at the start of that
            day, so captures made later on ``to_date`` may be excluded --
            confirm against the wayback search semantics.

    Returns:
        Raw snapshot URLs in chronological order, keeping the earliest
        capture of each day. An empty list if the search fails.

    Raises:
        ValueError: If either date string does not match ``YYYYMMDD``.
    """
    from_date_dt = datetime.strptime(from_date, "%Y%m%d")
    to_date_dt = datetime.strptime(to_date, "%Y%m%d")
    print(f"Fetching snapshots for {base_url} from {from_date_dt} to {to_date_dt}...")

    try:
        # The context manager closes the client's HTTP session when done.
        with WaybackClient() as client:
            results = list(client.search(base_url, from_date=from_date_dt, to_date=to_date_dt))
        print(f"Total snapshots found: {len(results)}")

        # Key on the calendar date (not the full timestamp) so that several
        # captures on the same day collapse to the first one.
        snapshots_by_day = {}
        for result in sorted(results, key=lambda record: record.timestamp):
            snapshots_by_day.setdefault(result.timestamp.date(), result.raw_url)
        return [url for _, url in sorted(snapshots_by_day.items())]
    except Exception as e:
        # Best effort: report the problem and let the caller handle "no data".
        print(f"Error fetching snapshots: {e}")
        return []

## 4. Extract MEP Archived Links
Based on the snapshots extracted above, I will now try to reconstruct the links to the MEPs' home pages from the full list of MEPs.

In [8]:
def get_archived_mep_links(from_date: str, to_date: str) -> list:
    """Build archived assistants-page URLs for every MEP on the full list.

    Looks up snapshots of the Europarl full-list page in the given range,
    downloads the first one, and constructs one archived URL per MEP entry
    found in it.

    Args:
        from_date: Start of the snapshot search range, ``YYYYMMDD``.
        to_date: End of the snapshot search range, ``YYYYMMDD``.

    Returns:
        A list of archived MEP assistants-page URLs; empty when no snapshot
        exists, the snapshot cannot be downloaded, or no entries match.
    """
    base_url = "http://www.europarl.europa.eu/meps/en/full-list"
    snapshot_urls = get_wayback_snapshots(base_url, from_date, to_date)
    if not snapshot_urls:
        print("No snapshots found.")
        return []

    # Use the first snapshot URL
    snapshot_url = snapshot_urls[0]

    try:
        # A timeout keeps a stalled archive request from hanging the run.
        response = requests.get(snapshot_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching full-list snapshot {snapshot_url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    mep_links = []

    # NOTE(review): a recorded run found snapshots but no MEP links, so the
    # selectors below may not match the layout of every archived page.
    # Iterate through MEP links and construct each archived URL
    for mep in soup.select('a.ep_content'):
        name_tag = mep.find('span', class_='ep_name member-name')
        href = mep.get('href')
        if name_tag is None or not href:
            # Skip anchors that lack the name or link needed for the URL.
            continue
        mep_name = name_tag.text.strip()
        # Drop a trailing slash first so the last path segment is not empty.
        mep_id = href.rstrip('/').split('/')[-1]
        mep_links.append(construct_archived_mep_url(mep_name, mep_id, snapshot_url))

    return mep_links

## 5. Run the Damn Thing

In [9]:
def scrape_archived_meps(url):
    """Download one archived assistants page and return its scraped fields as a dict."""
    record = ArchivedMEP(url)
    # The page that gets downloaded is the same URL the record was created with.
    record.get_mep_data(snapshot_url=url)
    return record.to_dict()

In [15]:
# File names used by this run: the pickle cache and the JSON results file
CACHE_FILE = "wayback_cache.pkl"
OUTPUT_FILE = "mep_assistants_test.json"

# Reload the cache now that CACHE_FILE has been rebound
cache = load_cache()

def save_data_incrementally(data):
    """Append ``data`` to the JSON array stored in ``OUTPUT_FILE``.

    The file is created when missing. An existing but empty file is treated
    as an empty array. The updated array is written to a temporary sibling
    file and moved into place, so an interrupted write cannot leave
    ``OUTPUT_FILE`` half-written.

    Args:
        data: A JSON-serialisable record to append.

    Raises:
        json.JSONDecodeError: If ``OUTPUT_FILE`` has content that is not
            valid JSON (the file is left untouched).
        ValueError: If ``OUTPUT_FILE`` holds valid JSON that is not an array.
    """
    existing_data = []
    if os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
        if not isinstance(existing_data, list):
            raise ValueError(f"{OUTPUT_FILE} does not contain a JSON array")
    existing_data.append(data)

    tmp_path = f"{OUTPUT_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, OUTPUT_FILE)

def main(from_date="20230101", to_date="20231231", limit=5, delay=5):
    """Scrape archived MEP assistants pages and append them to ``OUTPUT_FILE``.

    Args:
        from_date: Start of the snapshot search range, ``YYYYMMDD``
            (default: start of 2023).
        to_date: End of the snapshot search range, ``YYYYMMDD``
            (default: end of 2023).
        limit: Maximum number of MEP pages to process; ``None`` processes
            all of them. The default of 5 keeps test runs short.
        delay: Seconds to wait after each network request.
    """
    # Step 1: Retrieve archived MEP links
    print("Fetching archived MEP links...")
    mep_links = get_archived_mep_links(from_date, to_date)

    if not mep_links:
        print("No MEP links found for the given date range.")
        return

    # Limit for testing
    mep_links = mep_links[:limit]

    print("Scraping MEP assistants data...")
    for mep_url in tqdm(mep_links, desc="Scraping MEPs"):
        # Check cache first
        if mep_url in cache:
            print(f"Using cached data for {mep_url}")
            save_data_incrementally(cache[mep_url])
            # No request was sent, so there is nothing to throttle.
            continue

        # Scrape and cache new data
        try:
            mep_data = scrape_archived_meps(mep_url)
        except Exception as e:
            print(f"Error scraping {mep_url}: {e}")
        else:
            cache[mep_url] = mep_data  # Update cache
            save_cache(cache)  # Save cache to file
            # Save data incrementally to JSON
            save_data_incrementally(mep_data)

        # Pause after every request, successful or not, to avoid
        # overloading the archive.
        time.sleep(delay)

    print(f"Data collection completed. Output saved incrementally to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Fetching archived MEP links...
Fetching snapshots for http://www.europarl.europa.eu/meps/en/full-list from 2023-01-01 00:00:00 to 2023-12-31 00:00:00...
Total snapshots found: 21
No MEP links found for the given date range.
