In [1]:
# Import libraries
import re
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.safari.service import Service

**TLDR solution:**
1. Using the provided URL and the year given in the prompt, we fetch the first HTML page to scrape. This is implemented in `get_first_page_html`.
2. For each page, we scrape all the research paper links (20 at a time) and save them to a CSV along with their metadata links. A metadata link is the paper link with "?show=full" appended. The link extraction is implemented in `get_links_from_html`.
3. Finally, we move on to the next page and repeat step 2. This is implemented in `scrape_metadata_links`.

This design gives the user flexibility and allows the implementation to be pressure-tested piece by piece. Furthermore, because links are downloaded page by page (the per-page files can always be aggregated later), manual checks can be isolated to specific pages.

In [2]:
# Function to initialize the Selenium WebDriver
def init_driver():
    """Create and return a Safari WebDriver backed by a default ``Service``."""
    return webdriver.Safari(service=Service())

# Function to get the first page of the research papers
def get_first_page_html(driver, url, year):
    """Navigate to the date-issued browse listing for ``year`` and return its HTML.

    Parameters
    ----------
    driver
        Selenium WebDriver (any object exposing ``get(url)`` and ``page_source``).
    url : str
        Browse endpoint. It may already carry a query string: parameters that
        are present there are kept as-is and only the missing browse
        parameters are added, so the result always has a single ``?``.
    year : int or str
        Value sent as the ``starts_with`` parameter.

    Returns
    -------
    str
        ``driver.page_source`` after the navigation.
    """
    browse_defaults = {
        'type': 'dateissued',
        'sort_by': '2',
        'order': 'ASC',
        'rpp': '20',  # Number of results per page
        'starts_with': str(year),
    }

    parts = urlsplit(url)
    # Keep the caller's parameters (including duplicates) in their original order.
    pairs = parse_qsl(parts.query, keep_blank_values=True)
    already_set = {key for key, _ in pairs}
    pairs.extend(
        (key, value) for key, value in browse_defaults.items() if key not in already_set
    )

    # urlencode percent-escapes the values; urlunsplit puts the query before any fragment.
    full_url = urlunsplit(parts._replace(query=urlencode(pairs)))
    driver.get(full_url)
    return driver.page_source

In [4]:
# Function to extract links from the main page

def get_links_from_html(html, page_number):
    """Collect the paper links listed on one browse-results page.

    Parameters
    ----------
    html : str
        HTML of a browse-results page.
    page_number : int
        Stored on every record under ``'page_number'``.

    Returns
    -------
    tuple[set, list]
        ``(seen_urls, records)``. ``seen_urls`` holds each absolute paper URL
        once; ``records`` holds one dict per unique URL with the keys
        ``'href'``, ``'page_number'`` and ``'metadata_link'`` (the paper URL
        with ``?show=full`` appended). Both are empty when the results
        container or its list is not found.
    """
    seen_urls = set()
    records = []

    # Results live in a specific <div>, inside a <ul> of artifacts.
    results_div = BeautifulSoup(html, 'html.parser').find(
        'div',
        id='aspect_artifactbrowser_ConfigurableBrowse_div_browse-by-dateissued-results',
        class_='ds-static-div primary',
    )
    if not results_div:
        return seen_urls, records

    listing = results_div.find('ul', class_='ds-artifact-list list-unstyled')
    if not listing:
        return seen_urls, records

    for anchor in listing.select('a[href^="/handle/1721.1/"]'):
        paper_url = 'https://dspace.mit.edu' + anchor['href']
        if paper_url in seen_urls:
            continue  # the same handle can be linked more than once per entry
        seen_urls.add(paper_url)
        records.append({
            'href': paper_url,
            'page_number': page_number,
            'metadata_link': paper_url + '?show=full',
        })

    return seen_urls, records

In [5]:
# Function to combine everything and scrape the metadata links

def scrape_metadata_links(url, year):
    """Walk the browse listing page by page and save each page's links to CSV.

    For every result page, the links returned by ``get_links_from_html`` are
    written to ``research_paper_links_page_<n>.csv``. Paging continues until
    the end of the range shown in the page's ``pagination-info`` paragraph
    reaches the total item count, or until that paragraph cannot be found or
    parsed.

    Parameters
    ----------
    url : str
        Browse endpoint; used for the first page and for every following page.
    year : int or str
        Value sent as the ``starts_with`` browse parameter.

    Returns
    -------
    int
        Number of pages that were scraped and saved.
    """
    results_per_page = 20
    # Matches "<first>-<last> <non-digits> <total>" inside the pagination text.
    pagination_pattern = re.compile(r'(\d+)\s*-\s*(\d+)\D+(\d+)')

    driver = init_driver()
    page_number = 1
    try:
        page_html = get_first_page_html(driver, url, year)

        while True:
            print(f"Extracting links from page {page_number}...")

            _, paper_links = get_links_from_html(page_html, page_number)

            # Export the links to a CSV for the current page
            df = pd.DataFrame(paper_links)
            df.to_csv(f'research_paper_links_page_{page_number}.csv', index=False)
            print(f"Saved page {page_number} links to research_paper_links_page_{page_number}.csv")

            # Find the next page offset using pagination info
            soup = BeautifulSoup(page_html, 'html.parser')
            pagination_tag = soup.find('p', class_='pagination-info')
            match = None
            if pagination_tag is not None:
                match = pagination_pattern.search(pagination_tag.get_text())
            if match is None:
                # Without pagination info there is no way to know where the next page starts.
                print(f"No pagination info found on page {page_number}; stopping.")
                break

            # The last item shown on this page is the offset of the next page.
            next_offset = int(match.group(2))
            print(f"Next offset: {next_offset}")
            total_items = int(match.group(3))
            print(f"Total items: {total_items}")

            if next_offset >= total_items:
                break

            next_query = urlencode({
                'rpp': results_per_page,
                'offset': next_offset,
                'etal': -1,
                'sort_by': 2,
                'type': 'dateissued',
                'starts_with': year,
                'order': 'ASC',
            })
            separator = '&' if '?' in url else '?'
            # Load each following page exactly once, then read its source after the delay.
            driver.get(f'{url}{separator}{next_query}')
            time.sleep(5)  # Adjust the delay as needed
            page_html = driver.page_source

            page_number += 1
    finally:
        # Always release the browser session, even when a page fails to parse.
        driver.quit()

    # page_number only advances when another page is loaded, so it equals the pages saved.
    return page_number

In [89]:
# Executing the pipeline: scrape the browse listing at `url` for `year`.
# scrape_metadata_links writes one research_paper_links_page_<n>.csv per
# result page (relative path) and returns the count printed below.
url = 'https://dspace.mit.edu/handle/1721.1/5458/browse'
year = 2000  # sent as the `starts_with` browse parameter
num_pages = scrape_metadata_links(url, year)
print(f"Extracted and saved links from {num_pages} pages.")

Extracting links from page 1...
Saved page 1 links to research_paper_links_page_1.csv
Next offset: 2727
Total items: 3804
Extracting links from page 2...
Saved page 2 links to research_paper_links_page_2.csv
Next offset: 2747
Total items: 3804
Extracting links from page 3...
Saved page 3 links to research_paper_links_page_3.csv
Next offset: 2767
Total items: 3804
Extracting links from page 4...
Saved page 4 links to research_paper_links_page_4.csv
Next offset: 2787
Total items: 3804
Extracting links from page 5...
Saved page 5 links to research_paper_links_page_5.csv
Next offset: 2807
Total items: 3804
Extracting links from page 6...
Saved page 6 links to research_paper_links_page_6.csv
Next offset: 2827
Total items: 3804
Extracting links from page 7...
Saved page 7 links to research_paper_links_page_7.csv
Next offset: 2847
Total items: 3804
Extracting links from page 8...
Saved page 8 links to research_paper_links_page_8.csv
Next offset: 2867
Total items: 3804
Extracting links from pa