In [10]:
# Importing libraries
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.safari.service import Service
from bs4 import BeautifulSoup
import time

In [4]:
def combine_csv_files(input_folder, output_file):
    """Concatenate the per-page link CSVs in ``input_folder`` into one CSV.

    Only files named ``research_paper_links_page_<N>.csv`` are read. They are
    concatenated in ascending numeric order of ``<N>`` (so page 10 follows
    page 9, not page 1) and written to ``output_file`` without the index.

    Args:
        input_folder: Directory containing the per-page CSV files.
        output_file: Path of the combined CSV to write. Its parent directory
            is created if it does not exist yet.

    Raises:
        ValueError: If ``input_folder`` contains no matching files. Nothing is
            written in that case.
    """
    prefix, suffix = 'research_paper_links_page_', '.csv'
    all_files = [f for f in os.listdir(input_folder) if f.startswith(prefix) and f.endswith(suffix)]
    if not all_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # keep the exception type but say what was actually missing.
        raise ValueError(f"No '{prefix}*{suffix}' files found in {input_folder}")

    # Sort on the integer page number, not lexicographically.
    all_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))

    combined_df = pd.concat(
        [pd.read_csv(os.path.join(input_folder, f)) for f in all_files],
        ignore_index=True,
    )

    # dirname is '' for a bare filename; only create a directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    combined_df.to_csv(output_file, index=False)
    print(f"Combined CSV saved to {output_file}")

In [5]:
# Merge every per-page links CSV under ../data/paper_links into a single file
current_dir = os.getcwd()
input_folder = os.path.join(current_dir, os.pardir, 'data', 'paper_links')
output_file = os.path.join(current_dir, os.pardir, 'data', 'all_research_paper_links.csv')

combine_csv_files(input_folder=input_folder, output_file=output_file)

Combined CSV saved to /Users/eshan23/Library/CloudStorage/OneDrive-TheUniversityofChicago/Summer 2024/Shaoda_Wang_RA/wang-ml/scripts/../data/all_research_paper_links.csv


In [11]:
# Initialize the Selenium WebDriver
def init_driver():
    """Create and return a Safari WebDriver built with a default ``Service``."""
    return webdriver.Safari(service=Service())

In [12]:
# Function to get metadata from a link using Selenium
def get_metadata_from_link(driver, metadata_url, wait_seconds=2):
    """Load a page in the browser and scrape its metadata table into a dict.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (the Selenium WebDriver in this notebook).
        metadata_url: URL of the page to load.
        wait_seconds: Seconds to pause after ``driver.get`` before reading the
            page source. Defaults to 2, the delay that used to be hard-coded;
            pass 0 to skip the pause.

    Returns:
        A dict mapping the stripped text of each row's first cell to the
        stripped text of its second cell, taken from the
        ``table.ds-includeSet-table`` element. When a key occurs on several
        rows, the values are joined with ``"; "``. Returns ``None`` when
        anything goes wrong (navigation error, table not found, ...); the
        error is printed rather than raised, so a caller looping over many
        links keeps going.
    """
    print(f"Getting metadata from {metadata_url}")
    metadata = {}

    try:
        driver.get(metadata_url)
        if wait_seconds and wait_seconds > 0:
            time.sleep(wait_seconds)  # Allow some time for the page to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', {'class': 'ds-includeSet-table'})
        if table is None:
            raise ValueError("Metadata table not found")

        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) >= 2:  # Skip rows without a key cell and a value cell
                key = cols[0].text.strip()
                value = cols[1].text.strip()
                if key in metadata:
                    # Repeated key: append instead of overwriting.
                    metadata[key] += f"; {value}"
                else:
                    metadata[key] = value
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller continue.
        print(f"Error fetching metadata from {metadata_url}: {e}")
        return None

    return metadata

In [13]:
# Scrape metadata for every page of links and save one metadata CSV per page
def extract_and_save_metadata(input_folder, output_folder):
    """Scrape metadata for every link CSV in ``input_folder``, one output CSV per page.

    For each ``research_paper_links_page_<N>.csv`` (processed in ascending
    numeric order of ``<N>``), every value in its ``metadata_link`` column is
    passed to ``get_metadata_from_link``. Links that yield no metadata (``None``
    or an empty dict) are skipped. The collected dicts are written to
    ``<output_folder>/metadata_page_<N>.csv``.

    A single browser session from ``init_driver`` is shared across all pages
    and is always closed, even if processing fails part-way through.

    Args:
        input_folder: Directory containing the per-page link CSVs.
        output_folder: Directory for the metadata CSVs; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    all_files = [f for f in os.listdir(input_folder) if f.startswith('research_paper_links_page_') and f.endswith('.csv')]
    all_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))  # Numeric page order

    driver = init_driver()
    try:
        for file in all_files:
            page_number = file.split('_')[-1].split('.')[0]
            df = pd.read_csv(os.path.join(input_folder, file))

            metadata_list = []
            for link in df['metadata_link']:
                metadata = get_metadata_from_link(driver, link)
                if metadata:
                    metadata_list.append(metadata)

            metadata_df = pd.DataFrame(metadata_list)
            output_path = os.path.join(output_folder, f'metadata_page_{page_number}.csv')
            metadata_df.to_csv(output_path, index=False)
            print(f"Metadata for page {page_number} saved to {output_folder}/metadata_page_{page_number}.csv")
    finally:
        # Without this, any error above (bad CSV, missing column, interrupt)
        # would leave the browser session open.
        driver.quit()

In [14]:
# Scrape metadata for every saved links page into ../data/metadata
input_folder = os.path.join(current_dir, os.pardir, 'data', 'paper_links')
output_folder = os.path.join(current_dir, os.pardir, 'data', 'metadata')
extract_and_save_metadata(input_folder=input_folder, output_folder=output_folder)

Getting metadata from https://dspace.mit.edu/handle/1721.1/149913?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149277?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149906?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149907?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149295?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149296?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149317?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149910?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149297?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149298?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/149912?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/7222?show=full
Getting metadata from https://dspace.mit.edu/handle/1721.1/5566?show=full
Getting metadata