In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def crawl_website(url, timeout=30):
    """Download ``url`` and return its ``div.obj_issue_summary`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The result of ``soup.find_all('div', class_='obj_issue_summary')``,
        or ``None`` when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network-level failures the same way as a bad status code,
        # so callers only have to handle the ``None`` result.
        print(f"Failed to retrieve the website. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the website. Status code: {response.status_code}")
        return None

    # Parse the HTML content of the page and collect the issue blocks.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('div', class_='obj_issue_summary')

In [3]:
def crawl_article_summaries(url, timeout=30):
    """Download ``url`` and return its ``div.obj_article_summary`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The result of ``soup.find_all('div', class_='obj_article_summary')``,
        or ``None`` when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network-level failures the same way as a bad status code,
        # so callers only have to handle the ``None`` result.
        print(f"Failed to retrieve the website. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the website. Status code: {response.status_code}")
        return None

    # Parse the HTML content of the page and collect the article blocks.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('div', class_='obj_article_summary')

In [4]:
def get_urls(issue_summaries, keyword='AAAI-24'):
    """Collect ``(title, href)`` pairs for issues whose title contains ``keyword``.

    Args:
        issue_summaries: Iterable of parsed issue blocks, each exposing
            ``find('a', class_='title', href=True)``. ``None`` (the value the
            crawl functions return on failure) is treated as "no issues".
        keyword: Substring that must appear in the link text for the issue to
            be kept. Defaults to ``'AAAI-24'``.

    Returns:
        A list of ``(link text, link href)`` tuples, in input order.
    """
    if not issue_summaries:
        return []

    urls = []
    for issue_summary in issue_summaries:
        # ``find`` returns None when the block has no titled link with an
        # href; skip such blocks instead of failing on ``None.text``.
        link = issue_summary.find('a', class_='title', href=True)
        if link is None:
            continue

        title_text = link.text
        if keyword in title_text:
            urls.append((title_text, link['href']))

    return urls

In [5]:
# Issue-archive page to crawl. `summaries` holds its 'obj_issue_summary' divs,
# or None if crawl_website got a non-200 response.
url = 'https://ojs.aaai.org/index.php/AAAI/issue/archive'  # OJS issue archive for AAAI
summaries = crawl_website(url)

In [6]:
# (title, href) tuples for every archive entry whose link text contains 'AAAI-24'.
urls = get_urls(summaries)

In [8]:
import time

# Pause between issue-page requests, in seconds, to avoid spamming the website.
ISSUE_REQUEST_DELAY_SECONDS = 2

# Fetch the article summaries of every selected issue, one request per issue.
# `urls` holds (title, href) tuples; unpacking them keeps the earlier `url`
# variable (the archive address) from being overwritten by a tuple.
summaries_per_issue = []
for issue_title, issue_url in urls:
    page_summaries = crawl_article_summaries(issue_url)
    if page_summaries is None:
        # crawl_article_summaries returns None on failure; record an empty
        # page so the flattening step below does not fail on a None entry.
        print(f"Skipping issue {issue_title!r}: could not fetch {issue_url}")
        page_summaries = []
    summaries_per_issue.append(page_summaries)
    print('url', len(summaries_per_issue))
    time.sleep(ISSUE_REQUEST_DELAY_SECONDS)

# Flatten the per-issue lists into one list of article summaries.
issue_summaries = [summary for page in summaries_per_issue for summary in page]
print(len(issue_summaries))

url 1
url 2
url 3
url 4
url 5
url 6
url 7
url 8
url 9
url 10
url 11
url 12
url 13
url 14
url 15
url 16
url 17
url 18
url 19
url 20
url 21


In [27]:
import re
import tqdm

def clear_string(input_str):
    """Drop backslash escapes from ``input_str`` and trim surrounding whitespace.

    Every literal backslash is removed together with the single character
    that follows it. A backslash followed by a newline, or one at the very
    end of the string, is left in place because ``.`` needs a non-newline
    character to match.
    """
    escape_pattern = re.compile(r'\\.')
    without_escapes = escape_pattern.sub('', input_str)
    return without_escapes.strip()

# Retry/timeout settings for the per-article requests.
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 2
REQUEST_TIMEOUT_SECONDS = 30


def _text_or_empty(tag):
    """Return the cleaned text of a parsed tag, or '' when the tag is missing."""
    return clear_string(tag.text) if tag is not None else ''


def _fetch_authors(article_url):
    """Download one article page and return its authors as a list of dicts.

    Each dict has the keys 'name' and 'affiliation'; a missing span yields an
    empty string instead of an error.

    Raises:
        requests.RequestException: on network errors, timeouts or HTTP error
            status codes.
        ValueError: if the page has no ``<ul class="authors">`` element.
    """
    response = requests.get(article_url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    authors_block = soup.find('ul', class_='authors')
    if authors_block is None:
        raise ValueError(f"no authors list found at {article_url}")

    return [
        {
            'name': _text_or_empty(author.find('span', class_='name')),
            'affiliation': _text_or_empty(author.find('span', class_='affiliation')),
        }
        for author in authors_block.find_all('li')
    ]


data = []
# (title, href, error) for every article that could not be scraped, so that
# dropped entries are visible instead of silently missing from `data`.
failed_articles = []

for issue_summary in tqdm.tqdm(issue_summaries):
    # The article title and link both live in the summary's h3 heading.
    h3 = issue_summary.find('h3')
    link = h3.find('a') if h3 is not None else None
    if link is None or not link.get('href'):
        failed_articles.append((None, None, 'summary has no h3 link'))
        continue

    title = clear_string(h3.text)
    href = link['href']

    last_error = None
    for attempt in range(MAX_ATTEMPTS):
        try:
            # The author list is rebuilt from scratch on every attempt, so a
            # failed attempt can never leave partial entries behind.
            authors = _fetch_authors(href)
        except (requests.RequestException, ValueError) as error:
            last_error = error
            # No point in waiting after the final attempt.
            if attempt < MAX_ATTEMPTS - 1:
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            data.append({'title': title, 'authors': authors})
            break
    else:
        failed_articles.append((title, href, repr(last_error)))

if failed_articles:
    print(f"{len(failed_articles)} article(s) could not be scraped; see `failed_articles`.")

 35%|███▌      | 1014/2865 [18:41:30<6838:50:55, 13300.84s/it]

In [None]:
# Serialize the data to a JSON file
import json
from pathlib import Path

output_path = Path('data/aaai24.json')
# Create the target directory first: opening the file for writing fails if
# 'data/' does not exist, which would lose the scraped results.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)