In [8]:
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


def parse_paper_tag(tag):
    """Extract title, authors and PDF link for one paper entry.

    The entry is expected to be laid out as a title element followed by two
    ``<dd>`` siblings: the first holding one ``<a>`` per author, the second
    holding the links, of which the first ``<a>`` is taken as the PDF.

    Args:
        tag: The BeautifulSoup element of the paper title (a ``<dt>`` in the
            caller); its first ``<a>`` carries the title text.

    Returns:
        A ``(title, authors, pdf_link)`` tuple, where ``authors`` is the
        author names joined with ``';'`` and ``pdf_link`` is the raw ``href``
        value as found in the page (not yet made absolute).
    """
    title = tag.find('a').text

    # Look up the following <dd> elements by name rather than by counting
    # `.next_sibling` hops: the hop count depends on whether whitespace text
    # nodes happen to sit between the elements.
    authors_dd = tag.find_next_sibling('dd')
    links_dd = authors_dd.find_next_sibling('dd')

    authors = ';'.join(author.text for author in authors_dd.find_all('a'))
    pdf_link = links_dd.find_all('a')[0]['href']

    return title, authors, pdf_link

def get_cv_papers(year=2022, conf='CVPR', month='6'):
    """Scrape one conference listing from CVF Open Access and save it as CSV.

    Downloads ``https://openaccess.thecvf.com/{conf}{year}?day=all``, runs
    ``parse_paper_tag`` on every ``<dt>`` element of the page, and writes the
    result to ``data/{year}/{month}_{conf}.csv`` (the directory is created if
    it does not exist yet).

    Args:
        year: Conference year; used in the listing URL, the ``year`` column
            and the output directory name.
        conf: Conference name as it appears in the listing URL (for example
            ``'CVPR'`` or ``'ICCV'``); also stored in the ``source`` column
            and used in the output file name.
        month: Prefix of the output file name.

    Returns:
        A DataFrame with the columns ``title``, ``authors``, ``pdf_link``,
        ``year``, ``source``, ``abstract``, ``keywords`` and ``class``.
        ``abstract``, ``keywords`` and ``class`` are placeholders set to None.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    base_url = 'https://openaccess.thecvf.com/'

    # A timeout keeps a stalled connection from hanging the kernel forever.
    r = requests.get(f'{base_url}{conf}{year}?day=all', timeout=60)
    # Fail loudly instead of parsing an error page into an empty CSV.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    papers = [parse_paper_tag(tag) for tag in soup.find_all('dt')]

    df = pd.DataFrame(papers, columns=['title', 'authors', 'pdf_link'])
    df['year'] = year
    # Record the conference actually requested, not a fixed label.
    df['source'] = conf
    df['abstract'] = None
    df['keywords'] = None
    df['class'] = None
    # urljoin handles hrefs with or without a leading slash (and leaves
    # already-absolute URLs untouched).
    df['pdf_link'] = df['pdf_link'].apply(lambda href: urljoin(base_url, href))

    # save with column order: title,year,source,authors,class,keywords,abstract,pdf_link
    csv_columns = ['title', 'year', 'source', 'authors', 'class', 'keywords', 'abstract', 'pdf_link']
    out_dir = os.path.join('data', str(year))
    os.makedirs(out_dir, exist_ok=True)
    df[csv_columns].to_csv(os.path.join(out_dir, f'{month}_{conf}.csv'), index=False)

    return df

In [4]:
# df_cvpr_2021 = get_cv_papers(2021)
# len(df_cvpr_2021) # 1660

In [5]:
# df_cvpr_2022 = get_cv_papers(2022)
# len(df_cvpr_2022) # 2074

In [6]:
# df_cvpr_2023 = get_cv_papers(2023)
# len(df_cvpr_2023) # 2359

In [9]:
# Scrape the ICCV 2023 listing; get_cv_papers also writes it to data/2023/10_ICCV.csv.
df_iccv_2023 = get_cv_papers(2023, conf='ICCV', month='10')