In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [12]:
def extract_data_from_url(url, conference_year, timeout=30):
    """Scrape one DBLP proceedings page into a long-format paper/author table.

    The page is fetched over HTTP and parsed with BeautifulSoup. Every element
    matching ``.tts-content`` is treated as one paper, and one output row is
    produced per (paper, author) pair.

    Parameters
    ----------
    url : str
        Address of the DBLP proceedings page to download.
    conference_year : int
        Year stored in the ``conference_year`` column of every row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_title``, ``author_name``, ``author_link``,
        ``conference_name`` (always ``'ICCV'``) and ``conference_year``.
        ``author_link`` is ``None`` for an author that has no DBLP pid link.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page and
    # silently returning an empty frame.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    data = {
        'paper_title': [],
        'author_name': [],
        'author_link': [],
        'conference_name': 'ICCV',
        'conference_year': conference_year
    }

    for paper in soup.select('.tts-content'):
        title_element = paper.select_one('.title')
        if title_element is None:
            # Entry without a title element: nothing meaningful to record.
            continue
        title = title_element.text.strip()

        for author_element in paper.select('span[itemprop="author"]'):
            name_element = author_element.select_one('span[itemprop="name"]')
            if name_element is None:
                continue

            # Take the link from inside this author's own element rather than
            # pairing two independent lists by position: a positional zip
            # shifts every following link onto the wrong author (and drops the
            # trailing authors) as soon as one author has no pid link.
            link_element = author_element.select_one('a[href^="https://dblp.org/pid/"]')
            link = link_element['href'] if link_element is not None else None

            data['paper_title'].append(title)
            data['author_name'].append(name_element.text.strip())
            data['author_link'].append(link)

    # Scalar entries (conference name/year) are broadcast to every row.
    return pd.DataFrame(data)


In [13]:
# (proceedings page URL, conference year) pairs, one per DBLP page to scrape.
# The pages are grouped by year below; 2003 and 2005 are each split across two
# pages. Dict insertion order keeps the pairs in chronological order.
url_conference_years = [
    (page_url, year)
    for year, page_urls in {
        2003: (
            'https://dblp.org/db/conf/iccv/iccv2003-1.html',
            'https://dblp.org/db/conf/iccv/iccv2003-2.html',
        ),
        2005: (
            'https://dblp.org/db/conf/iccv/iccv2005-1.html',
            'https://dblp.org/db/conf/iccv/iccv2005-2.html',
        ),
        2007: ('https://dblp.org/db/conf/iccv/iccv2007.html',),
        2009: ('https://dblp.org/db/conf/iccv/iccv2009.html',),
        2011: ('https://dblp.org/db/conf/iccv/iccv2011.html',),
        2013: ('https://dblp.org/db/conf/iccv/iccv2013.html',),
        2015: ('https://dblp.org/db/conf/iccv/iccv2015.html',),
        2017: ('https://dblp.org/db/conf/iccv/iccv2017.html',),
        2019: ('https://dblp.org/db/conf/iccv/iccv2019.html',),
        2021: ('https://dblp.org/db/conf/iccv/iccv2021.html',),
    }.items()
    for page_url in page_urls
]

In [15]:
# Scrape every listed proceedings page; one DataFrame per page, in list order.
dfs = [
    extract_data_from_url(page_url, page_year)
    for page_url, page_year in url_conference_years
]

In [16]:
# Stack the per-page frames row-wise and renumber the rows with a fresh index.
all_iccv_data = pd.concat(objs=dfs, axis=0, ignore_index=True)
all_iccv_data

Unnamed: 0,paper_title,author_name,author_link,conference_name,conference_year
0,Regression based Bandwidth Selection for Segme...,Maneesh Kumar Singh,https://dblp.org/pid/263/9205-1.html,ICCV,2003
1,Regression based Bandwidth Selection for Segme...,Narendra Ahuja,https://dblp.org/pid/30/3572.html,ICCV,2003
2,Learning a Classification Model for Segmentation.,Xiaofeng Ren,https://dblp.org/pid/84/3585.html,ICCV,2003
3,Learning a Classification Model for Segmentation.,Jitendra Malik,https://dblp.org/pid/58/2944.html,ICCV,2003
4,"Image Parsing: Unifying Segmentation, Detectio...",Zhuowen Tu,https://dblp.org/pid/t/ZTu.html,ICCV,2003
...,...,...,...,...,...
23508,PointBA: Towards Backdoor Attacks in 3D Point ...,Yue Zhao,https://dblp.org/pid/48/76.html,ICCV,2021
23509,PointBA: Towards Backdoor Attacks in 3D Point ...,Zekun Tong,https://dblp.org/pid/258/0469.html,ICCV,2021
23510,PointBA: Towards Backdoor Attacks in 3D Point ...,Yabang Zhao,https://dblp.org/pid/289/2131.html,ICCV,2021
23511,PointBA: Towards Backdoor Attacks in 3D Point ...,Andrew Lim,https://dblp.org/pid/92/972.html,ICCV,2021


In [17]:
# Persist the combined table; the row index is left out of the file.
all_iccv_data.to_csv(path_or_buf='iccv_paper_authors.csv', index=False)

## **Split into separate JSON files per Year**

In [21]:
# Take the years from the scraped data itself, so that only years that
# actually have rows get an output file. A fixed range such as
# range(2002, 2022) also writes a file for every year without a conference.
years = sorted(all_iccv_data['conference_year'].unique().tolist())

for year in years:
    year_data = all_iccv_data[all_iccv_data['conference_year'] == year]

    output_file = f'iccv_data_{year}.json'

    # orient='records' writes one JSON object per row and leaves the index out.
    # `index=False` is not supported together with the default 'columns'
    # orient, so it cannot be used to drop the index here.
    year_data.to_json(output_file, orient='records')

## **Extract Author Names to JSON per Year**

In [20]:
# Maps conference year -> list of distinct author names for that year
# (in order of first appearance in the table).
authors_per_year = {}

for year in years:
    year_data = all_iccv_data[all_iccv_data['conference_year'] == year]

    # Skip years without any rows instead of writing a file containing `[]`.
    if year_data.empty:
        continue

    unique_authors = year_data['author_name'].unique().tolist()
    # Keep the result in memory as well; the dict was previously created but
    # never filled.
    authors_per_year[year] = unique_authors

    output_file = f'iccv_authors_{year}.json'

    json_data = [{'name': value} for value in unique_authors]

    with open(output_file, 'w') as json_file:
        json.dump(json_data, json_file, indent=2)