In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [3]:
def extract_data_from_url(url, conference_year, timeout=30):
    """Scrape one proceedings page into a long-format paper/author table.

    The page at ``url`` is downloaded and every ``.tts-content`` element is
    treated as one paper entry. One row is produced per (paper, author) pair.

    Parameters
    ----------
    url : str
        Address of the proceedings page to download.
    conference_year : int
        Value stored in the 'Conference Year' column of every row.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        Columns 'Paper Title', 'Author Names', 'Author URLs',
        'Conference Name' (always 'WACV') and 'Conference Year'.
        'Author URLs' is None for an author without a profile link.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page
    # and silently returning an empty table for that year.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    data = {
        'Paper Title': [],
        'Author Names': [],
        'Author URLs': [],
        'Conference Name': 'WACV',
        'Conference Year': conference_year
    }

    for paper in soup.select('.tts-content'):
        title_element = paper.select_one('.title')
        if title_element is None:
            # Entry without a title node: nothing meaningful to record.
            continue
        title = title_element.text.strip()

        for author_element in paper.select('span[itemprop="author"]'):
            name_element = author_element.select_one('span[itemprop="name"]')
            if name_element is None:
                continue

            # Look the link up inside this author's own element rather than
            # zipping two independent lists, which misaligns (or truncates)
            # names and URLs whenever their counts differ.
            # NOTE(review): assumes the profile link is nested in the author
            # span - confirm against the page markup.
            link_element = author_element.select_one('a[href^="https://dblp.org/pid/"]')
            link = link_element['href'] if link_element is not None else None

            data['Paper Title'].append(title)
            data['Author Names'].append(name_element.text.strip())
            data['Author URLs'].append(link)

    # The scalar conference name/year are broadcast to every row.
    return pd.DataFrame(data)


In [4]:
# (page URL, conference year) pairs, one per proceedings page to scrape.
# 2006 and 2010 are deliberately absent from the list of years.
url_conference_years = [
    (f'https://dblp.org/db/conf/wacv/wacv{year}.html', year)
    for year in (2005, 2007, 2008, 2009, *range(2011, 2024))
]

In [5]:
# Scrape every listed proceedings page; one DataFrame per conference year,
# in the same order as url_conference_years.
dfs = [
    extract_data_from_url(page_url, year)
    for page_url, year in url_conference_years
]

In [9]:
# Stack the per-year frames row-wise and renumber the rows from 0 so the
# index is unique across years.
all_wacv_data = pd.concat(objs=dfs, axis=0, ignore_index=True)
all_wacv_data

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Image Segmentation by Unsupervised Sparse Clus...,Byoung-Ki Jeon,https://dblp.org/pid/86/3989.html,WACV,2005
1,Image Segmentation by Unsupervised Sparse Clus...,Yun-Beom Jung,https://dblp.org/pid/33/1342.html,WACV,2005
2,Image Segmentation by Unsupervised Sparse Clus...,Ki-Sang Hong,https://dblp.org/pid/26/5507.html,WACV,2005
3,3D Recognition and Segmentation of Objects in ...,Ajmal S. Mian,https://dblp.org/pid/63/807.html,WACV,2005
4,3D Recognition and Segmentation of Objects in ...,Mohammed Bennamoun,https://dblp.org/pid/00/3214.html,WACV,2005
...,...,...,...,...,...
13374,Mapping DNN Embedding Manifolds for Network Ge...,Mathias Unberath,https://dblp.org/pid/165/8137.html,WACV,2023
13375,Mapping DNN Embedding Manifolds for Network Ge...,Aria Pezeshk,https://dblp.org/pid/43/8760.html,WACV,2023
13376,Mapping DNN Embedding Manifolds for Network Ge...,Greg Hager,https://dblp.org/pid/12/5814.html,WACV,2023
13377,Federated Learning for Commercial Image Sources.,Shreyansh Jain,https://dblp.org/pid/254/1516.html,WACV,2023


In [10]:
# Persist the combined table; index=False keeps the row numbers out of the file.
all_wacv_data.to_csv(path_or_buf='wacv_paper_author.csv', index=False)

## **Split the Combined CSV into One CSV per Year**

In [15]:
# Take the years from the data itself rather than a hand-maintained range
# plus skip list, so years absent from the data are never visited and the
# cell needs no editing when the set of scraped years changes.
years = sorted(all_wacv_data['Conference Year'].unique().tolist())

for year in years:
    # All rows (one per paper/author pair) belonging to this conference year.
    year_data = all_wacv_data[all_wacv_data['Conference Year'] == year]

    output_file = f'wacv_data_{year}.csv'

    # index=False keeps the row numbers of the combined table out of the file.
    year_data.to_csv(output_file, index=False)

## **Extract Author Names as JSON**

In [16]:
# Take the years from the data itself rather than a hand-maintained range
# plus skip list, so years absent from the data are never visited.
years = sorted(all_wacv_data['Conference Year'].unique().tolist())

# Unique author names keyed by conference year, kept in memory in addition
# to the JSON files written below.
authors_per_year = {}

for year in years:
    year_data = all_wacv_data[all_wacv_data['Conference Year'] == year]

    # De-duplicate by name, keeping first-appearance order.
    unique_authors = year_data['Author Names'].unique()
    authors_per_year[year] = unique_authors.tolist()

    output_file = f'wacv_authors_{year}.json'

    json_data = [{'name': value} for value in unique_authors]

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=2)