In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [4]:
# Fetch and parse the HTML of a single dblp proceedings page (BMVC 2022).
# NOTE(review): the original note said this takes about 10-15 minutes; that
# presumably refers to scraping every edition further down, not to this single
# request (which uses a 10 s timeout) -- confirm.
url = 'https://dblp.org/db/conf/bmvc/bmvc2022.html'

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error fetching content: {e}")
    # Keep `soup` defined but empty: parsing must not run on a failed request,
    # where `response` is either undefined (fresh kernel) or an error page.
    soup = None
else:
    soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
def extract_data_from_url(url, conference_year, conference_name='BMVC', headers=None, timeout=30):
    """Scrape one proceedings page into a table with one row per (paper, author).

    Args:
        url: Address of the proceedings page to download.
        conference_year: Value stored in the 'Conference Year' column of every row.
        conference_name: Value stored in the 'Conference Name' column
            (defaults to 'BMVC', the value that was previously hard-coded).
        headers: Optional dict of HTTP headers passed to ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its timeout so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with the columns 'Paper Title', 'Author Names',
        'Author URLs', 'Conference Name' and 'Conference Year'.
        'Author URLs' is None for an author without a dblp pid link.

    Raises:
        requests.exceptions.RequestException: if the request fails or the server
            answers with an HTTP error status (previously an error page was
            parsed silently, yielding an empty table for that year).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    data = {
        'Paper Title': [],
        'Author Names': [],
        'Author URLs': [],
        'Conference Name': conference_name,
        'Conference Year': conference_year
    }

    for paper in soup.select('.tts-content'):
        title_element = paper.select_one('.title')
        if title_element is None:
            # Entry without a title node: nothing meaningful to record.
            continue
        title = title_element.text.strip()

        for author_element in paper.select('span[itemprop="author"]'):
            name_element = author_element.select_one('span[itemprop="name"]')
            if name_element is None:
                continue

            # Look the link up inside this author's own element rather than
            # zipping two independently selected lists, which shifts or drops
            # URLs as soon as the two lists differ in length.
            # NOTE(review): assumes the pid anchor is nested in the author span -- confirm against the page markup.
            link_element = author_element.select_one('a[href^="https://dblp.org/pid/"]')
            link = link_element['href'] if link_element is not None else None

            data['Paper Title'].append(title)
            data['Author Names'].append(name_element.text.strip())
            data['Author URLs'].append(link)

    # The scalar name/year entries are broadcast to every row by pandas.
    return pd.DataFrame(data)


In [7]:
# (proceedings page URL, conference year) pairs for every edition from 1990
# through 2022; the page URLs differ only in the year they embed.
url_conference_years = [
    (f'https://dblp.org/db/conf/bmvc/bmvc{edition}.html', edition)
    for edition in range(1990, 2023)
]

In [8]:
# Scrape each edition in list order, collecting one DataFrame per conference year.
dfs = [
    extract_data_from_url(page_url, edition_year)
    for page_url, edition_year in url_conference_years
]

In [14]:
# Stack the per-year frames row-wise into one table, renumbering the index.
all_bmvc_data = pd.concat(objs=dfs, axis=0, ignore_index=True)
all_bmvc_data

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Proceedings of the British Machine Vision Conf...,Andrew C. Sleigh,https://dblp.org/pid/129/0801.html,BMVC,1990
1,Technology innovations and product design issu...,S. White,https://dblp.org/pid/71/3665.html,BMVC,1990
2,On the computational neurobiology of curve det...,Steven W. Zucker,https://dblp.org/pid/40/1041.html,BMVC,1990
3,On the computational neurobiology of curve det...,Allan Dobbins,https://dblp.org/pid/55/10795.html,BMVC,1990
4,On the computational neurobiology of curve det...,Lee Iverson,https://dblp.org/pid/38/4496.html,BMVC,1990
...,...,...,...,...,...
14296,Mutual Contrastive Low-rank Learning to Disent...,Chao Li,https://dblp.org/pid/66/190-31.html,BMVC,2022
14297,Sampling Based On Natural Image Statistics Imp...,Ricardo Kleinlein,https://dblp.org/pid/250/0791.html,BMVC,2022
14298,Sampling Based On Natural Image Statistics Imp...,Alexander Hepburn,https://dblp.org/pid/238/2087.html,BMVC,2022
14299,Sampling Based On Natural Image Statistics Imp...,Raúl Santos-Rodríguez,https://dblp.org/pid/24/7253.html,BMVC,2022


In [15]:
# Persist the combined table as a single CSV, leaving out the row index.
all_bmvc_data.to_csv(path_or_buf='bmvc_paper_authors.csv', index=False)

## **Split into separate CSVs per Year**

In [20]:
years = range(1990, 2023)  # Update this range based on your data

# Write one CSV per year containing only the rows of that conference year.
for year in years:
    output_file = f'bmvc_data_{year}.csv'
    year_data = all_bmvc_data.loc[all_bmvc_data['Conference Year'].eq(year)]
    year_data.to_csv(output_file, index=False)

## **Extract Author Names to JSON per Year**

In [21]:
# Maps each conference year to the list of distinct author names seen that year.
# (Previously this dict was created but never filled.)
authors_per_year = {}

for year in years:
    year_data = all_bmvc_data[all_bmvc_data['Conference Year'] == year]

    # Distinct names in order of first appearance.
    unique_authors = year_data['Author Names'].unique()
    authors_per_year[year] = unique_authors.tolist()

    output_file = f'bmvc_authors_{year}.json'

    json_data = [{'name': value} for value in unique_authors]

    # Explicit encoding so the result does not depend on the platform default.
    # json.dump keeps its default ASCII escaping, so file contents are unchanged.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=2)