In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [2]:
def extract_data_from_url(url, conference_year, timeout=30):
    """Scrape one proceedings page into a table with one row per (paper, author).

    Every element matching ``.tts-content`` is treated as one paper. Its title
    is read from the ``.title`` child and its authors from the
    ``span[itemprop="author"]`` children.

    Parameters
    ----------
    url : str
        Address of the proceedings page to download.
    conference_year : int
        Value stored in the ``Conference Year`` column of every row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``Paper Title``, ``Author Names``, ``Author URLs``,
        ``Conference Name`` (always ``'CVPR'``) and ``Conference Year``.
        The frame is empty when the page cannot be fetched or has no papers.
    """
    import warnings

    columns = ['Paper Title', 'Author Names', 'Author URLs',
               'Conference Name', 'Conference Year']

    # Without an explicit timeout a stalled connection blocks the cell forever.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Warn instead of raising so a loop over many pages keeps going, but
        # never parse an error page as if it were proceedings.
        warnings.warn(
            f'Skipping {url}: HTTP {response.status_code}; returning an empty table',
            stacklevel=2,
        )
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')

    records = []
    for paper in soup.select('.tts-content'):
        title_element = paper.select_one('.title')
        if title_element is None:
            # Entry without a title element: nothing to attribute authors to.
            continue
        title = title_element.text.strip()

        for author_element in paper.select('span[itemprop="author"]'):
            name_element = author_element.select_one('span[itemprop="name"]')
            if name_element is None:
                continue

            # Look the profile link up inside this author's own element rather
            # than pairing authors with paper-level links by position: one
            # author without a link would otherwise shift every later URL.
            # NOTE(review): assumes the pid link is nested in the author span.
            link_element = author_element.select_one('a[href^="https://dblp.org/pid/"]')

            records.append({
                'Paper Title': title,
                'Author Names': name_element.text.strip(),
                'Author URLs': link_element['href'] if link_element is not None else None,
                'Conference Name': 'CVPR',
                'Conference Year': conference_year,
            })

    # Passing the column list keeps the layout stable even with zero records.
    return pd.DataFrame(records, columns=columns)


## **Webscrape 1988-2000 CVPR**

In [63]:
# One (page URL, conference year) pair per year from 1988 to 2000; 1990 is not listed.
dblp_page_template = 'https://dblp.org/db/conf/cvpr/cvpr{}.html'
url_conference_years = [
    (dblp_page_template.format(year), year)
    for year in range(1988, 2001)
    if year != 1990
]

In [64]:
# Scrape each listed page; the result holds one DataFrame per page, in list order.
dfs = [
    extract_data_from_url(url, conference_year)
    for url, conference_year in url_conference_years
]

In [65]:
# Stack the per-page frames row-wise and renumber the index from 0, then show the result.
combined_df_1988_2000 = pd.concat(objs=dfs, axis=0, ignore_index=True)
combined_df_1988_2000

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Generalizing epipolar-plane image analysis on ...,H. Harlyn Baker,https://dblp.org/pid/30/3020.html,CVPR,1988
1,Generalizing epipolar-plane image analysis on ...,Robert C. Bolles,https://dblp.org/pid/84/4626.html,CVPR,1988
2,A color metric for computer vision.,Glenn Healey,https://dblp.org/pid/49/3841.html,CVPR,1988
3,A color metric for computer vision.,Thomas O. Binford,https://dblp.org/pid/47/2095.html,CVPR,1988
4,Line-drawing interpretation: a mathematical fr...,Vishvjit S. Nalwa,https://dblp.org/pid/50/488.html,CVPR,1988
...,...,...,...,...,...
4113,Multi-Modality Model-Based Registration in the...,Richard D. White,https://dblp.org/pid/47/3635.html,CVPR,2000
4114,Visual Venture: Investigations with Images and...,Lisa M. Brown,https://dblp.org/pid/b/LisaMGBrown.html,CVPR,2000
4115,Invariant Web Defect Detection and Classificat...,Dominik R. Rohrmus,https://dblp.org/pid/18/5595.html,CVPR,2000
4116,Intel's Computer Vision Library: Applications ...,Gary R. Bradski,https://dblp.org/pid/b/GaryRBradski.html,CVPR,2000


## **Webscrape 2001-2008 CVPR**

In [57]:
# 2001 and 2003-2006 are listed as two pages each ("-1" and "-2");
# 2007 and 2008 have a single page each. 2002 is not listed.
dblp_page_prefix = 'https://dblp.org/db/conf/cvpr/cvpr'
url_conference_years = [
    (f'{dblp_page_prefix}{year}-{part}.html', year)
    for year in (2001, 2003, 2004, 2005, 2006)
    for part in (1, 2)
]
url_conference_years += [
    (f'{dblp_page_prefix}{year}.html', year)
    for year in (2007, 2008)
]

In [58]:
# Scrape each listed page; the result holds one DataFrame per page, in list order.
dfs = [
    extract_data_from_url(url, conference_year)
    for url, conference_year in url_conference_years
]

In [59]:
# Stack the per-page frames row-wise and renumber the index from 0, then show the result.
combined_df_2001_2008 = pd.concat(objs=dfs, axis=0, ignore_index=True)
combined_df_2001_2008

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Image Indexing with Mixture Hierarchies.,Nuno Vasconcelos,https://dblp.org/pid/78/4806.html,CVPR,2001
1,Small Sample Learning during Multimedia Retrie...,Xiang Sean Zhou,https://dblp.org/pid/z/XiangSeanZhou.html,CVPR,2001
2,Small Sample Learning during Multimedia Retrie...,Thomas S. Huang,https://dblp.org/pid/h/ThomasSHuang.html,CVPR,2001
3,Color Constant Ratio Gradients for Image Segme...,Theo Gevers,https://dblp.org/pid/12/6600.html,CVPR,2001
4,Color Constant Ratio Gradients for Image Segme...,Arnold W. M. Smeulders,https://dblp.org/pid/15/5400.html,CVPR,2001
...,...,...,...,...,...
7046,Human-assisted motion annotation.,William T. Freeman,https://dblp.org/pid/86/6650.html,CVPR,2008
7047,Human-assisted motion annotation.,Edward H. Adelson,https://dblp.org/pid/73/143.html,CVPR,2008
7048,Human-assisted motion annotation.,Yair Weiss,https://dblp.org/pid/44/1092.html,CVPR,2008
7049,Globally optimal bilinear programming for comp...,Manmohan Krishna Chandraker,https://dblp.org/pid/79/589.html,CVPR,2008


## **Webscrape 2009-2014 CVPR**

In [3]:
# (page URL, conference year) pairs for 2014 down to 2009, newest first
url_conference_years = [
    (f'https://dblp.org/db/conf/cvpr/cvpr{year}.html', year)
    for year in range(2014, 2008, -1)
]
    

In [4]:
# Scrape every listed page, tagging its rows with the matching conference year
dfs = [
    extract_data_from_url(url, conference_year)
    for url, conference_year in url_conference_years
]

In [36]:
# Stack the per-page frames row-wise and renumber the index from 0.
combined_df_2009_2014 = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [37]:
# Bare expression: shows the combined 2009-2014 table as this cell's output.
combined_df_2009_2014

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Fast and Accurate Image Matching with Cascade ...,Jian Cheng,https://dblp.org/pid/14/6145-1.html,CVPR,2014
1,Fast and Accurate Image Matching with Cascade ...,Cong Leng,https://dblp.org/pid/147/9188.html,CVPR,2014
2,Fast and Accurate Image Matching with Cascade ...,Jiaxiang Wu,https://dblp.org/pid/119/6799-1.html,CVPR,2014
3,Fast and Accurate Image Matching with Cascade ...,Hainan Cui,https://dblp.org/pid/151/8858.html,CVPR,2014
4,Fast and Accurate Image Matching with Cascade ...,Hanqing Lu,https://dblp.org/pid/39/6752.html,CVPR,2014
...,...,...,...,...,...
9353,A projective framework for radiometric image a...,Ping Tan,https://dblp.org/pid/61/6118.html,CVPR,2009
9354,A projective framework for radiometric image a...,Todd E. Zickler,https://dblp.org/pid/33/2279.html,CVPR,2009
9355,Beyond pairwise energies: Efficient optimizati...,Nikos Komodakis,https://dblp.org/pid/05/3782.html,CVPR,2009
9356,Beyond pairwise energies: Efficient optimizati...,Nikos Paragios,https://dblp.org/pid/p/NikosParagios.html,CVPR,2009


## **Webscrape 2015-2023 CVPR**

In [41]:
# (page URL, conference year) pairs for 2023 down to 2015, newest first
url_conference_years = [
    (f'https://dblp.org/db/conf/cvpr/cvpr{year}.html', year)
    for year in range(2023, 2014, -1)
]

In [42]:
# Scrape each listed page; the result holds one DataFrame per page, in list order.
data_2015_2023 = [
    extract_data_from_url(url, conference_year)
    for url, conference_year in url_conference_years
]

In [45]:
# Stack the per-page frames row-wise and renumber the index from 0, then show the result.
combined_df_2015_2023 = pd.concat(objs=data_2015_2023, axis=0, ignore_index=True)
combined_df_2015_2023

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Affordances from Human Videos as a Versatile R...,Shikhar Bahl,https://dblp.org/pid/223/4390.html,CVPR,2023
1,Affordances from Human Videos as a Versatile R...,Russell Mendonca,https://dblp.org/pid/215/5062.html,CVPR,2023
2,Affordances from Human Videos as a Versatile R...,Lili Chen,https://dblp.org/pid/92/169.html,CVPR,2023
3,Affordances from Human Videos as a Versatile R...,Unnat Jain,https://dblp.org/pid/199/2043.html,CVPR,2023
4,Affordances from Human Videos as a Versatile R...,Deepak Pathak,https://dblp.org/pid/155/9860.html,CVPR,2023
...,...,...,...,...,...
56544,Fine-grained recognition without part annotati...,Jianchao Yang,https://dblp.org/pid/96/3835.html,CVPR,2015
56545,Fine-grained recognition without part annotati...,Li Fei-Fei,https://dblp.org/pid/79/2528.html,CVPR,2015
56546,Robust reconstruction of indoor scenes.,Sungjoon Choi,https://dblp.org/pid/81/618.html,CVPR,2015
56547,Robust reconstruction of indoor scenes.,Qian-Yi Zhou,https://dblp.org/pid/74/6382.html,CVPR,2015


In [66]:
# Join the four period tables, newest period first, under one continuous index.
period_frames = [
    combined_df_2015_2023,
    combined_df_2009_2014,
    combined_df_2001_2008,
    combined_df_1988_2000,
]
all_cvpr_data = pd.concat(period_frames, ignore_index=True)
all_cvpr_data

Unnamed: 0,Paper Title,Author Names,Author URLs,Conference Name,Conference Year
0,Affordances from Human Videos as a Versatile R...,Shikhar Bahl,https://dblp.org/pid/223/4390.html,CVPR,2023
1,Affordances from Human Videos as a Versatile R...,Russell Mendonca,https://dblp.org/pid/215/5062.html,CVPR,2023
2,Affordances from Human Videos as a Versatile R...,Lili Chen,https://dblp.org/pid/92/169.html,CVPR,2023
3,Affordances from Human Videos as a Versatile R...,Unnat Jain,https://dblp.org/pid/199/2043.html,CVPR,2023
4,Affordances from Human Videos as a Versatile R...,Deepak Pathak,https://dblp.org/pid/155/9860.html,CVPR,2023
...,...,...,...,...,...
77071,Multi-Modality Model-Based Registration in the...,Richard D. White,https://dblp.org/pid/47/3635.html,CVPR,2000
77072,Visual Venture: Investigations with Images and...,Lisa M. Brown,https://dblp.org/pid/b/LisaMGBrown.html,CVPR,2000
77073,Invariant Web Defect Detection and Classificat...,Dominik R. Rohrmus,https://dblp.org/pid/18/5595.html,CVPR,2000
77074,Intel's Computer Vision Library: Applications ...,Gary R. Bradski,https://dblp.org/pid/b/GaryRBradski.html,CVPR,2000


In [68]:
# Write the full table to disk; index=False leaves the row index out of the file.
all_cvpr_data.to_csv(path_or_buf='cvpr_paper_author.csv', index=False)

## **Split the Combined CSV into One CSV per Year**

In [69]:
# Take the years from the data instead of a hand-maintained range: a fixed
# range writes header-only CSVs for years that have no rows (the scrape lists
# contain no 1990 or 2002 page) and has to be edited whenever a year is added.
years = sorted(all_cvpr_data['Conference Year'].unique())

for year in years:
    # All rows (one per paper/author pair) belonging to this conference year.
    year_data = all_cvpr_data[all_cvpr_data['Conference Year'] == year]

    output_file = f'cvpr_data_{year}.csv'

    # index=False keeps the row index out of the file.
    year_data.to_csv(output_file, index=False)

## **Extract Author Names as JSON**

In [71]:
# Take the years from the data so that a year with no rows (for example 2002,
# which has no page in the scrape lists) no longer produces a JSON file that
# only contains an empty list.
years = sorted(all_cvpr_data['Conference Year'].unique())

# NOTE(review): never filled in by this cell; kept in case other cells read it.
authors_per_year = {}

for year in years:
    # Explicit exclusions carried over unchanged from the original cell.
    if year in [1990, 1995]:
        continue

    year_data = all_cvpr_data[all_cvpr_data['Conference Year'] == year]

    # Each author once per year, in order of first appearance.
    unique_authors = year_data['Author Names'].unique()

    output_file = f'authors_{year}.json'

    json_data = [{'name': value} for value in unique_authors]

    # json.dump escapes non-ASCII characters by default; the explicit encoding
    # just keeps the file independent of the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=2)