# Wrangling author data
## About the data
- 2015 - 2021: Separate datasets for proceedings (full papers) and adjunct proceedings (demos, posters, student innovation contest submissions)

In [1]:
import pandas as pd
import os

In [6]:
# Function for outputting summary stats

def summarize(df: pd.DataFrame, year: int) -> None:
    """Print summary counts for one year's proceedings author table.

    Prints the total number of rows (each row is reported as one author
    entry), the number of distinct authors and, for years before 2019, the
    number of distinct paper titles.

    Args:
        df: Author table. Must contain the 'First Name', 'Middle Initial'
            and 'Last Name' columns, plus 'Paper Title' when ``year < 2019``.
        year: Proceedings year; shown in the header and used to decide
            whether paper titles are counted.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    print(f"Summary for {year}:\n")

    # Skip for 2019 - 2021
    if year < 2019:
        # dropna=False counts a missing title as one value, like pd.unique does.
        num_unique_papers = df['Paper Title'].nunique(dropna=False)
        print(f"Number of unique papers: {num_unique_papers}\n")

    # Keep the name parts in separate columns rather than gluing them into a
    # single string: without a delimiter, ("Ann", "A", "Lee") and
    # ("AnnA", "", "Lee") would both become "AnnALee" and be counted as one
    # author.
    name_parts = pd.DataFrame({
        'first': df['First Name'].astype(str),
        'middle': df['Middle Initial'].fillna('').astype(str),
        'last': df['Last Name'].astype(str),
    })
    num_unique_authors = len(name_parts.drop_duplicates())
    print(f"Number of authors: {len(name_parts)}\n")
    print(f"Number of unique authors: {num_unique_authors}\n")
    print("-------")

In [7]:
# Directory holding one raw proceedings CSV per conference year.
proc_data_repo = "./data/raw_author_data/"

# Year -> CSV file name; every file is named "uist<year>_proceedings.csv".
file_names = {yr: f"uist{yr}_proceedings.csv" for yr in range(2015, 2022)}

# Read each year's CSV in chronological order and print its summary stats.
for year, csv_name in file_names.items():
    df = pd.read_csv(os.path.join(proc_data_repo, csv_name))
    summarize(df, year)

Summary for 2015:

Number of unique papers: 70

Number of authors: 286

Number of unique authors: 265

-------
Summary for 2016:

Number of unique papers: 79

Number of authors: 413

Number of unique authors: 374

-------
Summary for 2017:

Number of unique papers: 74

Number of authors: 74

Number of unique authors: 69

-------
Summary for 2018:

Number of unique papers: 82

Number of authors: 359

Number of unique authors: 341

-------
Summary for 2019:

Number of authors: 442

Number of unique authors: 389

-------
Summary for 2020:

Number of authors: 472

Number of unique authors: 416

-------
Summary for 2021:

Number of authors: 472

Number of unique authors: 420

-------


In [None]:
# For each year, create a new dataframe with columns: first name, middle, last name, email, year
# Ensure that first + middle + last is a unique identifier --> print the names that are de-duplicated
# NOTE(review): unfinished placeholder -- `proc_2015` is not assigned in any
# visible cell, and the empty subscript `[]` is a syntax error, so this cell
# cannot run as written. TODO: select the intended columns here.
proc_2015 = proc_2015[]

# Iteratively merge each year
# NOTE(review): `left`, `right`, "key1" and "key2" are not defined in any
# visible cell; this looks like a template call that still has to be adapted.
# TODO: replace with the per-year frames and the real join columns.
result = pd.merge(left, right, how="left", on=["key1", "key2"])

In [None]:

# Combine all the years into one dataframe, de-duplicate authors (first + middle + last)
# NOTE(review): placeholder -- `df` is None at this point, so the
# drop_duplicates call below raises AttributeError until the combined frame
# is actually built.
df = None # all the years
# Keep the latest year email
# NOTE(review): 'brand' and 'style' are not columns used anywhere else in this
# notebook -- presumably leftover example names; confirm and replace with the
# name columns. The result of drop_duplicates is not assigned here, and
# keep='last' keeps the last occurrence in row order, so "latest year" assumes
# rows are ordered by year -- TODO confirm.
df.drop_duplicates(subset=['brand', 'style'], keep='last')


In [None]:
# Summary statistics for all years combined
# Total number of papers
# Unique set & number of authors
# Unique set & number of institutions
# Number of times an author has published 2015 - 2021
# Number of times an institution has published 2015 - 2021

In [None]:
# Send survey

# Gather data

# Wrangle collected data