In [1]:
import re
import json
import pandas as pd

In [2]:
def is_peer_reviewed(comment):

    '''
    Heuristically decide whether an arXiv author comment hints at peer review.

    The comment is searched, case-insensitively, for any of the substrings
    "accept", "publish" or "present", or for one of the years 2023, 2024
    or 2025. A hit anywhere in the text counts as an indicator.

    Parameters:
    ----------------
    comment: String, contains any author comments posted along with the arXiv metadata.
             Non-string values (e.g. None or NaN standing in for a missing comment)
             are treated as containing no indicator.

    Returns:
    ----------------
    peer_reviewed: Boolean, True if an indicator is found in the comment, False otherwise
    '''

    # re.search raises TypeError on non-string input, so a missing comment
    # (None, NaN, ...) is reported as "not peer reviewed" instead of crashing.
    if not isinstance(comment, str):
        return False

    # Alternation of the peer review indicators. No trailing '.*' is needed:
    # re.search already succeeds as soon as one keyword occurs anywhere.
    pattern = r'accept|publish|present|2023|2024|2025'

    # re.search returns a match object on success and None otherwise
    return re.search(pattern, comment, re.IGNORECASE) is not None




In [3]:
# Read in the IDs w/ the metadata -- change filename to the relevant one.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; the with-statement closes the file on exit,
# so no explicit close() call is needed.
with open('../data/raw/240301_240401_arxiv_metadata.json', encoding='utf-8') as json_data:
    metadata = json.load(json_data)

# Convert JSON to DataFrame: every top-level key becomes one row, and the
# keys themselves are moved out of the index into an 'arxiv_id' column.
metadata_df = (
    pd.DataFrame.from_dict(metadata, orient='index')
    .rename_axis('arxiv_id')
    .reset_index()
)

In [4]:
# Preview the first rows of the freshly loaded metadata
metadata_df.head()

Unnamed: 0,arxiv_id,title,authors,comments,subjects,abstract
0,1404.0736,Exploiting Linear Structure Within Convolution...,"[Remi Denton, Wojciech Zaremba, Joan Bruna, Ya...",,Computer Vision and Pattern Recognition (cs.CV),We present techniques for speeding up the te...
1,1607.01327,Feature Selection Library (MATLAB Toolbox),[Giorgio Roffo],Feature Selection Library (FSLib) 2024,Computer Vision and Pattern Recognition (cs.CV),The Feature Selection Library (FSLib) introduc...
2,1610.00291,Deep Feature Consistent Variational Autoencoder,"[Xianxu Hou, Linlin Shen, Ke Sun, Guoping Qiu]",WACV,Computer Vision and Pattern Recognition (cs.CV),We present a novel method for constructing Var...
3,1611.0643,Semi-Supervised Learning with Context-Conditio...,"[Remi Denton, Sam Gross, Rob Fergus]",,Computer Vision and Pattern Recognition (cs.CV),We introduce a simple semi-supervised learni...
4,1611.06544,Stochastic Agent-Based Models of Intimate Part...,"[Elisa Guidi, Patrizia Meringolo, Andrea Guazz...",,Social and Information Networks (cs.SI),Intimate partner violence (IPV) is a significa...


In [5]:
# (rows, columns) of the loaded metadata at this point in the notebook
metadata_df.shape

(7119, 6)

In [6]:
# Flag each paper via is_peer_reviewed on its author comment, keep only the
# flagged rows, and renumber the index from zero.
metadata_df = (
    metadata_df
    .assign(peer_reviewed=metadata_df['comments'].apply(is_peer_reviewed))
    .loc[lambda frame: frame['peer_reviewed']]
    .reset_index(drop=True)
)


In [7]:
# (rows, columns) after keeping only rows whose 'peer_reviewed' flag is True
metadata_df.shape

(2081, 7)

In [10]:
# Count how many rows carry each 'subjects' label
metadata_df['subjects'].value_counts()

Computer Vision and Pattern Recognition (cs.CV)    752
Computation and Language (cs.CL)                   419
Machine Learning (cs.LG)                           324
Robotics (cs.RO)                                   275
Human-Computer Interaction (cs.HC)                 110
Artificial Intelligence (cs.AI)                     68
Information Retrieval (cs.IR)                       63
Computers and Society (cs.CY)                       35
Social and Information Networks (cs.SI)             19
Networking and Internet Architecture (cs.NI)        16
Name: subjects, dtype: int64

In [None]:
# Preview the first rows of the current metadata_df
metadata_df.head()