In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from src.utils import is_peer_reviewed


In [2]:
# Read in all JSON files, concatenate them into a single DataFrame
json_files = [
    'data/arxiv_metadata/240301_240310_all_cats_arxiv_metadata.json', 
    'data/arxiv_metadata/240310_240320_all_cats_arxiv_metadata.json', 
    'data/arxiv_metadata/240320_240325_all_cats_arxiv_metadata.json',
    'data/arxiv_metadata/240325_240401_all_cats_arxiv_metadata.json',
]

# Initialize an empty list to hold DataFrames
dataframes = []

# Loop through the JSON files and read them into DataFrames
for json_file in json_files:
    df = pd.read_json(json_file, orient='index')
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
march_arxiv_cs = pd.concat(dataframes)
# Reset index and name it 'arxiv_id'
march_arxiv_cs.reset_index(inplace=True)
march_arxiv_cs.rename(columns={'index': 'arxiv_id'}, inplace=True)

# Drop duplicates from the date overlaps
march_arxiv_cs.drop_duplicates(subset='arxiv_id', inplace=True)

march_arxiv_cs.shape


(15627, 12)

In [3]:
march_arxiv_cs.head()

Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref
0,0710.3901v3,http://arxiv.org/abs/0710.3901v3,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices tha...,2024-03-01T16:03:35Z,2007-10-21T03:30:05Z,"[Derek Corneil, Michel Habib, Christophe Paul,...",An EA of this work appeared in ICALP'08. The a...,[cs.DM],cs.DM,,
1,1505.02681v2,http://arxiv.org/abs/1505.02681v2,Socio-Spatial Group Queries for Impromptu Acti...,The development and integration of social netw...,2015-05-13T10:35:11Z,2015-05-11T15:58:31Z,"[Chih-Ya Shen, De-Nian Yang, Liang-Hao Huang, ...",,"[cs.DS, cs.DB]",cs.DS,10.1109/TKDE.2015.2468726,
2,1607.06444v4,http://arxiv.org/abs/1607.06444v4,The Complexity of Drawing Graphs on Few Lines ...,It is well known that any graph admits a cross...,2024-03-01T10:30:02Z,2016-07-21T19:50:36Z,"[Steven Chaplick, Krzysztof Fleszar, Fabian Li...",A preliminary version appeared in Proc. WADS 2017,"[cs.CC, cs.CG]",cs.CC,10.7155/jgaa.00630,"Journal of Graph Algorithms and Applications, ..."
3,1611.06544v1,http://arxiv.org/abs/1611.06544v1,Stochastic Agent-Based Models of Intimate Part...,Intimate partner violence (IPV) is a significa...,2016-11-20T16:53:18Z,2016-11-20T16:53:18Z,"[Elisa Guidi, Patrizia Meringolo, Andrea Guazz...",,[cs.SI],cs.SI,10.19272/201711402005,Intimate Partner Violence: A Stochastic Model....
4,1710.01837v5,http://arxiv.org/abs/1710.01837v5,Postquantum Brègman relative entropies,We develop a new approach to construction of B...,2024-03-01T10:29:12Z,2017-10-05T00:40:11Z,[Ryszard Paweł Kostecki],v3: paper rewritten from scratch; parts of v2 ...,"[math-ph, cs.IT, math.IT, math.MP]",math-ph,,


In [4]:
# Convert 'updated' and 'published' columns to datetime
march_arxiv_cs['updated'] = pd.to_datetime(march_arxiv_cs['updated'])
march_arxiv_cs['published'] = pd.to_datetime(march_arxiv_cs['published'])

march_arxiv_cs.head(3)

Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref
0,0710.3901v3,http://arxiv.org/abs/0710.3901v3,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices tha...,2024-03-01 16:03:35+00:00,2007-10-21 03:30:05+00:00,"[Derek Corneil, Michel Habib, Christophe Paul,...",An EA of this work appeared in ICALP'08. The a...,[cs.DM],cs.DM,,
1,1505.02681v2,http://arxiv.org/abs/1505.02681v2,Socio-Spatial Group Queries for Impromptu Acti...,The development and integration of social netw...,2015-05-13 10:35:11+00:00,2015-05-11 15:58:31+00:00,"[Chih-Ya Shen, De-Nian Yang, Liang-Hao Huang, ...",,"[cs.DS, cs.DB]",cs.DS,10.1109/TKDE.2015.2468726,
2,1607.06444v4,http://arxiv.org/abs/1607.06444v4,The Complexity of Drawing Graphs on Few Lines ...,It is well known that any graph admits a cross...,2024-03-01 10:30:02+00:00,2016-07-21 19:50:36+00:00,"[Steven Chaplick, Krzysztof Fleszar, Fabian Li...",A preliminary version appeared in Proc. WADS 2017,"[cs.CC, cs.CG]",cs.CC,10.7155/jgaa.00630,"Journal of Graph Algorithms and Applications, ..."


In [5]:
# Count rows
print(march_arxiv_cs.shape)

# First filter: only keep papers that were published OR updated in 2024
march_arxiv_cs = march_arxiv_cs[
    (march_arxiv_cs['updated'].dt.year == 2024) | 
    (march_arxiv_cs['published'].dt.year == 2024)]
# Additional filter: only keep papers that were published OR updated in March
march_arxiv_cs = march_arxiv_cs[
    (march_arxiv_cs['updated'].dt.month == 3) | 
    (march_arxiv_cs['published'].dt.month == 3)]


# Count rows
print(march_arxiv_cs.shape)
march_arxiv_cs.head(3)


(15627, 12)
(13922, 12)


Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref
0,0710.3901v3,http://arxiv.org/abs/0710.3901v3,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices tha...,2024-03-01 16:03:35+00:00,2007-10-21 03:30:05+00:00,"[Derek Corneil, Michel Habib, Christophe Paul,...",An EA of this work appeared in ICALP'08. The a...,[cs.DM],cs.DM,,
2,1607.06444v4,http://arxiv.org/abs/1607.06444v4,The Complexity of Drawing Graphs on Few Lines ...,It is well known that any graph admits a cross...,2024-03-01 10:30:02+00:00,2016-07-21 19:50:36+00:00,"[Steven Chaplick, Krzysztof Fleszar, Fabian Li...",A preliminary version appeared in Proc. WADS 2017,"[cs.CC, cs.CG]",cs.CC,10.7155/jgaa.00630,"Journal of Graph Algorithms and Applications, ..."
4,1710.01837v5,http://arxiv.org/abs/1710.01837v5,Postquantum Brègman relative entropies,We develop a new approach to construction of B...,2024-03-01 10:29:12+00:00,2017-10-05 00:40:11+00:00,[Ryszard Paweł Kostecki],v3: paper rewritten from scratch; parts of v2 ...,"[math-ph, cs.IT, math.IT, math.MP]",math-ph,,


In [6]:
# Second filter: only keep papers that are in selected primary categories
desirable_prim_cats = ['cs.HC', 'cs.CY', 'cs.AI']
march_arxiv_cs_prim_cats = march_arxiv_cs[march_arxiv_cs['primary_category'].isin(desirable_prim_cats)]

# Count rows
print(march_arxiv_cs_prim_cats.shape)


(968, 12)


In [7]:

# Third filter: peer-reviewed only
# Fill in NaN values in 'comments' column with empty string
march_arxiv_cs_prim_cats['comments'] = march_arxiv_cs_prim_cats['comments'].fillna('')
# Add column 'peer_reviewed' to DataFrame
march_arxiv_cs_prim_cats['peer_reviewed'] = march_arxiv_cs_prim_cats['comments'].apply(is_peer_reviewed)
# Drop rows where 'peer_reviewed' is False
march_arxiv_cs_prim_cats = march_arxiv_cs_prim_cats[march_arxiv_cs_prim_cats['peer_reviewed'] == True]
# Reset index
march_arxiv_cs_prim_cats.reset_index(drop=True, inplace=True)
# Count rows
print(march_arxiv_cs_prim_cats.shape)

(254, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  march_arxiv_cs_prim_cats['comments'] = march_arxiv_cs_prim_cats['comments'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  march_arxiv_cs_prim_cats['peer_reviewed'] = march_arxiv_cs_prim_cats['comments'].apply(is_peer_reviewed)


In [8]:
march_arxiv_cs_prim_cats.head()

Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref,peer_reviewed
0,2107.04771v2,http://arxiv.org/abs/2107.04771v2,Similar Cases Recommendation using Legal Knowl...,A legal knowledge graph constructed from court...,2024-03-02 08:46:51+00:00,2021-07-10 06:37:36+00:00,"[Jaspreet Singh Dhani, Ruchika Bhatt, Balaji G...",10 pages. 6 figures. 3rd Symposium on Artifici...,[cs.AI],cs.AI,,,True
1,2301.13755v3,http://arxiv.org/abs/2301.13755v3,Retrosynthetic Planning with Dual Value Networks,"Retrosynthesis, which aims to find a route to ...",2024-03-03 14:23:21+00:00,2023-01-31 16:43:53+00:00,"[Guoqing Liu, Di Xue, Shufang Xie, Yingce Xia,...",Accepted to ICML 2023,"[cs.AI, cs.LG]",cs.AI,,,True
2,2306.00292v4,http://arxiv.org/abs/2306.00292v4,Sustainable AI Regulation,"Current proposals for AI regulation, in the EU...",2024-03-06 16:57:25+00:00,2023-06-01 02:20:48+00:00,[Philipp Hacker],Privacy Law Scholars Conference 2023; Common M...,"[cs.CY, I.2]",cs.CY,,,True
3,2306.00919v5,http://arxiv.org/abs/2306.00919v5,Learning About Social Context from Smartphone ...,Understanding how social situations unfold in ...,2024-03-01 13:48:48+00:00,2023-06-01 17:20:56+00:00,"[Aurel Ruben Mader, Lakmal Meegahapola, Daniel...",Accepted at ACM CHI 2024,"[cs.HC, cs.CY]",cs.HC,10.1145/3613904.3642444,,True
4,2309.03685v2,http://arxiv.org/abs/2309.03685v2,PyGraft: Configurable Generation of Synthetic ...,Knowledge graphs (KGs) have emerged as a promi...,2024-03-05 21:56:43+00:00,2023-09-07 13:00:09+00:00,"[Nicolas Hubert, Pierre Monnin, Mathieu d'Aqui...",Accepted in ESWC 2024,"[cs.AI, cs.SE]",cs.AI,,,True


In [9]:
march_arxiv_cs_prim_cats['primary_category'].value_counts()

primary_category
cs.HC    116
cs.AI    102
cs.CY     36
Name: count, dtype: int64

In [10]:
# Write DataFrame to JSON
march_arxiv_cs_prim_cats.to_json(
    'data/arxiv_metadata/filtered/march_2024_ai_hc_cy_peer_reviewed.json', 
    orient='index'
)

In [11]:
# Set a seed for reproducibility
np.random.seed(10)

#Sample 1/4 of the data -- do strateified sampling to presereve the distribution of primary categories
march_arxiv_cs_prim_cats_sampled = march_arxiv_cs_prim_cats.groupby(
    'primary_category', group_keys=False).apply(lambda x: x.sample(frac=0.25))
march_arxiv_cs_prim_cats_sampled.reset_index(drop=True, inplace=True)
march_arxiv_cs_prim_cats_sampled.shape


  march_arxiv_cs_prim_cats_sampled = march_arxiv_cs_prim_cats.groupby(


(64, 13)

In [12]:
march_arxiv_cs_prim_cats_sampled['primary_category'].value_counts()

primary_category
cs.HC    29
cs.AI    26
cs.CY     9
Name: count, dtype: int64

In [13]:
# Write DataFrame to JSON
march_arxiv_cs_prim_cats_sampled.to_json(
    'data/arxiv_metadata/filtered/march_2024_ai_hc_cy_peer_reviewed_sampled.json', 
    orient='index'
)

In [14]:
# Also write the DataFrame to CSV
march_arxiv_cs_prim_cats_sampled.to_csv(
    'data/arxiv_metadata/filtered/march_2024_ai_hc_cy_peer_reviewed_sampled.csv', 
    index=False
)