In [2]:
import json
import pandas as pd
from datetime import datetime

In [3]:
def load_filtered_data(filepath, start_year, end_year):
    filtered_data = []
    with open(filepath, 'r') as f:
        for line in f:
            # Parse the JSON line
            paper = json.loads(line)
            
            # Get the update_date and convert it to a datetime object
            update_date = paper.get("update_date")
            if update_date:
                paper_date = datetime.strptime(update_date, "%Y-%m-%d")
                
                # Check if the paper is within the 2022-2024 range
                if start_year <= paper_date.year <= end_year:
                    # Extract only the required columns
                    filtered_data.append({
                        "title": paper.get("title"),
                        "abstract": paper.get("abstract"),
                        "update_date": paper.get("update_date"),
                        "authors_parsed": paper.get("authors_parsed"),
                        "categories": paper.get("categories"),
                        "submitter": paper.get("submitter"),
                        "journal-ref": paper.get("journal-ref")
                    })
    
    return filtered_data


In [4]:
file_path ="arxiv-metadata-oai-snapshot.json"

start_year = 2023
end_year = 2024

# Load the filtered data
filtered_data = load_filtered_data(file_path, start_year, end_year)

# Convert to DataFrame
df = pd.DataFrame(filtered_data)

FileNotFoundError: [Errno 2] No such file or directory: 'arxiv-metadata-oai-snapshot.json'

In [None]:
df.shape

(505932, 7)

In [None]:
df.head()


Unnamed: 0,title,abstract,update_date,authors_parsed,categories,submitter,journal-ref
0,What can emission lines tell us?,1 Generalities\n 2 Empirical diagnostics ba...,2023-06-07,"[[Stasinska, G., ]]",astro-ph,Grazyna Stasinska,
1,The Veldkamp Space of Two-Qubits,Given a remarkable representation of the gen...,2024-02-13,"[[Saniga, Metod, , ASTRINSTSAV], [Planat, Mich...",quant-ph math-ph math.MP,Metod Saniga,"SIGMA 3 (2007) 075, 7 pages"
2,Manifolds admitting a $\tilde G_2$-structure,We find a necessary and sufficient condition...,2023-03-06,"[[Le, Hong-Van, ]]",math.AT math.DG,HongVan Le,
3,Axiom A polynomial skew products of C^2 and th...,A polynomial skew product of C^2 is a map of...,2023-08-14,"[[DeMarco, Laura, ], [Hruska, Suzanne Lynch, ]]",math.DS,Suzanne Lynch Hruska,"Ergodic Theory and Dynamical Systems, Volume 2..."
4,Factor Analysis and Alternating Minimization,In this paper we make a first attempt at und...,2023-02-27,"[[Finesso, Lorenzo, ], [Spreij, Peter, ]]",math.PR math.OC,P. J. C. Spreij,"in Modeling, Estimation and Control, Festschri..."


In [None]:
df["categories"].value_counts

<bound method IndexOpsMixin.value_counts of 0                                  astro-ph
1                  quant-ph math-ph math.MP
2                           math.AT math.DG
3                                   math.DS
4                           math.PR math.OC
                        ...                
505927                             quant-ph
505928                     solv-int nlin.SI
505929    solv-int nlin.PS nlin.SI patt-sol
505930                     solv-int nlin.SI
505931                     solv-int nlin.SI
Name: categories, Length: 505932, dtype: object>

In [None]:
category_counts = df['categories'].value_counts()
filtered_counts = category_counts[category_counts > 500]

# Display the result
print(filtered_counts)

cs.CV                      24212
quant-ph                   11719
cs.CL                      11042
cs.LG                       9597
math.AP                     7103
                           ...  
astro-ph.CO astro-ph.GA      537
cs.CV cs.CL                  533
cs.CL cs.SD eess.AS          527
physics.geo-ph               517
hep-th hep-ph                515
Name: categories, Length: 147, dtype: int64


In [None]:
# Step 1: Get the counts of each category
category_counts = df['categories'].value_counts()

# Step 2: Filter to find categories with counts more than 500
categories_to_keep = category_counts[category_counts > 500].index

# Step 3: Filter the original DataFrame to keep only these categories
filtered_df = df[df['categories'].isin(categories_to_keep)]

# Display the resulting DataFrame
print(filtered_df)


                                                    title  \
1                        The Veldkamp Space of Two-Qubits   
3       Axiom A polynomial skew products of C^2 and th...   
8       Quasi Ordinary Singularities, Essential Diviso...   
9       Third Order Newton's Method for Zernike Polyno...   
12      Quantum Properties and Gravitational Field of ...   
...                                                   ...   
505918  Complete physical simulation of the entangling...   
505919                   Entanglement without nonlocality   
505920  Wave-Particle Duality in the Negative Informat...   
505924           Quantum Langevin Equations and Stability   
505927  Coloring the rational quantum sphere and the K...   

                                                 abstract update_date  \
1         Given a remarkable representation of the gen...  2024-02-13   
3         A polynomial skew product of C^2 is a map of...  2023-08-14   
8         We define Poincar\'e series associated

In [None]:
filtered_df.head()

Unnamed: 0,title,abstract,update_date,authors_parsed,categories,submitter,journal-ref
1,The Veldkamp Space of Two-Qubits,Given a remarkable representation of the gen...,2024-02-13,"[[Saniga, Metod, , ASTRINSTSAV], [Planat, Mich...",quant-ph math-ph math.MP,Metod Saniga,"SIGMA 3 (2007) 075, 7 pages"
3,Axiom A polynomial skew products of C^2 and th...,A polynomial skew product of C^2 is a map of...,2023-08-14,"[[DeMarco, Laura, ], [Hruska, Suzanne Lynch, ]]",math.DS,Suzanne Lynch Hruska,"Ergodic Theory and Dynamical Systems, Volume 2..."
8,"Quasi Ordinary Singularities, Essential Diviso...",We define Poincar\'e series associated to a ...,2024-05-01,"[[Perez, Pedro Daniel Gonzalez, ], [Hernando, ...",math.AG,Pedro Daniel Gonzalez Perez,"Journal of the London Mathematical Society, 79..."
9,Third Order Newton's Method for Zernike Polyno...,The Zernike radial polynomials are a system ...,2024-04-23,"[[Mathar, Richard J., ]]",math.NA cs.NA,Richard J. Mathar,
12,Quantum Properties and Gravitational Field of ...,We find that a field with oscillations of ma...,2024-04-22,"[[Yau, Hou Y., ]]",physics.gen-ph,Hou Yau,


In [None]:
filtered_df["categories"].value_counts()

cs.CV                      24212
quant-ph                   11719
cs.CL                      11042
cs.LG                       9597
math.AP                     7103
                           ...  
astro-ph.CO astro-ph.GA      537
cs.CV cs.CL                  533
cs.CL cs.SD eess.AS          527
physics.geo-ph               517
hep-th hep-ph                515
Name: categories, Length: 147, dtype: int64

In [None]:
pd.set_option('display.max_rows', None)  # This will allow displaying all rows

# Display the value counts of the categories in the filtered DataFrame
print(filtered_df['categories'].value_counts())


cs.CV                                  24212
quant-ph                               11719
cs.CL                                  11042
cs.LG                                   9597
math.AP                                 7103
cs.RO                                   6325
math.CO                                 6257
cs.LG cs.AI                             6175
astro-ph.GA                             5857
cond-mat.mtrl-sci                       5806
cs.CL cs.AI                             5554
math.NA cs.NA                           5194
hep-ph                                  4978
math.OC                                 4753
eess.SY cs.SY                           4517
cs.CV cs.AI                             4497
math.NT                                 4183
hep-th                                  3845
astro-ph.HE                             3632
eess.SP                                 3593
math.PR                                 3509
gr-qc                                   3467
cond-mat.m

In [None]:
import pandas as pd

# Define a mapping dictionary based on the categories you provided
category_map = {
    'cs.CV': 'Computer Science',
    'quant-ph': 'Physics',
    'cs.CL': 'Computer Science',
    'cs.LG': 'Computer Science',
    'math.AP': 'Mathematics',
    'cs.RO': 'Computer Science',
    'math.CO': 'Mathematics',
    'cs.LG cs.AI': 'Computer Science',
    'astro-ph.GA': 'Astronomy/Physics',
    'cond-mat.mtrl-sci': 'Physics',
    'cs.CL cs.AI': 'Computer Science',
    'math.NA cs.NA': 'Mathematics',
    'hep-ph': 'Physics',
    'math.OC': 'Mathematics',
    'eess.SY cs.SY': 'Electrical Engineering/Computer Science',
    'cs.CV cs.AI': 'Computer Science',
    'math.NT': 'Mathematics',
    'hep-th': 'Physics',
    'astro-ph.HE': 'Astronomy/Physics',
    'eess.SP': 'Electrical Engineering',
    'math.PR': 'Mathematics',
    'gr-qc': 'Physics',
    'cond-mat.mes-hall': 'Physics',
    'math.AG': 'Mathematics',
    'cs.CR': 'Computer Science',
    'cs.AI': 'Computer Science',
    'stat.ME': 'Statistics',
    'physics.flu-dyn': 'Physics',
    'cs.HC': 'Computer Science',
    'cs.SE': 'Computer Science',
    'cs.CV cs.LG': 'Computer Science',
    'eess.IV cs.CV': 'Electrical Engineering/Computer Science',
    'math.DG': 'Mathematics',
    'physics.optics': 'Physics',
    'cs.LG stat.ML': 'Computer Science',
    'math.DS': 'Mathematics',
    'cond-mat.str-el': 'Physics',
    'astro-ph.CO': 'Astronomy/Physics',
    'cs.IT math.IT': 'Computer Science/Mathematics',
    'math.FA': 'Mathematics',
    'astro-ph.SR': 'Astronomy',
    'astro-ph.EP': 'Astronomy',
    'stat.ML cs.LG': 'Computer Science/Statistics',
    'cs.CL cs.AI cs.LG': 'Computer Science',
    'cs.IT eess.SP math.IT': 'Computer Science/Mathematics/Electrical Engineering',
    'gr-qc hep-th': 'Physics',
    'cs.DS': 'Computer Science',
    'cond-mat.soft': 'Physics',
    'cs.CL cs.LG': 'Computer Science',
    'cs.DC': 'Computer Science',
    'cs.IR': 'Computer Science',
    'hep-ph hep-ex': 'Physics',
    'cs.CV cs.AI cs.LG': 'Computer Science',
    'cs.NI': 'Computer Science',
    'cond-mat.stat-mech': 'Physics',
    'hep-th gr-qc': 'Physics',
    'physics.chem-ph': 'Physics/Chemistry',
    'hep-ex': 'Physics',
    'cs.CY': 'Computer Science',
    'math.GR': 'Mathematics',
    'cs.LG cs.CV': 'Computer Science',
    'math.LO': 'Mathematics',
    'math.GT': 'Mathematics',
    'cond-mat.supr-con': 'Physics',
    'physics.plasm-ph': 'Physics',
    'cs.LO': 'Computer Science',
    'econ.GN q-fin.EC': 'Economics',
    'math.ST stat.TH': 'Mathematics/Statistics',
    'astro-ph.IM': 'Astronomy',
    'math.CA': 'Mathematics',
    'cs.CV eess.IV': 'Computer Science/Electrical Engineering',
    'stat.AP': 'Statistics',
    'eess.IV cs.CV cs.LG': 'Electrical Engineering/Computer Science',
    'math.RA': 'Mathematics',
    'math.RT': 'Mathematics',
    'physics.app-ph': 'Physics',
    'math-ph math.MP': 'Mathematics/Physics',
    'cond-mat.mes-hall cond-mat.mtrl-sci': 'Physics',
    'nucl-th': 'Physics',
    'cs.RO cs.SY eess.SY': 'Computer Science/Electrical Engineering',
    'cs.GT': 'Computer Science',
    'physics.soc-ph': 'Physics',
    'cs.RO cs.AI': 'Computer Science',
    'eess.AS cs.SD': 'Electrical Engineering/Computer Science',
    'math.CV': 'Mathematics',
    'cond-mat.mtrl-sci cond-mat.mes-hall': 'Physics',
    'physics.optics physics.app-ph': 'Physics',
    'cs.CV cs.RO': 'Computer Science',
    'eess.IV': 'Electrical Engineering',
    'astro-ph.SR astro-ph.GA': 'Astronomy',
    'astro-ph.GA astro-ph.CO': 'Astronomy',
    'cs.SD eess.AS': 'Computer Science/Electrical Engineering',
    'math.AC': 'Mathematics',
    'hep-ph nucl-th': 'Physics',
    'cs.AI cs.LG': 'Computer Science',
    'math.AT': 'Mathematics',
    'stat.ME stat.AP': 'Statistics',
    'econ.EM': 'Economics',
    'cs.CE': 'Computer Science',
    'cond-mat.str-el cond-mat.mtrl-sci': 'Physics',
    'cs.CV cs.GR': 'Computer Science',
    'cs.LG cs.AI stat.ML': 'Computer Science',
    'quant-ph physics.optics': 'Physics',
    'cs.LG cs.AI cs.CV': 'Computer Science',
    'cs.SI': 'Computer Science',
    'math.GM': 'Mathematics',
    'astro-ph.GA astro-ph.SR': 'Astronomy',
    'cond-mat.quant-gas': 'Physics',
    'physics.ins-det': 'Physics',
    'math.OC cs.SY eess.SY': 'Mathematics/Computer Science/Electrical Engineering',
    'cs.LG cs.AI cs.CL': 'Computer Science',
    'cs.DB': 'Computer Science',
    'cs.RO cs.CV': 'Computer Science',
    'physics.atom-ph': 'Physics',
    'econ.TH': 'Economics',
    'cs.LG cs.CR': 'Computer Science',
    'astro-ph.EP astro-ph.SR': 'Astronomy',
    'cs.CR cs.LG': 'Computer Science',
    'astro-ph.HE astro-ph.GA': 'Astronomy/Physics',
    'hep-ph hep-th': 'Physics',
    'cs.LG cs.CL': 'Computer Science',
    'math.CO cs.DM': 'Mathematics/Computer Science',
    'cs.AR': 'Computer Science',
    'cs.NE': 'Computer Science',
    'math.PR math-ph math.MP': 'Mathematics/Physics',
    'cond-mat.mes-hall quant-ph': 'Physics',
    'physics.med-ph': 'Physics',
    'math.AP math-ph math.MP': 'Mathematics/Physics',
    'astro-ph.HE astro-ph.SR': 'Astronomy',
    'quant-ph math-ph math.MP': 'Physics',
    'physics.gen-ph': 'Physics',
    'astro-ph.CO gr-qc': 'Astronomy/Physics',
    'cs.HC cs.AI': 'Computer Science',
    'hep-ph astro-ph.CO': 'Physics',
    'cs.PL': 'Computer Science',
    'q-bio.NC': 'Biology',
    'cond-mat.supr-con cond-mat.str-el': 'Physics',
    'cs.AI cs.CL': 'Computer Science',
    'cs.LG math.OC': 'Computer Science/Mathematics',
    'math.NT math.AG': 'Mathematics',
    'quant-ph cond-mat.mes-hall': 'Physics',
    'cs.CR cs.AI': 'Computer Science',
    'astro-ph.CO astro-ph.GA': 'Astronomy',
    'cs.CV cs.CL': 'Computer Science',
    'cs.CL cs.SD eess.AS': 'Computer Science/Electrical Engineering',
    'physics.geo-ph': 'Physics',
    'hep-th hep-ph': 'Physics'
}

# Assuming `filtered_df` is your DataFrame
# Map the categories column to the broader fields
filtered_df['broader_field'] = filtered_df['categories'].apply(lambda x: category_map.get(x, 'Other'))

# Display the updated DataFrame



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['broader_field'] = filtered_df['categories'].apply(lambda x: category_map.get(x, 'Other'))


In [None]:
filtered_df.head()

Unnamed: 0,title,abstract,update_date,authors_parsed,categories,submitter,journal-ref,broader_field
1,The Veldkamp Space of Two-Qubits,Given a remarkable representation of the gen...,2024-02-13,"[[Saniga, Metod, , ASTRINSTSAV], [Planat, Mich...",quant-ph math-ph math.MP,Metod Saniga,"SIGMA 3 (2007) 075, 7 pages",Physics
3,Axiom A polynomial skew products of C^2 and th...,A polynomial skew product of C^2 is a map of...,2023-08-14,"[[DeMarco, Laura, ], [Hruska, Suzanne Lynch, ]]",math.DS,Suzanne Lynch Hruska,"Ergodic Theory and Dynamical Systems, Volume 2...",Mathematics
8,"Quasi Ordinary Singularities, Essential Diviso...",We define Poincar\'e series associated to a ...,2024-05-01,"[[Perez, Pedro Daniel Gonzalez, ], [Hernando, ...",math.AG,Pedro Daniel Gonzalez Perez,"Journal of the London Mathematical Society, 79...",Mathematics
9,Third Order Newton's Method for Zernike Polyno...,The Zernike radial polynomials are a system ...,2024-04-23,"[[Mathar, Richard J., ]]",math.NA cs.NA,Richard J. Mathar,,Mathematics
12,Quantum Properties and Gravitational Field of ...,We find that a field with oscillations of ma...,2024-04-22,"[[Yau, Hou Y., ]]",physics.gen-ph,Hou Yau,,Physics


In [None]:
filtered_df["broader_field"].value_counts()

Computer Science                                       116237
Physics                                                 69176
Mathematics                                             52822
Astronomy/Physics                                       13042
Astronomy                                                9586
Electrical Engineering/Computer Science                  9261
Statistics                                               5074
Electrical Engineering                                   4458
Computer Science/Electrical Engineering                  3502
Computer Science/Mathematics                             2854
Economics                                                2707
Mathematics/Physics                                      2261
Computer Science/Statistics                              1940
Computer Science/Mathematics/Electrical Engineering      1889
Physics/Chemistry                                        1412
Mathematics/Statistics                                   1227
Mathemat

In [None]:
filtered_df.to_csv('filtered_data.csv', index=False)


(299362, 8)

In [None]:
filtered_df.to_csv('filtered_data.csv', index=False)
