In [1]:
import pandas as pd
import json

In [4]:
cols = ['id', 'title', 'authors','abstract', 'categories']
data = []
file_name = 'arxiv-metadata-oai-snapshot.json'
topics = ['cs.AI', 'cs.CV', 'cs.IR', 'cs.LG', 'cs.CL','cs.DB','cs.DC','cs.GT','cs.IT','cs.MA']
category_map = {
    'cs.AI': 'Artificial Intelligence',
    'cs.CL': 'Computation and Language',
    'cs.CV': 'Computer Vision and Pattern Recognition',
    'cs.DB': 'Databases',
    'cs.DC': 'Distributed, Parallel, and Cluster Computing',
    'cs.GT': 'Computer Science and Game Theory',
    'cs.IR': 'Information Retrieval',
    'cs.IT': 'Information Theory',
    'cs.LG': 'Machine Learning',
    'cs.MA': 'Multiagent Systems'
}

In [9]:
def clean_text(x):
    # Replace newline characters with a space
    new_text = x.replace("\n", " ")
    # Remove leading and trailing spaces
    new_text = new_text.strip()
    return new_text

In [10]:
with open(file_name, encoding='latin-1') as f:
    for line in f:
        doc = json.loads(line)
        doc_categories = doc['categories'].split()
        if all(category in topics for category in doc_categories):
            mapped_categories = [category_map[category] for category in doc_categories]
            mapped_categories_str = ', '.join(mapped_categories)
            lst = [doc['id'], doc['title'],doc['authors'], doc['abstract'], mapped_categories_str]
            data.append(lst)

df_data = pd.DataFrame(data=data, columns=cols)

In [11]:
print(df_data.shape)

(402586, 5)


In [12]:
df_data['title'] = df_data['title'].apply(clean_text)
df_data['abstract'] = df_data['abstract'].apply(clean_text)

In [13]:
df_data.head()

Unnamed: 0,id,title,authors,abstract,categories
0,704.1267,Text Line Segmentation of Historical Documents...,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",There is a huge amount of historical documents...,Computer Vision and Pattern Recognition
1,704.1274,Parametric Learning and Monte Carlo Optimization,David H. Wolpert and Dev G. Rajnarayan,This paper uncovers and explores the close rel...,Machine Learning
2,704.1394,Calculating Valid Domains for BDD-Based Intera...,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",In these notes we formally describe the functi...,Artificial Intelligence
3,704.1409,Preconditioned Temporal Difference Learning,Yao HengShuai,This paper has been withdrawn by the author. T...,"Machine Learning, Artificial Intelligence"
4,704.1827,Transaction-Oriented Simulation In Ad Hoc Grids,Gerald Krafft,This paper analyses the possibilities of perfo...,"Distributed, Parallel, and Cluster Computing"


In [14]:
df_data.describe()

Unnamed: 0,id,title,authors,abstract,categories
count,402586,402586,402586,402586,402586
unique,201293,201188,188348,201231,340
top,cs/9912017,Evidence-based Factual Error Correction,Peter D. Turney (National Research Council of ...,"Designing and implementing efficient, provably...",Computer Vision and Pattern Recognition
freq,2,4,50,4,121484


In [22]:
df_data['prepared_text'] = df_data['title'] + ' ' + df_data['categories'] + ' ' + df_data['abstract']

In [24]:
df_data.head()

Unnamed: 0,id,title,authors,abstract,categories,prepared_text
0,704.1267,Text Line Segmentation of Historical Documents...,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...
1,704.1274,Parametric Learning and Monte Carlo Optimization,David H. Wolpert and Dev G. Rajnarayan,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...
2,704.1394,Calculating Valid Domains for BDD-Based Intera...,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...
3,704.1409,Preconditioned Temporal Difference Learning,Yao HengShuai,This paper has been withdrawn by the author. T...,"Machine Learning, Artificial Intelligence",Preconditioned Temporal Difference Learning Ma...
4,704.1827,Transaction-Oriented Simulation In Ad Hoc Grids,Gerald Krafft,This paper analyses the possibilities of perfo...,"Distributed, Parallel, and Cluster Computing",Transaction-Oriented Simulation In Ad Hoc Grid...


In [27]:
df_data.to_pickle('preprocessed_data.zip')