In [None]:
import os
import time
from urllib.request import urlopen

import numpy as np
import pandas as pd
import xmltodict
from tqdm import tqdm

In [None]:
# arXiv category codes of interest, as used in the API's `cat:` search query.
categories = ["cs.ai", "cs.cv", "cs.lg", "cs.cl", "stat.ml"]

In [None]:

def arxiv_abstracts(
    category: str, start_index: int = 0, max_results: int = 100
) -> pd.DataFrame:
    """
    Fetch one page of abstracts for an arXiv category from the export API.

    Results are requested newest-first (``sortBy=submittedDate``,
    ``sortOrder=descending``).

    Parameters
    ----------
    category : str
        Category code placed in the ``cat:`` search query, e.g. ``"cs.lg"``.
    start_index : int, default 0
        Offset of the first result to request.
    max_results : int, default 100
        Maximum number of results to request.

    Returns
    -------
    pd.DataFrame
        One row per returned entry with columns ``publish_date`` and
        ``abstract`` (newlines in the abstract replaced by spaces) and a
        fresh 0..n-1 index. When the feed has no entries, an empty frame
        with the same two columns is returned and a notice is printed.

    Raises
    ------
    Any exception raised by ``urlopen`` (network/HTTP failures) or by the
    XML parser propagates to the caller.
    """
    url = (
        f'http://export.arxiv.org/api/query?'
        f'search_query=cat:{category}&'
        f'start={start_index}&'
        f'max_results={max_results}&'
        f'sortBy=submittedDate&'
        f'sortOrder=descending'
    )
    print(url)
    # Context manager guarantees the HTTP response is closed after reading.
    with urlopen(url) as response:
        xml_data = response.read().decode('utf-8')
    data = xmltodict.parse(xml_data)

    entries = data["feed"].get("entry")
    if not entries:
        print(f"Empty df: {category}, {start_index}, {max_results}")
        entries = []
    elif isinstance(entries, dict):
        # xmltodict represents a lone <entry> element as a single mapping
        # rather than a one-element list; normalise so the loop below
        # always iterates over entries, never over a mapping's keys.
        entries = [entries]

    # Build all rows first and construct the frame once, instead of
    # re-concatenating a growing DataFrame for every entry.
    records = [
        {
            "publish_date": entry["published"],
            "abstract": entry["summary"].replace("\n", " "),
        }
        for entry in entries
    ]
    return pd.DataFrame(records, columns=["publish_date", "abstract"])
    

In [None]:
# Categories scraped by this run.
categories = [
    "cs.cv",
    "cs.lg",
    "stat.ml"
]
max_results = 250  # page size requested per API call
# Range of result offsets to page through.
# NOTE(review): the non-zero start presumably resumes an earlier partial
# scrape — confirm before re-running from scratch.
start_offset = 47_000
stop_offset = 200_000
# arXiv's API manual asks for a 3 second pause between consecutive requests.
request_delay = 3

# to_csv does not create missing parent directories.
os.makedirs("data", exist_ok=True)

for start_index in tqdm(range(start_offset, stop_offset, max_results)):
    for c in categories:
        df = arxiv_abstracts(c, start_index, max_results)
        # Append without a header so repeated pages accumulate in one file
        # per category; re-running this cell appends the same rows again.
        df.to_csv(f'data/abstracts.{c}.csv', mode='a', header=False, index=False)
        time.sleep(request_delay)

# Concatenate datasets

In [None]:
# Load every per-category CSV (the files were written without a header row,
# so column names are supplied here) and stack them in a single concat.
# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels, and no empty seed frame is needed.
df = pd.concat(
    [
        pd.read_csv(f"data/abstracts.{c}.csv", names=["publish_date", "abstract"])
        for c in categories
    ],
    ignore_index=True,
)

In [None]:
# (rows, columns) of the combined frame before duplicates are dropped.
df.shape

In [None]:
# Remove fully identical rows, keeping the first occurrence of each.
# Duplicates may come from the rough scraping process itself, or from
# arXiv storing the same record in more than one form.
df = df.drop_duplicates(subset=None, keep="first")

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Total number of rows available for splitting.
n = len(df)

In [None]:
# 80/10/10 split: train and validation sizes are truncated to integers and
# the test split takes whatever remains, so the three always sum to n.
num_train, num_val = int(0.8 * n), int(0.1 * n)
num_test = n - (num_train + num_val)

In [None]:
# Display the computed split sizes as (train, validation, test).
num_train, num_val, num_test

In [None]:
# Shuffle all rows before the positional split below. A fixed seed makes
# the resulting train/val/test assignment reproducible across runs.
SEED = 42
df = df.sample(frac=1, random_state=SEED)

In [None]:
# Display the shuffled frame for a visual sanity check.
df

In [None]:
# Cut the shuffled frame positionally into contiguous train / validation /
# test parts using the sizes computed earlier.
val_end = num_train + num_val
train_df = df.iloc[:num_train]
val_df = df.iloc[num_train:val_end]
test_df = df.iloc[val_end:]

In [None]:
# Display the (rows, columns) shape of each of the three splits.
train_df.shape, val_df.shape, test_df.shape

In [None]:
# Persist each split to its own CSV (with a header row, without the index).
for split_df, out_path in (
    (train_df, "data/abstracts_train.csv"),
    (val_df, "data/abstracts_val.csv"),
    (test_df, "data/abstracts_test.csv"),
):
    split_df.to_csv(out_path, index=False)