In [327]:
import numpy as np
import pandas as pd
from urllib.request import urlopen
import xmltodict
import time
from tqdm import tqdm

In [328]:
categories = [
    "cs.ai",
    "cs.cv",
    "cs.lg",
    "cs.cl",
    "stat.ml"
]

In [336]:

def arxiv_abstracts(
    category: str, start_index: int = 0, max_results: int = 100
) -> pd.DataFrame:
    """
    Scrapes arxiv for abstracts.
    """
    url = (
        f'http://export.arxiv.org/api/query?'
        f'search_query=cat:{category}&'
        f'start={start_index}&'
        f'max_results={max_results}&'
        f'sortBy=submittedDate&'
        f'sortOrder=descending'
    )
    print(url)
    raw_data = urlopen(url)
    xml_data = raw_data.read().decode('utf-8')
    data = xmltodict.parse(xml_data)
    
    df = pd.DataFrame(columns=["publish_date", "abstract"])
    abstracts = []
    entries = data["feed"].get("entry")
    if entries:
        for entry in entries:
            publish_date = entry["published"]
            summary = entry["summary"].replace("\n", " ")
            df = pd.concat([
                df, 
                pd.DataFrame({
                    "publish_date": [publish_date], "abstract": [summary]
                })
            ])
    else:
        print(f"Empty df: {category}, {start_index}, {max_results}")
        
    return df.reset_index(drop=True)
    

In [337]:
categories = [
    "cs.cv",
    "cs.lg",
    "stat.ml"
]
max_results = 250
for start_index in tqdm(range(47_000, 200_000, max_results)):
    for c in categories:
        df = arxiv_abstracts(c, start_index, max_results)
        df.to_csv(f'data/part2/abstracts.{c}.csv', mode='a', header=False, index=False)
        time.sleep(2)

  0%|                                                                                             | 0/612 [00:00<?, ?it/s]

http://export.arxiv.org/api/query?search_query=cat:cs.cv&start=47000&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.cv, 47000, 250
http://export.arxiv.org/api/query?search_query=cat:cs.lg&start=47000&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.lg, 47000, 250
http://export.arxiv.org/api/query?search_query=cat:stat.ml&start=47000&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: stat.ml, 47000, 250


  0%|▏                                                                                  | 1/612 [00:07<1:14:55,  7.36s/it]

http://export.arxiv.org/api/query?search_query=cat:cs.cv&start=47250&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.cv, 47250, 250
http://export.arxiv.org/api/query?search_query=cat:cs.lg&start=47250&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.lg, 47250, 250
http://export.arxiv.org/api/query?search_query=cat:stat.ml&start=47250&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: stat.ml, 47250, 250


  0%|▎                                                                                  | 2/612 [00:14<1:14:01,  7.28s/it]

http://export.arxiv.org/api/query?search_query=cat:cs.cv&start=47500&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.cv, 47500, 250
http://export.arxiv.org/api/query?search_query=cat:cs.lg&start=47500&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.lg, 47500, 250
http://export.arxiv.org/api/query?search_query=cat:stat.ml&start=47500&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: stat.ml, 47500, 250


  0%|▍                                                                                  | 3/612 [00:22<1:15:44,  7.46s/it]

http://export.arxiv.org/api/query?search_query=cat:cs.cv&start=47750&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.cv, 47750, 250
http://export.arxiv.org/api/query?search_query=cat:cs.lg&start=47750&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.lg, 47750, 250
http://export.arxiv.org/api/query?search_query=cat:stat.ml&start=47750&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: stat.ml, 47750, 250


  1%|▌                                                                                  | 4/612 [00:30<1:18:22,  7.74s/it]

http://export.arxiv.org/api/query?search_query=cat:cs.cv&start=48000&max_results=250&sortBy=submittedDate&sortOrder=descending
Empty df: cs.cv, 48000, 250
http://export.arxiv.org/api/query?search_query=cat:cs.lg&start=48000&max_results=250&sortBy=submittedDate&sortOrder=descending


  1%|▌                                                                                  | 4/612 [00:39<1:38:50,  9.75s/it]


KeyboardInterrupt: 

In [333]:
url

# Concatenate datasets

In [315]:
df = pd.DataFrame(columns=["publish_date", "abstract"])
for c in categories:
    temp = pd.read_csv(f"data/abstracts.{c}.csv", names=["publish_date", "abstract"])
    df = pd.concat([df, temp])

In [316]:
df.shape

(189832, 2)

In [317]:
# drop duplicate rows (might have accidentally gathered duplicates 
# during the rough scraping process, or arxiv stores duplicates in some form)
df = df.drop_duplicates()

In [318]:
df.shape

(149500, 2)

In [319]:
n = df.shape[0]

In [320]:
num_train = int(0.8 * n)
num_val = int(0.1 * n)
num_test = n - num_train - num_val

In [321]:
num_train, num_val, num_test

(119600, 14950, 14950)

In [322]:
df = df.sample(frac=1)

In [323]:
df

Unnamed: 0,publish_date,abstract
23449,2017-09-06T18:09:15Z,Sarcasm occurring due to the presence of numer...
43514,2015-03-14T19:43:30Z,We devise a one-shot approach to distributed s...
12028,2020-06-10T17:43:43Z,Particle filtering is a popular method for inf...
12015,2021-04-05T06:39:12Z,Understanding the context of complex and clutt...
14262,2020-01-31T07:50:25Z,Conventional Generative Adversarial Networks (...
...,...,...
36061,2004-02-09T11:03:20Z,The fixpoint completion fix(P) of a normal log...
15621,2021-05-17T13:19:23Z,The appeal of serverless (FaaS) has triggered ...
13613,2020-03-25T19:24:05Z,The dominant approaches to text representation...
25418,2020-03-30T02:52:40Z,Semi-supervised learning (SSL) has been extens...


In [324]:
train_df = df.iloc[:num_train, :]
val_df = df.iloc[num_train:(num_train + num_val), :]
test_df = df.iloc[(num_train + num_val):, :]

In [325]:
train_df.shape, val_df.shape, test_df.shape

((119600, 2), (14950, 2), (14950, 2))

In [326]:
train_df.to_csv("data/abstracts_train.csv", index=False)
val_df.to_csv("data/abstracts_val.csv", index=False)
test_df.to_csv("data/abstracts_test.csv", index=False)