In [1]:
!pip install git+git://github.com/huggingface/transformers.git

Collecting git+git://github.com/huggingface/transformers.git
  Cloning git://github.com/huggingface/transformers.git to /tmp/pip-req-build-xtvdb9ro
  Running command git clone -q git://github.com/huggingface/transformers.git /tmp/pip-req-build-xtvdb9ro
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... done
  Created wheel for transformers: filename=transformers-2.11.0-cp36-none-any.whl size=675468 sha256=afed651aaa59cdc0b33bd139c1cbcff9abf85e703d92de11cdd06aece0a2f629
  Stored in directory: /tmp/pip-ephem-wheel-cache-ff5qze3h/wheels/85/2f/0c/d60f3471df1c6c05cbd5fcfaeb0d3778182cd35c12ecae98fa
Successfully built transformers


In [0]:
import numpy as np
import pandas as pd
import random
# Seed Python's `random` module. NOTE(review): nothing in the cells visible
# here draws from `random` (or uses `np`) -- confirm whether they are needed.
random.seed(42)
from datetime import datetime
import re

# Labels
# Marker strings that read_datasets() inserts around each paper's text:
# `startoftext` goes in front of the title, `sep` right after the title, and
# `endoftext` is appended to the abstract.
startoftext = "<|startoftext|>"
endoftext = "<|endoftext|>"
sep = "<|sep|>"


def read_datasets(filename):
    """Read one tab-separated dataset file and build one record per row.

    Args:
        filename: Path of the .tsv file. It must provide the columns
            "title", "abstract", "created" (formatted as YYYY-MM-DD) and
            "arxiv_id".

    Returns:
        A list of (title, abstract, arxiv_id, created) tuples, one per row:
        the title wrapped between the start-of-text and separator labels,
        the abstract with the end-of-text label appended, the arXiv id
        reduced to its digits, and the creation date parsed into a datetime.

    Raises:
        ValueError: If a "created" value does not match YYYY-MM-DD.
    """
    # Force the id column to text. Ids that are purely numeric (for example
    # "1706.03760") would otherwise be inferred as floats, which makes
    # re.sub() raise TypeError and silently drops trailing zeros.
    dataset = pd.read_csv(filename, delimiter="\t", dtype={"arxiv_id": str})
    titles = [f"{startoftext} {title} {sep}" for title in dataset["title"]]
    abstracts = [abst + endoftext for abst in dataset["abstract"]]
    date = [datetime.strptime(i, "%Y-%m-%d") for i in dataset["created"]]
    # Strip every non-digit character from the id; merge_datasets() uses the
    # result as its de-duplication key.
    arxiv_id = [re.sub("[^0-9]", "", i) for i in dataset["arxiv_id"]]
    return list(zip(titles, abstracts, arxiv_id, date))


def merge_datasets(filenames=("cs.AI.tsv", "cs.LG.tsv", "cs.CL.tsv", "cs.CV.tsv")):
    """Load several dataset files, drop repeated papers, and sort by date.

    Args:
        filenames: Paths of the .tsv files to merge, in priority order.
            Defaults to the four category files this notebook was written
            for.

    Returns:
        A list of (title, abstract, arxiv_id, created) tuples sorted by
        creation date, oldest first. When an arxiv_id occurs more than once,
        only its first occurrence (following `filenames` order) is kept.
    """
    seen_ids = set()
    filtered_data = []
    for filename in filenames:
        for record in read_datasets(filename):
            paper_id = record[-2]  # records end with (..., arxiv_id, created)
            if paper_id not in seen_ids:
                seen_ids.add(paper_id)
                filtered_data.append(record)
    # sorted() is stable, so records sharing a date keep their load order.
    return sorted(filtered_data, key=lambda record: record[-1])


def split_datasets(data, eval_size=9880, valid_test_ratio=0.5,
                   expected_sizes=(90000, 4940, 4940)):
    """Split date-sorted records into train, valid, and test sets.

    The last `eval_size` records are held out and everything before them is
    used for training. The held-out slice is then cut into a validation part
    followed by a test part.

    Args:
        data: A sequence of records that supports len() and slicing, sorted
            by date.
        eval_size: Number of trailing records held out for evaluation.
        valid_test_ratio: Fraction of the held-out records assigned to the
            validation set; the remainder becomes the test set.
        expected_sizes: (train, valid, test) lengths to verify, or None to
            skip the check. The default matches the 99,880-record corpus
            this notebook was built for.

    Returns:
        A (train, valid, test) tuple of slices of `data`.

    Raises:
        AssertionError: If `expected_sizes` is given and a split has a
            different length.
    """
    # Slice from the front: data[:-eval_size] would yield an empty training
    # set when eval_size is 0.
    split_at = max(len(data) - eval_size, 0)
    train_text = data[:split_at]
    eval_text = data[split_at:]
    n_valid = int(len(eval_text) * valid_test_ratio)
    valid_text = eval_text[:n_valid]
    test_text = eval_text[n_valid:]
    if expected_sizes is not None:
        expected_train, expected_valid, expected_test = expected_sizes
        assert len(train_text) == expected_train
        assert len(valid_text) == expected_valid
        assert len(test_text) == expected_test
    return (train_text, valid_text, test_text)


def write_datasets(data, name):
    """Write one dataset split to "<name>.txt" as UTF-8 text.

    Each record is written as its first two items joined by a space and
    followed by a blank line. An existing file of the same name is
    overwritten. A completion message is printed afterwards.

    Args:
        data: An iterable of records whose first two items are the formatted
            title and abstract; any further items are ignored.
        name: Output path without the ".txt" extension.
    """
    # Pin the encoding so the output does not depend on the platform default
    # (non-ASCII text could otherwise fail to encode). The with-block closes
    # the file, so no explicit close() is needed.
    with open(name + ".txt", "w", encoding="utf-8") as f:
        f.writelines(f"{d[0]} {d[1]}\n\n" for d in data)
    print(f"{name} file completed.")

In [0]:
# Load the four category files, drop records whose arXiv id was already seen,
# and sort what remains by creation date.
data = merge_datasets()

In [4]:
# Cut the date-sorted records into the three splits, then dump each split to
# "<split>.txt" in train / valid / test order.
train, valid, test = split_datasets(data)
for _split_name, _split_rows in (("train", train), ("valid", valid), ("test", test)):
    write_datasets(_split_rows, _split_name)

train file completed.
valid file completed.
test file completed.


In [5]:
# Report the number of records in each split (train, valid, test).
for _split in (train, valid, test):
    print(len(_split))

90000
4940
4940


In [0]:
from transformers import GPT2Tokenizer

In [0]:
# Instantiate the GPT-2 tokenizer from the pretrained "gpt2" checkpoint name;
# it is used below to count tokens in the written split files.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [8]:
# Print the GPT-2 token count of each split file (train, valid, test).
for _split_name in ("train", "valid", "test"):
    # Use a context manager so each file handle is closed after reading, and
    # read explicitly as UTF-8 rather than the platform default encoding.
    with open(f"{_split_name}.txt", "r", encoding="utf-8") as _corpus_file:
        print(len(tokenizer.tokenize(_corpus_file.read())))

20834012
1195056
1218754
