## Imports

In [None]:
from data_processing import get_data, split_data, process_data, make_train_vocab, convert_tokens_to_ids
import numpy as np
import os
import pickle
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [None]:
# Fix a single seed and hand it to every random number generator used here
# (Python's `random`, NumPy, and PyTorch on CPU and CUDA)
seed = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# cuDNN settings: switch the backend off, and also disable its benchmarking
# mode and request deterministic behaviour
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Getting the Data (WikiText2)

In [4]:
# Get raw dataset: load the WikiText2 splits through the project's get_data helper.
# NOTE(review): the per-split sizes shown under this cell are presumably printed
# by get_data itself — confirm in data_processing.
docs_splits = get_data()

Size of train: 36718
Size of validation: 3760
Size of test: 4358
Total dataset size: 44836


In [5]:
# Target size of the resampled dataset and the fractions of it given to the
# train and validation splits; these are passed to split_data in the next cell
# (the remaining fraction presumably goes to the test split)
DATASET_SIZE = 37_500
TRAIN_SPLIT, VAL_SPLIT = 0.8, 0.1

In [6]:
# Re-split the raw data to the configured total size and train/validation fractions
corrected_docs_splits = split_data(
    docs_splits=docs_splits,
    full_dataset_size=DATASET_SIZE,
    train_split=TRAIN_SPLIT,
    val_split=VAL_SPLIT,
)

New train size: 30000
New validation size: 3750
New test size: 3750


## Data Processing

In [8]:
# Run the project's processing step over the re-split documents.
# The result is indexed by split name ("train" is read when building the vocabulary).
# NOTE(review): presumably this is where tokenisation happens — confirm in data_processing.
tokenised_docs = process_data(corrected_docs_splits=corrected_docs_splits)

## Numericalisation of Tokens

In [10]:
# Build the vocabulary from the training split only
train_vocab = make_train_vocab(
    train_docs=tokenised_docs["train"],
)

# Map the tokens of every split (train, val and test) to ids using that vocabulary
converted_tokenised_docs = convert_tokens_to_ids(
    tokenised_docs=tokenised_docs,
    train_vocab=train_vocab,
)

Training vocabulary size: 58177


## Save Data

In [None]:
# Create the data folder (one level above the notebook's working directory).
# The path is built once and reused for both output files.
data_dir = os.path.normpath(os.path.join("..", "data"))
os.makedirs(data_dir, exist_ok=True)

# Save the ids of the tokenised documents
with open(os.path.join(data_dir, "wikitext2_data.pkl"), "wb") as f:
    pickle.dump(converted_tokenised_docs, f)

# Save the training vocabulary
with open(os.path.join(data_dir, "wikitext2_train_vocab.pkl"), "wb") as f:
    pickle.dump(train_vocab, f)