## Imports

In [1]:
from src.data_processing import get_and_split_data, process_and_format_docs_to_ids
import numpy as np
import os
import pickle
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\bxchi\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bxchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bxchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Setup

In [2]:
# Reproducibility: one seed value shared by every random number generator
# used in this notebook (it is also passed to the data split further down).
seed = 42

# Python's built-in `random` module and NumPy's global RNG.
random.seed(seed)
np.random.seed(seed)

# PyTorch seeding calls (global, CUDA, and all-CUDA variants).
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# cuDNN backend flags: deterministic on, benchmark off, and the backend
# itself switched off.
cudnn = torch.backends.cudnn
cudnn.deterministic = True
cudnn.benchmark = False
cudnn.enabled = False

## Getting the Data (Reuters)

In [3]:
# Experiment configuration: edit these values to change how many documents
# are used and how they are divided between the splits.
DATASET_SIZE = 5_000  # passed to get_and_split_data as full_dataset_size
TRAIN_SPLIT = 0.8     # passed as train_split
VAL_SPLIT = 0.1       # passed as val_split

In [4]:
# Build the document splits from the configured size, split fractions and seed.
docs_splits = get_and_split_data(
    full_dataset_size=DATASET_SIZE,
    train_split=TRAIN_SPLIT,
    val_split=VAL_SPLIT,
    seed=seed,
)

Total number of documents: 10788
New train size: 4000
New validation size: 500
New test size: 500


## Data Processing

### Tokenisation by Word

In [5]:
# Word-level variant: run the shared processing helper with using_subword=False.
(
    word_tokenised_numericalised_docs,
    word_tokenisation_train_vocab,
) = process_and_format_docs_to_ids(
    docs_splits=docs_splits,
    using_subword=False,
)

Training vocabulary size: 22087


### Tokenisation by Subword

In [6]:
# Subword-level variant: same helper and same splits, with using_subword=True.
(
    subword_tokenised_numericalised_docs,
    subword_tokenisation_train_vocab,
) = process_and_format_docs_to_ids(
    docs_splits=docs_splits,
    using_subword=True,
)

Training vocabulary size: 16079


## Save Data

In [7]:
# Persist the outputs of both tokenisation variants (numericalised documents
# and training vocabulary) as pickle files inside the local "data" folder.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)  # no error if the folder already exists

# Output filename -> object to serialise. Driving the writes from a single
# table replaces four copy-pasted `with open(...)` blocks; the filenames are
# plain strings (the previous f-prefixes had no placeholders) and are
# otherwise unchanged.
artifacts = {
    "word_tokenisation_reuters_data.pkl": word_tokenised_numericalised_docs,
    "word_tokenisation_reuters_train_vocab.pkl": word_tokenisation_train_vocab,
    "subword_tokenisation_reuters_data.pkl": subword_tokenised_numericalised_docs,
    "subword_tokenisation_reuters_train_vocab.pkl": subword_tokenisation_train_vocab,
}

for filename, payload in artifacts.items():
    # os.path.join of two plain components is already normalised, so no
    # extra normpath call is needed.
    with open(os.path.join(DATA_DIR, filename), "wb") as f:
        pickle.dump(payload, f)