# Simple Recurrent Language Model

Predicting the next token.

# Imports and Setup

Common imports and standardized code for importing the relevant data, models, etc., in order to minimize copy-paste/typo errors.


Set the relevant text field (`'abstract'` or `'title'`) and whether we are working with `'one-hot'` or `'tokenized'` text.  

In [1]:
TEXT_FIELD = 'abstract'
TEXT_ENCODING = 'one-hot'
assert TEXT_FIELD in ('abstract', 'title'), 'TEXT_FIELD must be one of "title" or "abstract".'
assert TEXT_ENCODING in ('one-hot', 'tokenized'), 'TEXT_ENCODING must be one of "one-hot" or "tokenized".'
# The above choices determine the relevant sequence length of the data.
SEQ_LEN = 512 if TEXT_ENCODING == 'tokenized' else 1024

Imports and colab setup

In [2]:
%%capture import_capture --no-stder
# Jupyter magic methods
# For auto-reloading when external modules are changed
%load_ext autoreload
%autoreload 2
# For showing plots inline
%matplotlib inline

# pip installs needed in Colab for arxiv_vixra_models
!pip install wandb
!pip install pytorch-lightning
!pip install unidecode
# Update sklearn
!pip uninstall scikit-learn -y
!pip install -U scikit-learn

from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option(u'float_format', '{:f}'.format)
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import seaborn as sns
import torch
import wandb

`wandb` log in:

In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mgarrett361[0m (use `wandb login --relogin` to force relogin)


True

Google drive access

In [5]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Enter the relevant foldername
FOLDERNAME = '/content/drive/My Drive/ML/arxiv_vixra'
assert FOLDERNAME is not None, "[!] Enter the foldername."
# For importing modules stored in FOLDERNAME or a subdirectory thereof:
import sys
sys.path.append(FOLDERNAME)

Mounted at /content/drive


Import my models, loaders, and utility functions:

In [6]:
import arxiv_vixra_models as avm

Set the model, datamodule, and text utils to be instantianted in the notebook

In [7]:
notebook_model = avm.LitOneHotCharRNNNextLM
notebook_datamodule = avm.OneHotCharDataModuleNextLM
notebook_encoder = avm.str_to_one_hot 
notebook_decoder = avm.one_hot_to_str 
notebook_wandb_callback = avm.WandbTextGenerationCallback

Copy data to cwd for speed.

In [8]:
train_data_file_name = 'large_filtered_normalized_data_train.feather'
val_data_file_name = 'balanced_filtered_normalized_data_validation.feather'
SUBDIR = '/data/data_splits/'
train_data_path = FOLDERNAME + SUBDIR + train_data_file_name
val_data_path = FOLDERNAME + SUBDIR + val_data_file_name
if TEXT_ENCODING == 'one-hot':
    tokens_file_name = 'normalized_char_set.feather'
else:
    tokens_file_name = 'balanced_title_normalized_vocab.feather'
tokens_path = FOLDERNAME + SUBDIR + tokens_file_name
!cp '{train_data_path}' .
!cp '{val_data_path}' .
!cp '{tokens_path}' .
train_data_df = pd.read_feather(train_data_file_name)
val_data_df = pd.read_feather(val_data_file_name)
tokens_df = pd.read_feather(tokens_file_name)
if TEXT_ENCODING == 'one-hot':
    text_to_idx = dict(zip(tokens_df.char.values, np.arange(len(tokens_df))))
else:
    # 0 and 1 are reserved for padding and <UNK> for embeddings and not included
    # in tokens_df
    text_to_idx = dict(zip(tokens_df.word.values, np.arange(2, len(tokens_df) + 2)))
    text_to_idx['<PAD>'] = 0
    text_to_idx['<UNK>'] = 1
idx_to_text = {val: key for key, val in text_to_idx.items()}
if TEXT_FIELD == 'title':
    train_text_file_name = 'concatenated_large_normalized_train_title.txt'
    val_text_file_name = 'concatenated_balanced_normalized_validation_title.txt'
else:
    train_text_file_name = 'concatenated_large_normalized_train_abstract.txt'
    val_text_file_name = 'concatenated_balanced_normalized_validation_abstract.txt'
with open(FOLDERNAME + SUBDIR + train_text_file_name, 'r') as f:
    train_text = f.read().strip()
with open(FOLDERNAME + SUBDIR + val_text_file_name, 'r') as f:
    val_text = f.read().strip()

Computing specs. Save the number of processors to pass as `num_workers` into the Datamodule and cuda availability for other flags.

In [9]:
# GPU. Save availability to IS_CUDA_AVAILABLE.
gpu_info= !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
  IS_CUDA_AVAILABLE = False
else:
  print(f"GPU\n{50 * '-'}\n", gpu_info, '\n')
  IS_CUDA_AVAILABLE = True

# Memory.
from psutil import virtual_memory, cpu_count
ram_gb = virtual_memory().total / 1e9
print(f"Memory\n{50 * '-'}\n", 'Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb), '\n')

# CPU.
print(f"CPU\n{50 * '-'}\n", f'CPU Processors: {cpu_count()}')
# Determine the number of workers to use in the datamodule
NUM_PROCESSORS = cpu_count()

GPU
--------------------------------------------------
 Tue Jan 18 20:18:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------

Use notebook name as `wandb` `project` string. Remove the file extension and any "Copy of" or "Kopie van" text which arises from copying notebooks and running in parallel. The `entity` needed for various `wandb` calls is just the `wandb` user name.

In [10]:
from requests import get
PROJECT = get('http://172.28.0.2:9000/api/sessions').json()[0]['name']
PROJECT = PROJECT.replace('.ipynb', '').replace('Kopie%20van%20', '').replace('Copy%20of%20', '')
print(PROJECT)
ENTITY = 'garrett361'

large_abstract_recurrent_one_hot_next_language_model


# Model Testing

Setting hyperparameters and performing a small test run.

Dictionary args for model and datamodule.

In [None]:
model_args_dict = {'seq_len': SEQ_LEN,
                  'tokens': tokens_df,
                  'num_layers': 2,
                  'hidden_size': 512,
                  'rnn_type': 'GRU',
                  'fc_dims': None,
                  'zero_fc_bias_init': True,
                  'truncated_bptt_steps': 128
                  }

data_args_dict = {'seq_len': SEQ_LEN,
                 'train_text': train_text,
                 'val_text': val_text,
                 'tokens': tokens_df, 
                 'num_workers': NUM_PROCESSORS,
                 'batch_size': 128,
                 'pin_memory': IS_CUDA_AVAILABLE,
                 'persistent_workers': True,
                 }

Small test run.

In [None]:
small_data_module = notebook_datamodule(**data_args_dict)
small_data_module.setup()
small_loader = small_data_module.train_dataloader()
small_inputs, small_targets = next(iter(small_loader))
# Print the first few input texts
for input, target in  zip(small_inputs[:3], small_targets[:3]):
    sample_text = notebook_decoder(input, idx_to_text)
    sample_target = ''.join(idx_to_text[ch.item()] for ch in target)
    print(f"input  text: {sample_text}",
          f"target text: {sample_target}",
          f'input, target lens: {len(sample_text), len(sample_target)}',
          sep='\n')
small_model = notebook_model(**model_args_dict)
print('Model layers:', small_model)
small_preds, small_losses, _ = small_model.scores_loss_hiddens(small_inputs, small_targets)
print('\npreds shape:', small_preds.shape)
print('\nactual loss:', small_losses.item())
print('\nexpected approx loss', np.log(len(tokens_df)))

input  text: features can be separated from the turbulent background flow through an analysis founded upon a complex - valued wavelet transform of the trajectory . application of the method to a set of one hundred modeled trajectories shows that the oscillatory motions of lagrangian particles orbiting vortex cores appear to be extracted very well by the method , which depends upon only a handful of free parameters and which requires no operator intervention . furthermore , vortex motions are clearly distinguished from wavelike meandering of the jet - - - the former are high frequency , nearly circular signals , while the latter are linear in polarization and at much lower frequencies . this suggests that the proposed method can be useful for identifying and studying vortex and wave properties in large lagrangian datasets . in particular , the eccentricity of the oscillatory displacement signals , a quantity which is not normally considered in lagrangian studies , emerges as an informat


The `F1` was deprecated since v0.7 in favor of `torchmetrics.classification.f_beta.F1Score`. It will be removed in v0.8.


Metric `AUROC` will save all targets and predictions in buffer. For large datasets this may lead to large memory footprint.


Metric `AveragePrecision` will save all targets and predictions in buffer. For large datasets this may lead to large memory footprint.




preds shape: torch.Size([128, 69, 1024])

actual loss: 4.23386287689209

expected approx loss 4.23410650459726


In [None]:
# pl implements gradient clipping through the Trainer.
small_trainer = Trainer(gpus=-1 if IS_CUDA_AVAILABLE else 0,
                        max_epochs=1,
                        gradient_clip_val=1
                        )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


A `LR finder stopped early due to diverging loss.` here may be due to having too large a batch size, i.e., not enough samples from the datamodule; [see this github discussion](https://github.com/PyTorchLightning/pytorch-lightning/issues/5044)

In [None]:
# small_trainer_lr_finder = small_trainer.tuner.lr_find(small_model, datamodule=small_data_module, min_lr=1e-6, max_lr=1e-1)
# small_trainer_lr_finder_plot = small_trainer_lr_finder.plot(suggest=True)
# small_trainer_suggested_lr = small_trainer_lr_finder.suggestion()
# print(f'Suggested lr: {small_trainer_suggested_lr}')

# Training

In [None]:
cyclic_lr_scheduler_args = {'base_lr': 1e-5,
                            'max_lr': 5e-3,
                            'step_size_up': 2048,
                            'cycle_momentum': False}
plateau_lr_scheduler_args = {'verbose': True,
                             'patience': 512,
                             'factor': .5,
                             'mode': 'min'}

model_args_dict['save_models_to_wandb'] =True
model_args_dict['lr'] = 5e-3
model_args_dict['lr_scheduler'] = 'cyclic'
model_args_dict['lr_scheduler_args'] = cyclic_lr_scheduler_args
model_args_dict['lr_scheduler_monitor'] = 'train_batch_loss'
model = notebook_model(**model_args_dict)

data_args_dict['batch_size'] = 1024
datamodule = notebook_datamodule(**data_args_dict)


Metric `AUROC` will save all targets and predictions in buffer. For large datasets this may lead to large memory footprint.


Metric `AveragePrecision` will save all targets and predictions in buffer. For large datasets this may lead to large memory footprint.



Training:

In [None]:
# We accumulate gradients in batches to help smooth out the loss-curve.
trainer = Trainer(logger=WandbLogger(),
                  gpus=-1 if IS_CUDA_AVAILABLE else 0,
                  log_every_n_steps=1,
                  callbacks=[notebook_wandb_callback()],
                  gradient_clip_val=1,
                  )
with wandb.init(project=PROJECT) as run:
    run.name = f"lr_{model.hparams['lr']}_scheduler_{model_args_dict.get('lr_scheduler', None)}"[:128]
    trainer.fit(model, datamodule=datamodule)
    plt.close("all")


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


  | Name               | Type       | Params
--------------------------------------------------
0 | train_metrics_dict | ModuleDict | 0     
1 | val_metrics_dict   | ModuleDict | 0     
2 | test_metrics_dict  | ModuleDict | 0     
3 | rnn                | GRU        | 2.5 M 
4 | fc_layers          | ModuleList | 35.4 K
--------------------------------------------------
2.5 M     Trainable params
0         Non-trainable params
2.5 M     Total params
10.027    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Saved best val_acc at global step: 0
Epoch: 0
Validation accuracy: 0.00043392181396484375
Validation Loss: 4.23007869720459
Saved best val_loss at global step: 0
Epoch: 0
Validation accuracy: 0.00043392181396484375
Validation Loss: 4.23007869720459


Training: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb6b1d41d40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb6b1d41d40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/pytho

Validating: 0it [00:00, ?it/s]

Saved best val_acc at global step: 1422
Epoch: 0
Validation accuracy: 0.702228307723999
Validation Loss: 0.9881129264831543
Saved best val_loss at global step: 1422
Epoch: 0
Validation accuracy: 0.702228307723999
Validation Loss: 0.9881129264831543


Validating: 0it [00:00, ?it/s]

Saved best val_acc at global step: 2845
Epoch: 1
Validation accuracy: 0.7126151323318481
Validation Loss: 0.9510302543640137
Saved best val_loss at global step: 2845
Epoch: 1
Validation accuracy: 0.7126151323318481
Validation Loss: 0.9510302543640137


Validating: 0it [00:00, ?it/s]

Saved best val_acc at global step: 4268
Epoch: 2
Validation accuracy: 0.7265303730964661
Validation Loss: 0.9057366251945496
Saved best val_loss at global step: 4268
Epoch: 2
Validation accuracy: 0.7265303730964661
Validation Loss: 0.9057366251945496


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Loading Best Models

In [31]:
wandb_api = wandb.Api()
notebook_runs = wandb_api.runs(ENTITY + "/" + PROJECT) 

run_cats = ('best_val_acc', 'name', 'wandb_path', 'timestamp')
runs_sort_cat = 'best_val_acc'
notebook_runs_dict = {key: [] for key in run_cats}


for run in notebook_runs:
    run_json = run.summary._json_dict
    if runs_sort_cat in run_json:
        notebook_runs_dict[runs_sort_cat].append(run_json[runs_sort_cat])
        notebook_runs_dict['name'].append(run.name)
        notebook_runs_dict['wandb_path'].append('/'.join(run.path))
        notebook_runs_dict['timestamp'].append(run_json['_timestamp'])
    
notebook_runs_df = pd.DataFrame(notebook_runs_dict).sort_values(by=runs_sort_cat, ascending=False).reset_index(drop=True)
notebook_runs_df

Unnamed: 0,best_val_acc,name,wandb_path,timestamp


In [26]:
best_model_df = notebook_runs_df.iloc[notebook_runs_df['best_val_acc'].argmax()]
print(best_model_df)

best_val_acc                                             0.729404
name                                    lr_0.005_scheduler_cyclic
wandb_path      garrett361/large_abstract_recurrent_one_hot_ne...
timestamp                                              1642518700
Name: 0, dtype: object


Save the state dicts locally and rebuild the corresponding models.

In [27]:
best_model_file_name = f"model_init_params.pt"
wandb.restore(best_model_file_name,
              run_path=best_model_df.wandb_path,
              replace=True)

ValueError: ignored

In [None]:
# wandb stores None values in the config dict as a string literal. Need to
# fix these entries, annoyingly.
for key, val in best_model_df.config.items():
    if val == 'None':
        best_model_df.config[key] = None
# Write to disk
best_model_file_name = f"model_best_val_acc.pt"
wandb.restore(best_model_file_name,
              run_path=best_model_df.wandb_path,
              replace=True)
best_model_file_name_suffix = '_'.join(best_model_file_name.split('_')[-2:])
# Also copy to the final_models folder
!cp '{best_model_file_name}' "{FOLDERNAME + '/final_models/' + PROJECT + '_' + best_model_file_name_suffix}"

KeyboardInterrupt: ignored

In [None]:
best_model = notebook_model(**{**best_model_df.config, **{'tokens': tokens_df}})
best_model.load_state_dict(torch.load(best_model_file_name))

# Visualize

In [None]:
heatmap = avm.embedding_cosine_heatmap(model=best_model,
                                       words=heatmap_words,
                                       word_to_idx=title_word_to_idx)

In [None]:
pca = avm.pca_3d_embedding_plotter_topk(model=best_model,
                                     words=pca_words,
                                     word_to_idx=title_word_to_idx,
                                     idx_to_word=title_idx_to_word,
                                     title='PCA',
                                     k=5)

In [None]:
tsne = avm.tsne_3d_embedding_plotter_topk(model=best_model,
                                     words=tsne_words,
                                     word_to_idx=title_word_to_idx,
                                     idx_to_word=title_idx_to_word,
                                     title='t-SNE',
                                     k=5)

In [None]:
pca.show()

In [None]:
tsne.show()

In [None]:
avm.embedding_utils.topk_analogies_df(best_model,
                                      'newton mechanics heisenberg'.split(),
                                      title_word_to_idx,
                                      title_idx_to_word)