Note: This notebook is intended to be run in Google Colab so that it can use Colab's compute resources, such as GPUs.

In [None]:
# install dependencies to google colab environment
from google.colab import drive
drive.mount('/content/gdrive')
drive_path = "/content/gdrive/MyDrive/path/to/project/dir"

!pip install pytorch-lightning
!pip install optuna
!pip install wandb
!pip install transformers
!pip install datasets

In [None]:
# print gpu info
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
# import libraries
import gzip
import json
from typing import Callable, List, Tuple, Iterable, Dict, Type, Any
from functools import reduce, lru_cache
from collections import OrderedDict
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"
from tqdm import tqdm

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import DataLoader, random_split
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, PackedSequence
# from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
# from torchtext.data.utils import get_tokenizer

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torchmetrics import MeanSquaredError

import optuna
from optuna.visualization import plot_parallel_coordinate, plot_contour
from optuna.importance import get_param_importances

import wandb

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    AdamW,
    get_linear_schedule_with_warmup,
    # GPT2LMHeadModel,
    AutoConfig,
    pipeline
)
from datasets import load_dataset, DatasetDict, Dataset

# Dataset

In [None]:
# Tokenizer for the "distilgpt2" checkpoint; used below for chunking the text,
# for the end-of-sequence / padding token, and for text generation during validation.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [None]:
def df_with_text_concat(
    df: pd.DataFrame, num_items_concat: int, joining_text: str,
) -> pd.DataFrame:
    """
    Concatenate the rows of `df` in groups, joining the values with `joining_text`.

    A row whose index label is ``i`` is assigned to group ``i // num_items_concat``
    (grouping follows the index labels, not the row positions). Within each group,
    every column is reduced to one string: its values are converted with ``str`` and
    joined, in row order, with `joining_text`.

    The input frame is left untouched, so the function can be called repeatedly on
    the same frame.

    Args:
        df: Frame with an integer index.
        num_items_concat: Width of each index bin; must be at least 1.
        joining_text: Separator inserted between the joined values.

    Returns:
        A new frame with one row per group (ordered by group id), the same columns
        as `df`, and a fresh 0..n-1 index.

    Raises:
        ValueError: If `num_items_concat` is smaller than 1.
    """
    if num_items_concat < 1:
        raise ValueError(
            f"num_items_concat must be >= 1, got {num_items_concat}"
        )

    # Group on an external key array instead of writing a helper column into `df`,
    # which would mutate the caller's frame.
    bin_ids = np.asarray(df.index) // num_items_concat

    def _join_as_text(series: pd.Series) -> str:
        return joining_text.join(str(value) for value in series)

    joined = df.groupby(bin_ids).agg(_join_as_text)
    return joined.reset_index(drop=True)

In [None]:
# Build the raw train/valid datasets: shuffle the abstracts, then glue them together in
# groups of 100 separated by the tokenizer's end-of-sequence token.
SHUFFLE_SEED = 42  # fixed seed so re-running the notebook yields the same documents

raw_datasets = DatasetDict()
for name in ["train", "valid"]:
    df = pd.read_csv(f'{drive_path}/data/abstracts_{name}.csv')
    df = (
        df[["abstract"]]  # only this column is kept, so skip joining the others
        .sample(frac=1, random_state=SHUFFLE_SEED)
        # `df_with_text_concat` bins rows by index label; without a fresh index the
        # shuffled rows would fall back into their original, unshuffled groups.
        .reset_index(drop=True)
    )
    df = df_with_text_concat(
        df, num_items_concat=100, joining_text=tokenizer.eos_token
    )
    ds = Dataset.from_pandas(df)
    raw_datasets[name] = ds

In [None]:
# Preview the first 2000 characters of the first concatenated training document.
raw_datasets["train"][0]["abstract"][:2000]

In [None]:
# Maximum number of tokens per training chunk; also read by `tokenize`.
context_length = 128

# Try the chunking tokenizer call on the first two training documents and report what
# comes back.
sample_texts = raw_datasets["train"][:2]["abstract"]
outputs = tokenizer(
    sample_texts,
    max_length=context_length,
    truncation=True,
    return_length=True,
    return_overflowing_tokens=True,
)

report = (
    ("Input IDs length", len(outputs["input_ids"])),
    ("Input chunk lengths", outputs["length"]),
    ("Chunk mapping", outputs["overflow_to_sample_mapping"]),
)
for label, value in report:
    print(f"{label}: {value}")

In [None]:
def tokenize(element):
    """
    Tokenize a batch of abstracts into chunks of exactly `context_length` tokens.

    The texts in ``element["abstract"]`` are passed to the module-level `tokenizer`
    with truncation and overflow enabled. Only the chunks whose reported length
    equals `context_length` are kept; all other chunks are dropped.

    Returns:
        A dict with a single key, ``"input_ids"``, holding the kept chunks.
    """
    encoded = tokenizer(
        element["abstract"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}


# Apply `tokenize` batch-wise to every split and drop the original columns (taken from
# the train split) so only the columns returned by `tokenize` remain.
source_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
tokenized_datasets

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Collator used by both dataloaders; `mlm=False` selects causal (not masked) language
# modelling, and `return_tensors="pt"` asks for PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")

In [None]:
# Collate the first five training examples and print the shape of each returned tensor.
first_examples = [tokenized_datasets["train"][i] for i in range(5)]
out = data_collator(first_examples)
for key, tensor in out.items():
    print(f"{key} shape: {tensor.shape}")

In [None]:
# Settings shared by the training and validation loaders.
loader_kwargs = dict(
    batch_size=32,
    collate_fn=data_collator,
    num_workers=4,
)

# Training batches are reshuffled; validation batches keep the dataset order.
train_dl = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
val_dl = DataLoader(tokenized_datasets["valid"], shuffle=False, **loader_kwargs)

# Model

In [None]:
class LitCausalLMModel(pl.LightningModule):
    """
    Lightning module that fine-tunes a Hugging Face causal language model.

    Args:
        hf_model_name: Name or path passed to `AutoModelForCausalLM.from_pretrained`.
        total_steps: Number of training steps given to the linear LR schedule; the
            schedule is advanced once per step, so this should be the number of
            optimizer steps of the whole run.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay applied to every parameter that is neither a bias
            nor a layer-norm parameter.
        adam_epsilon: Epsilon for AdamW.
        warmup_steps: Number of warm-up steps of the linear schedule.
    """

    def __init__(
        self,
        hf_model_name: str,
        total_steps: int,
        lr: float = 5e-5,
        weight_decay: float = 0.01,
        adam_epsilon: float = 1e-6,
        warmup_steps: int = 1000,
    ) -> None:
        super().__init__()
        self.hf_model = AutoModelForCausalLM.from_pretrained(hf_model_name)
        # Exposes the constructor arguments as `self.hparams.*`.
        self.save_hyperparameters()

    def forward(self, **inputs):
        """Forward the keyword arguments to the wrapped model and return its output."""
        return self.hf_model(**inputs)

    def training_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Run the model on `batch`, log its loss as ``train_loss`` and return it."""
        loss = self(**batch).loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch: Dict[str, th.Tensor], batch_idx: int):
        """Run the model on `batch`, log its loss as ``val_loss`` and return it."""
        loss = self(**batch).loss
        self.log("val_loss", loss)
        return loss

    def on_validation_epoch_end(self):
        """Generate a continuation of a fixed prompt and print it as a sanity check."""
        # Uses the module-level `tokenizer`. The pipeline is placed on whatever device
        # the module currently lives on instead of assuming CUDA device 0.
        pipe = pipeline(
            "text-generation",
            model=self.hf_model,
            tokenizer=tokenizer,
            device=self.device,
        )
        txt = "We develop a method to"
        gen_text = pipe(txt, num_return_sequences=1)[0]["generated_text"]
        # self.wandb_table.add_data(self.global_step, gen_text)
        # wandb.log({"generated_text": self.wandb_table})
        # self.logger.log_table({"generated_text": self.wandb_table})
        print(gen_text)

    def _layer_norm_param_names(self) -> set:
        """Return the fully qualified names of all `nn.LayerNorm` parameters."""
        names = set()
        for module_name, module in self.hf_model.named_modules():
            if not isinstance(module, nn.LayerNorm):
                continue
            for param_name, _ in module.named_parameters(recurse=False):
                names.add(f"{module_name}.{param_name}" if module_name else param_name)
        return names

    def configure_optimizers(self):
        """
        Build AdamW with two parameter groups and a linear warm-up/decay schedule.

        Biases and layer-norm parameters get no weight decay; everything else uses
        `hparams.weight_decay`. The scheduler is stepped after every training step.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        # Name patterns alone are not enough: they only match models whose norm layers
        # are literally called "LayerNorm" (GPT-2 style models use names like `ln_1`),
        # so the norm parameters are additionally identified by module type.
        layer_norm_params = self._layer_norm_param_names()

        decayed, not_decayed = [], []
        for name, param in self.hf_model.named_parameters():
            exempt = name in layer_norm_params or any(nd in name for nd in no_decay)
            (not_decayed if exempt else decayed).append(param)

        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.hparams.weight_decay},
            {"params": not_decayed, "weight_decay": 0.0},
        ]
        # PyTorch's AdamW is the replacement for the deprecated `transformers.AdamW`.
        optimizer = optim.AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.lr,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [None]:
# Collect unreachable Python objects first so that any tensors they still hold are
# released, and only then ask PyTorch to hand its unused cached CUDA memory back.
import gc
gc.collect()
th.cuda.empty_cache()

In [None]:
wandb.init(project="expt4c_DistilGPT2OnArxivMLData")

# The LR scheduler is stepped once per training step for the whole run, so its horizon
# has to cover every epoch. With only `len(train_dl)` steps the learning rate would
# already reach zero at the end of the first epoch.
num_epochs = 2  # keep equal to the `max_epochs` passed to the Trainer
model = LitCausalLMModel(
    "distilgpt2", total_steps=len(train_dl) * num_epochs, lr=1e-4
)

# Log to Weights & Biases, including model checkpoints, gradients and parameters.
logger = WandbLogger(log_model=True)
logger.watch(model, log="all")

In [None]:
# Fine-tune on the GPU for two epochs, running validation every 1000 training batches,
# then close the Weights & Biases run.
trainer_config = dict(
    max_epochs=2,
    accelerator="gpu",
    logger=logger,
    val_check_interval=1000,
)
trainer = Trainer(**trainer_config)
trainer.fit(model, train_dl, val_dl)
wandb.finish()

In [None]:
# Save into the mounted project directory via `drive_path`, so the target does not
# depend on the notebook's current working directory.
trainer.save_checkpoint(f"{drive_path}/model.ckpt")

In [None]:
# check that model was saved: list the project directory on the mounted drive
# (resolved through `drive_path` rather than relative to the working directory)
from pathlib import Path
p = Path(drive_path)
list(p.iterdir())