In [1]:
 !nvidia-smi

Fri Jan  6 18:17:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1B.0 Off |                    0 |
| N/A   34C    P0    56W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:1C.0 Off |                    0 |
| N/A   29C    P0    36W / 300W |      0MiB / 16384MiB |      0%      Defaul

In [2]:
# ! pip install transformers sentencepiece pytorch-lightning

In [3]:
import pytorch_lightning as pl
print(pl.__version__)

1.8.6


In [4]:
import torch
print(torch.__version__)

1.13.1


In [5]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np


In [6]:
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from pytorch_lightning.callbacks import ModelCheckpoint

In [7]:
from transformers import (
     AdamW,
     T5ForConditionalGeneration,
     T5Tokenizer,
     get_linear_schedule_with_warmup
 )
# Seeds all the processes including numpy torch and other imported modules.
pl.seed_everything(0)

Global seed set to 0


0

In [8]:
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger

### Data extraction and loading

In [9]:
data = pd.read_csv("pt_question_answers.csv")

In [10]:
data.columns

Index(['Unnamed: 0', 'pt_post_id', 'pt_post_type_id', 'pt_accepted_answer_id',
       'pt_creation_date', 'pt_score', 'pt_title', 'pt_body', 'pt_tags',
       'pt_parent_id', 'pt_answer'],
      dtype='object')

In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,pt_post_id,pt_post_type_id,pt_accepted_answer_id,pt_creation_date,pt_score,pt_title,pt_body,pt_tags,pt_parent_id,pt_answer
0,27837861,34750268,1,34762233.0,2016-01-12T17:36:25.473,9,Extracting the top-k value-indices from a 1-D ...,<p>Given a 1-D tensor in Torch (<code>torch.Te...,<python><lua><pytorch><torch>,,"<p>As of pull request <a href=""https://github...."
1,30769673,38543850,1,38676842.0,2016-07-23T16:15:43.967,40,How to Display Custom Images in Tensorboard (e...,"<p>The <a href=""https://github.com/tensorflow/...",<python><tensorflow><matplotlib><pytorch><tens...,,<p>It is quite easy to do if you have the imag...
2,33236300,41767005,1,43824857.0,2017-01-20T15:22:08.063,11,Python wheels: cp27mu not supported,"<p>I'm trying to install pytorch (<a href=""htt...",<python><linux><unicode><pytorch>,,"<p>Yes, that is possible. Just create the obje..."
3,33307877,41861354,1,54261158.0,2017-01-25T20:45:35.297,8,Loading Torch7 trained models (.t7) in PyTorch,<p>I am using Torch7 library for implementing ...,<python><lua><pytorch><torch><pre-trained-model>,,<p><code>view()</code> reshapes the tensor wit...
4,33355427,41924453,1,42054194.0,2017-01-29T18:31:24.687,65,PyTorch: How to use DataLoaders for custom Dat...,<p>How to make use of the <code>torch.utils.da...,<python><torch><pytorch>,,<p>While you will not get as detailed informat...


### Tokenization

In [12]:
MODEL_NAME ='t5-base' 
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
data.loc[0, "pt_title"]

'Extracting the top-k value-indices from a 1-D Tensor'

In [14]:
data.loc[0, "pt_body"]

'<p>Given a 1-D tensor in Torch (<code>torch.Tensor</code>), containing values which can be compared (say floating point), how can we extract the indices of the top-<em>k</em> values in that tensor?</p>\n<p>Apart from the brute-force method, I am looking for some API call, that Torch/lua provides, which can perform this task efficiently.</p>\n'

In [15]:
sample_encoding = tokenizer(data.loc[0, "pt_body"])

In [16]:
sample_encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [17]:
print(sample_encoding["input_ids"])
print(sample_encoding["attention_mask"])
print(len(sample_encoding['input_ids']))

[3, 2, 102, 3155, 517, 757, 29, 3, 9, 8218, 308, 3, 324, 7, 127, 16, 3794, 524, 41, 2, 4978, 3155, 17, 127, 524, 5, 382, 35, 7, 127, 2, 87, 4978, 3155, 201, 3, 6443, 2620, 84, 54, 36, 3, 2172, 41, 8735, 12848, 500, 201, 149, 54, 62, 5819, 8, 3, 19082, 7, 13, 8, 420, 18, 2, 15, 51, 3155, 157, 2, 87, 15, 51, 3155, 2620, 16, 24, 3, 324, 7, 127, 58, 2, 87, 102, 3155, 3, 2, 102, 3155, 188, 2274, 45, 8, 18343, 15, 18, 10880, 1573, 6, 27, 183, 479, 21, 128, 6429, 580, 6, 24, 3794, 524, 87, 40, 76, 9, 795, 6, 84, 54, 1912, 48, 2491, 8877, 5, 2, 87, 102, 3155, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
125


In [18]:
len(sample_encoding['attention_mask'])

125

In [19]:
preds = [
      tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for input_id in sample_encoding['input_ids']
]

preds= " ".join(preds)



In [20]:
for wrap in textwrap.wrap(preds, width = 80):
    print(wrap)

 <unk> p > G ive n  a 1- D  ten s or in Tor ch ( <unk> code > t or ch . T en s
or <unk> / code > ),  containing values which can be  compared ( say floating
point ), how can we extract the  indice s of the top - <unk> e m > k <unk> / e m
> values in that  ten s or ? <unk> / p >  <unk> p > A part from the brut e -
force method , I am looking for some API call , that Tor ch / l u a provides ,
which can perform this task efficiently . <unk> / p > </s>


In [21]:
encoding = tokenizer(
     data.loc[0, "pt_title"],
     data.loc[0, "pt_body"],
     max_length=512,
     padding='max_length',
     truncation="only_second",
     return_attention_mask=True,
     add_special_tokens=True,
     return_tensors="pt"
)

In [22]:
encoding.keys()


dict_keys(['input_ids', 'attention_mask'])

In [23]:
# tokenizer.special_tokens_map

In [24]:
tokenizer.eos_token, tokenizer.eos_token_id


('</s>', 1)

In [25]:
tokenizer.decode(encoding['input_ids'].squeeze()) 


'Extracting the top-k value-indices from a 1-D Tensor</s> <unk> p>Given a 1-D tensor in Torch (<unk> code>torch.Tensor<unk> /code>), containing values which can be compared (say floating point), how can we extract the indices of the top-<unk> em>k<unk> /em> values in that tensor?<unk> /p> <unk> p>Apart from the brute-force method, I am looking for some API call, that Torch/lua provides, which can perform this task efficiently.<unk> /p></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

### Creating labels for answers

In [26]:
data.loc[0, "pt_answer"]

'<p>As of pull request <a href="https://github.com/torch/torch7/pull/496" rel="noreferrer">#496</a> Torch now includes a built-in API named <a href="https://github.com/torch/torch7/blob/03c04c6/doc/maths.md#torchtopkresval-resind-x-k-dim-dir-sort" rel="noreferrer"><code>torch.topk</code></a>. Example:</p>\n\n<pre><code>&gt; t = torch.Tensor{9, 1, 8, 2, 7, 3, 6, 4, 5}\n\n-- obtain the 3 smallest elements\n&gt; res = t:topk(3)\n&gt; print(res)\n 1\n 2\n 3\n[torch.DoubleTensor of size 3]\n\n-- you can also get the indices in addition\n&gt; res, ind = t:topk(3)\n&gt; print(ind)\n 2\n 4\n 6\n[torch.LongTensor of size 3]\n\n-- alternatively you can obtain the k largest elements as follow\n-- (see the API documentation for more details)\n&gt; res = t:topk(3, true)\n&gt; print(res)\n 9\n 8\n 7\n[torch.DoubleTensor of size 3]\n</code></pre>\n\n<p>At the time of writing the CPU implementation follows a <a href="https://github.com/wickedfoo/torch7/blob/ef019670474b69629a8b3d50eb426d5858bd5c45/lib

In [27]:
answer_encoding = tokenizer(
     data.loc[0, "pt_answer"],
     max_length=512,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
     add_special_tokens=True,
     return_tensors="pt"
)

In [28]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())


'<unk> p>As of pull request <unk> a href="https://github.com/torch/torch7/pull/496" rel="noreferrer">#496<unk> /a> Torch now includes a built-in API named <unk> a href="https://github.com/torch/torch7/blob/03c04c6/doc/maths.md#torchtopkresval-resind-x-k-dim-dir-sort" rel="noreferrer"><unk> code>torch.topk<unk> /code><unk> /a>. Example:<unk> /p> <unk> pre><unk> code>&gt; t = torch.Tensor<unk> 9, 1, 8, 2, 7, 3, 6, 4, 5<unk> -- obtain the 3 smallest elements &gt; res = t:topk(3) &gt; print(res) 1 2 3 [torch.DoubleTensor of size 3] -- you can also get the indices in addition &gt; res, ind = t:topk(3) &gt; print(ind) 2 4 6 [torch.LongTensor of size 3] -- alternatively you can obtain the k largest elements as follow -- (see the API documentation for more details) &gt; res = t:topk(3, true) &gt; print(res) 9 8 7 [torch.DoubleTensor of size 3] <unk> /code><unk> /pre> <unk> p>At the time of writing the CPU implementation follows a <unk> a href="https://github.com/wickedfoo/torch7/blob/ef0196704

In [29]:
labels = answer_encoding["input_ids"]


In [30]:
labels


tensor([[    3,     2,   102,  3155,   188,     7,    13,  3197,  1690,     3,
             2,     9,     3,   107,    60,    89, 17592,  5948,     7,  1303,
         12651, 16420,     5,   287,    87,    17,   127,   524,    87,    17,
           127,   524,   940,    87,  4801,    40, 13572,  4314,   121,  8318,
         17592,    29,   127,    15,  1010,    52,    49,   121,  3155,  4663,
           591,  4314,     2,    87,     9,  3155,  3794,   524,   230,   963,
             3,     9,  1192,    18,    77,  6429,  2650,     3,     2,     9,
             3,   107,    60,    89, 17592,  5948,     7,  1303, 12651, 16420,
             5,   287,    87,    17,   127,   524,    87,    17,   127,   524,
           940,    87,  4672,   115,    87,  4928,    75,  6348,    75,   948,
            87,  7171,    87,  3357,   107,     7,     5,    51,    26,  4663,
            17,   127,   524,  2916,   157,    60,     7,  2165,    18,    60,
             7,    77,    26,    18,   226,    18,  

In [31]:
labels[labels == 0] = -100
labels 

tensor([[    3,     2,   102,  3155,   188,     7,    13,  3197,  1690,     3,
             2,     9,     3,   107,    60,    89, 17592,  5948,     7,  1303,
         12651, 16420,     5,   287,    87,    17,   127,   524,    87,    17,
           127,   524,   940,    87,  4801,    40, 13572,  4314,   121,  8318,
         17592,    29,   127,    15,  1010,    52,    49,   121,  3155,  4663,
           591,  4314,     2,    87,     9,  3155,  3794,   524,   230,   963,
             3,     9,  1192,    18,    77,  6429,  2650,     3,     2,     9,
             3,   107,    60,    89, 17592,  5948,     7,  1303, 12651, 16420,
             5,   287,    87,    17,   127,   524,    87,    17,   127,   524,
           940,    87,  4672,   115,    87,  4928,    75,  6348,    75,   948,
            87,  7171,    87,  3357,   107,     7,     5,    51,    26,  4663,
            17,   127,   524,  2916,   157,    60,     7,  2165,    18,    60,
             7,    77,    26,    18,   226,    18,  

### Create Dataset

In [32]:
class SODataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 512,
        target_max_token_len: int = 512,
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        source_encoding = tokenizer(
            data_row["pt_title"],
            data_row["pt_body"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        target_encoding = tokenizer(
            data_row["pt_answer"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100
        return dict(
            question=data_row["pt_title"],
            context=data_row["pt_body"],
            answer_text=data_row["pt_answer"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
        )

In [33]:
sample_dataset = SODataset(data, tokenizer)


In [34]:
for sample_data in sample_dataset:
    print("Question: ", sample_data['question'])
    print("Answer text: ", sample_data['answer_text'])
    print("Input_ids: ", sample_data['input_ids'][:10])
    print("Labels: ", sample_data['labels'][:10])
    break 

Question:  Extracting the top-k value-indices from a 1-D Tensor
Answer text:  <p>As of pull request <a href="https://github.com/torch/torch7/pull/496" rel="noreferrer">#496</a> Torch now includes a built-in API named <a href="https://github.com/torch/torch7/blob/03c04c6/doc/maths.md#torchtopkresval-resind-x-k-dim-dir-sort" rel="noreferrer"><code>torch.topk</code></a>. Example:</p>

<pre><code>&gt; t = torch.Tensor{9, 1, 8, 2, 7, 3, 6, 4, 5}

-- obtain the 3 smallest elements
&gt; res = t:topk(3)
&gt; print(res)
 1
 2
 3
[torch.DoubleTensor of size 3]

-- you can also get the indices in addition
&gt; res, ind = t:topk(3)
&gt; print(ind)
 2
 4
 6
[torch.LongTensor of size 3]

-- alternatively you can obtain the k largest elements as follow
-- (see the API documentation for more details)
&gt; res = t:topk(3, true)
&gt; print(res)
 9
 8
 7
[torch.DoubleTensor of size 3]
</code></pre>

<p>At the time of writing the CPU implementation follows a <a href="https://github.com/wickedfoo/torch7/bl

### Split data into train and validation

In [35]:
train_df, val_df = train_test_split(data, test_size=0.05)
train_df.shape,  val_df.shape 

((13863, 11), (730, 11))

### Create PTL Data module class

In [36]:
class SODataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 4,
        source_max_token_len: int = 512,
        target_max_token_len: int = 512,
    ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage):
        self.train_dataset = SODataset(
            self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len
        )
        self.test_dataset = SODataset(
            self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)


In [37]:
BATCH_SIZE = 4
N_EPOCHS = 1
data_module = SODataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup("fit")

### Loading T5 pretrained model

In [38]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pre

In [39]:
input_ids_translated = tokenizer(
    "translate English to German : Oppertunity did not knock until I built a door",
    return_tensors="pt",
).input_ids
generated_ids = model.generate(input_ids=input_ids_translated)



In [40]:
pred_translated = [
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in generated_ids
]

In [41]:
"".join(pred_translated)

'Die Gelegenheit klopfte erst, als ich eine Tür gebaut hatte.'

### Model finetuning

In [42]:
class SOModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0001)
        return optimizer

In [43]:
model = SOModel()

In [44]:
model

SOModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=30

In [45]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)
# logger = TensorBoardLogger("training-logs", name="bio-qa")
logger = TensorBoardLogger("training-logs", name="so-qa")
trainer = pl.Trainer(
    logger = logger,
    callbacks = [checkpoint_callback],
    max_epochs=1,
    gpus=4)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [46]:
# ! pip install -q tensorboard

In [47]:
# %load_ext tensorboard
# %tensorboard --logdir ./training-logs

In [48]:
trainer.fit(model, data_module)

Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
Missing logger folder: training-logs/so-qa
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------

Missing logger folder: training-logs/so-qa
Missing logger folder: training-logs/so-qa
Missing logger folder: training-logs/so-qa
LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditio

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Epoch 0, global step 867: 'val_loss' reached 2.15606 (best 2.15606), saving model to '/home/ubuntu/checkpoints/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.


In [63]:
trainer.test(model, data_module)

Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]



────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           2.1247711181640625
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 2.1247711181640625}]

### test against validation dataset

In [57]:
def generate_answer(question):
    source_encoding = tokenizer(
        question["pt_title"],
        question["pt_body"],
        max_length=512,
        padding="max_length",
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    generated_ids = model.model.generate(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        num_beams=1,  # greedy search
        max_length=80,
        repetition_penalty=2.5,
        early_stopping=True,
        use_cache=True,
    )
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]
    return "".join(preds)

In [58]:
sample_question = val_df.iloc[20]
sample_question

Unnamed: 0                                                        57682206
pt_post_id                                                        74635994
pt_post_type_id                                                          1
pt_accepted_answer_id                                           74638164.0
pt_creation_date                                   2022-12-01T01:32:23.947
pt_score                                                                 1
pt_title                 Pytorch's share_memory_() vs built-in Python's...
pt_body                  <p>Trying to learn about the built-in <a href=...
pt_tags                  <python><pytorch><multiprocessing><shared-memory>
pt_parent_id                                                           NaN
pt_answer                <p><code>datasets.FashionMNIST</code> returns ...
Name: 7551, dtype: object

In [59]:
sample_question["pt_title"]


"Pytorch's share_memory_() vs built-in Python's shared_memory: Why in Pytorch we don't need to access the shared memory-block?"

In [60]:
sample_question["pt_answer"]

'<p><code>datasets.FashionMNIST</code> returns (image, target) where target is index of the target class. So if you want to take the mean you need to extract just the image.</p>\n<pre class="lang-py prettyprint-override"><code>images = torch.vstack([pair[0] for pair in train_dataset])\n</code></pre>\n<p>images should now be of shape (N, H, W) and you can do whatever you want from there.</p>\n<p>Another solution as noted by OP is to use <code>train_dataset.data</code> to directly access the data.</p>\n'

In [61]:
generate_answer(sample_question)

'p>It seems that the PyTorch package is fully compatible with the original module. Then you can use it to access shared memory blocks, which are not used by Python\'s built-in packages. You can do this using:/a> – pre class="lang" rel="nofollow noreferrer", "#'