<a href="https://colab.research.google.com/github/gagan3012/keytotext/blob/master/Notebooks/GPTModelKTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from IPython.display import HTML
from subprocess import getoutput
s = getoutput('nvidia-smi')
if 'K80' in s:
  gpu = 'K80'
elif 'T4' in s:
  gpu = 'T4'
elif 'P100' in s:
  gpu = 'P100'
display(HTML(f"<h2>{gpu}</h2>"))
print(s)
# or simply
!nvidia-smi -L

Fri Mar 19 14:20:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    35W / 250W |   6913MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
import psutil
def get_size(bytes, suffix="B"):
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
print("="*40, "Memory Information", "="*40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}") ; print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}") ; print(f"Percentage: {svmem.percent}%")

Total: 12.72GB
Available: 9.62GB
Used: 4.29GB
Percentage: 24.3%


In [3]:
import os
import datetime
import time
import random

import pandas as pd
import numpy as np
import re

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm

!pip install transformers

from transformers import (
	AdamW,
	GPT2LMHeadModel,
	GPT2Tokenizer, 
	GPT2Config,
	get_linear_schedule_with_warmup
)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 7.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 50.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 56.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=6a3c7ff7c9d

In [4]:
!git clone https://github.com/gagan3012/keytotext

Cloning into 'keytotext'...
remote: Enumerating objects: 127, done.[K
remote: Counting objects: 100% (127/127), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 127 (delta 57), reused 54 (delta 16), pack-reused 0[K
Receiving objects: 100% (127/127), 2.54 MiB | 10.00 MiB/s, done.
Resolving deltas: 100% (57/57), done.


In [5]:
class BOTHData(Dataset):
	def __init__(self, tokenizer, max_source_length,
		max_target_length, type_path):
		super().__init__()
		self.tokenizer = tokenizer
		self.max_source_length = max_source_length
		self.max_target_length = max_target_length
		self.input_ids = []
		self.attn_masks = []
		self._build(type_path)
	
	def __len__(self):
		return len(self.input_ids)
	
	def __getitem__(self, index):

		return self.input_ids[index], self.attn_masks[index]

	def _build(self, type_path):
		if type_path == 'train':
			#df1 = pd.read_csv('/content/keytotext/data/dart_train.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_train.csv')
		elif type_path == 'eval':
			#df = pd.read_csv('/content/keytotext/data/dart_dev.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_dev.csv')
		else:
			#df = pd.read_csv('/content/keytotext/data/dart_test.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_test.csv')
		
	#	df = pd.concat([df1, df2])
		# n = 1 
		# df = df.head(int(len(df)*(n/100)))

		for index, row in df.iterrows():
				line = row['input_text']
				target = row['target_text']
				encodings = self.tokenizer('<|startoftext|>'+ line + ' = ' + target + '<|endoftext|>', truncation=True, max_length=self.max_source_length, padding="max_length")
				self.input_ids.append(torch.tensor(encodings['input_ids']))
				self.attn_masks.append(torch.tensor(encodings['attention_mask']))

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') 

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.



The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [7]:
batch_size = 4
train_dataset = BOTHData(tokenizer, 512, 512, 'train')
val_dataset = BOTHData(tokenizer, 512, 512, 'val')
test_dataset = BOTHData(tokenizer, 512, 512, 'test')

In [8]:
train_dataloader = DataLoader(
			train_dataset,  # The training samples.
			sampler = RandomSampler(train_dataset), # Select batches randomly
			batch_size = batch_size # Trains with this batch size.
		)

validation_dataloader = DataLoader(
			val_dataset, # The validation samples.
			sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
			batch_size = batch_size # Evaluate with this batch size.
		)
test_dataloader = DataLoader(
			test_dataset, # The validation samples.
			sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
			batch_size = batch_size # Evaluate with this batch size.
		)

In [9]:
configuration = GPT2Config.from_pretrained('distilgpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("distilgpt2", config=configuration)

model.resize_token_embeddings(len(tokenizer))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=352833716.0, style=ProgressStyle(descri…




Embedding(50259, 768)

In [10]:
device = torch.device("cuda:0")
model.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [11]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [12]:
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
sample_every = 1000

In [13]:
optimizer = AdamW(model.parameters(),
				  lr = learning_rate,
				  eps = epsilon
				)

In [14]:
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
											num_warmup_steps = warmup_steps, 
											num_training_steps = total_steps)
total_steps

44285

In [15]:
def format_time(elapsed):
	return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [16]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

In [17]:
!pip install wandb

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/33/ae/79374d2b875e638090600eaa2a423479865b7590c53fb78e8ccf6a64acb1/wandb-0.10.22-py2.py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 8.0MB/s 
Collecting pathtools
  Downloading https://files.pythonhosted.org/packages/e7/7f/470d6fcdf23f9f3518f6b0b76be9df16dcc8630ad409947f8be2eb0ed13a/pathtools-0.1.2.tar.gz
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Collecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/99/98019716955ba243657daedd1de8f3a88ca1f5b75057c38e959db22fb87b/GitPython-3.1.14-py3-none-any.whl (159kB)
[K     |██████████████

In [18]:
%env WANDB_PROJECT = keytotext
import wandb

wandb.init(project="keytotext")

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                    labels=b_labels,
                    attention_mask=b_masks,
                    token_type_ids=None
                    )

        loss = outputs[0]

        batch_loss = loss.item()

        total_train_loss += batch_loss
        wandb.log({'Train Loss': batch_loss,'Epoch':epoch_i})
# Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss,
                                                                         elapsed))

            model.eval()

            sample_outputs = model.generate(
                                            bos_token_id=random.randint(1, 30000),
                                            do_sample=True,
                                            top_k=50,
                                            max_length=200,
                                            top_p=0.95,
                                            num_return_sequences=1)
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

# Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

# Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

# ========================================
#               Validation
# ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

# Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                        #                            token_type_ids=None,
                        attention_mask=b_masks,
                        labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss
        wandb.log({'Eval Loss': batch_loss,'Epoch':epoch_i})

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

# Record all statistics from this epoch.
    training_stats.append(
    {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time
    })


env: WANDB_PROJECT=keytotext


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


0it [00:00, ?it/s]


Training...


1000it [03:33,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 1,000  of  8,857. Loss: 0.07072309404611588.   Elapsed: 0:03:33.


1001it [03:33,  3.39it/s]

0:  bipartisan<H> Elliot See <R> status <T> Retired = Elliot See passed away on Sept. 8.


2000it [07:06,  4.68it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 2,000  of  8,857. Loss: 0.21562559902668.   Elapsed: 0:07:06.


2001it [07:06,  3.38it/s]

0:  increasing<H> A-Rosa Luna <R> ship beam <T> 32.2 = The A-Rosa Luna is 32.2m in length and its beam is 32.2m long.


3000it [10:39,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 3,000  of  8,857. Loss: 0.13636453449726105.   Elapsed: 0:10:40.


3001it [10:40,  2.20it/s]

0: day<H> United States <R> ethnic group <T> Asian Americans <H> A Fortress of Grey Ice <R> country <T> United States <H> United States <R> capital <T> Washington D.C. <H> United States <R> demonym <T> Americans <H> A Fortress of Grey Ice <R> country <T> United States = The A Fortress of Grey Ice was written in the U.S. (capital: Washington) in the English country of the United States. Washington, D.C. is the capital city of the United States. Americans live in the United States.


4000it [14:13,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 4,000  of  8,857. Loss: 0.09625663608312607.   Elapsed: 0:14:13.


4001it [14:14,  2.58it/s]

0:  Hang<H> Alison O'Donnell <R> genre <T> Jazz <H> Alison O'Donnell <R> associated band/associated musical artist <T> Mellow Candle <H> Alison O'Donnell <R> associated band/associated musical artist <T> Flibbertigibbet = Alison O'Donnell, whose musical genre is jazz and is associated with the band of Mellow Candle. The band member also performs jazz music.


5000it [17:46,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 5,000  of  8,857. Loss: 0.0676669031381607.   Elapsed: 0:17:47.


5001it [17:46,  3.56it/s]

0:  foods<H> Bandeja paisa <R> ingredient <T> Cooking plantain = Cooking plantain is one of the ingredients in the bandeja paisa.


6000it [21:19,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 6,000  of  8,857. Loss: 0.1234455481171608.   Elapsed: 0:21:20.


6001it [21:20,  2.53it/s]

0:  trail<H> A Fortress of Grey Ice <R> language <T> English language <H> English language <R> spoken in <T> Great Britain <H> A Fortress of Grey Ice <R> country <T> United States <H> United States <R> ethnic group <T> Native Americans in the United States = The book A Fortress of Grey Ice is written in English, which is spoken in Great Britain. One of the ethnic groups is the Native Americans.


7000it [24:52,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 7,000  of  8,857. Loss: 0.09289314597845078.   Elapsed: 0:24:53.


7001it [24:53,  3.42it/s]

0: intend<H> 101 Helena <R> apoapsis <T> 441092000.0 kilometres = 101 Helena has an apoapsis of 441092000.0 km.


8000it [28:25,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 8,000  of  8,857. Loss: 0.1014615148305893.   Elapsed: 0:28:26.


8001it [28:26,  3.32it/s]

0:  surround<H> Adolfo Suarez Madrid-Barajas Airport <R> runway length <T> 3500.0 = The runway length at the airport at Adolfo Suarez Madrid-Barajas Airport is 3500.0.


8857it [31:28,  4.69it/s]
  0%|          | 2/8857 [00:00<10:26, 14.13it/s]


  Average training loss: 0.12
  Training epoch took: 0:31:28

Running Validation...


100%|██████████| 8857/8857 [10:31<00:00, 14.02it/s]
0it [00:00, ?it/s]

  Validation Loss: 0.07
  Validation took: 0:10:32

Training...


1000it [03:32,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 1,000  of  8,857. Loss: 0.06564876437187195.   Elapsed: 0:03:33.


1001it [03:34,  1.90it/s]

0:  reflex<H> 1099 Figneria <R> epoch <T> 2006-12-31 <H> 1099 Figneria <R> orbital period <T> 179942000.0 <H> 1099 Figneria <R> periapsis <T> 349206000000.0 <H> 1099 Figneria <R> escape velocity <T> 0.0155 kilometrePerSeconds <H> 1099 Figneria <R> escape velocity <T> 0.0155 kilometrePerSeconds = 1099 Figneria, which has the epoch date of 31 December 2006, has an orbital period of 179942000. It has a periapsis of 349206000000.0 and an escape velocity of 0.0155 km/s.


2000it [07:06,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 2,000  of  8,857. Loss: 0.07628925889730453.   Elapsed: 0:07:07.


2001it [07:07,  2.54it/s]

0:  display<H> Alan Martin footballer <R> club <T> Hamilton Academical F.C. <H> Clyde F.C. <R> ground <T> Broadwood Stadium <H> Clyde F.C. <R> number of members <T> 23000 = Alan Martin plays football for Clyde F.C. which is part of the Hamilton Academical F.C. club. He has 23,000 members and they played at Broadwood Stadium.


3000it [10:40,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 3,000  of  8,857. Loss: 0.07688961178064346.   Elapsed: 0:10:40.


3001it [10:40,  3.53it/s]

0:  pastor<H> Alan B. Miller Hall <R> architect <T> Robert A. M. Stern = Alan B. Miller Hall was designed by Robert A M Stern.


4000it [14:13,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 4,000  of  8,857. Loss: 0.09746886044740677.   Elapsed: 0:14:14.


4001it [14:14,  3.44it/s]

0:  illicit<H> Aaron S. Daggett <R> battle <T> Battle of Fredericksburg = Aaron S. Daggett fought in the Battle of Fredericksburg.


5000it [17:47,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 5,000  of  8,857. Loss: 0.09212801605463028.   Elapsed: 0:17:47.


5001it [17:47,  2.50it/s]

0:  Liberation<H> Appleton International Airport <R> location <T> Greenville Wisconsin <H> Greenville Wisconsin <R> country <T> United States <H> Greenville Wisconsin <R> is part of <T> Grand Chute Wisconsin <H> Greenville Wisconsin <R> is part of <T> Ellington Wisconsin = Appleton International airport is located in Greenville, Wisconsin, USA. Greenville is part of Ellington, Wisconsin.


6000it [21:21,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 6,000  of  8,857. Loss: 0.08235713094472885.   Elapsed: 0:21:21.


6001it [21:22,  2.05it/s]

0:  Nam<H> Abraham A. Ribicoff <R> spouse <T> Casey Ribicoff <H> Casey Ribicoff <R> birth place <T> Chicago <H> Abraham A. Ribicoff <R> office <T> United States Secretary of Health Education and Welfare <H> Abraham A. Ribicoff <R> death place <T> New York <H> Abraham A. Ribicoff <R> office <T> United States Secretary of Health Education and Welfare = Abraham A Ribicoff, who was married to Casey Ribicoff, who was born in Chicago, USA, worked as the United States Secretary of Health, Education, and Welfare office. He died in New York.


7000it [24:55,  4.68it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 7,000  of  8,857. Loss: 0.04728128761053085.   Elapsed: 0:24:55.


7001it [24:56,  2.56it/s]

0: ION<H> Beef kway teow <R> country <T> Singapore <H> Beef kway teow <R> ingredient <T> Sesame oil <H> Beef kway teow <R> region <T> Indonesia <H> Singapore <R> leader <T> Halimah Yacob = Beef kway teow is a dish from Indonesia and is made from sesame oil. Sesame oil is spoken in Singapore.


8000it [28:29,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 8,000  of  8,857. Loss: 0.06720096617937088.   Elapsed: 0:28:29.


8001it [28:29,  2.60it/s]

0:  glimpse<H> Alan Bean <R> mission <T> Apollo 12 <H> Alan Bean <R> date of retirement <T> June 1981 <H> Alan Bean <R> birth date <T> 1932-03-15 <H> Alan Bean <R> status <T> Retired = Alan Bean, who was born on 1932-03-15 in 1932, was a crew member of Apollo 12. He retired in June 1981.


8857it [31:31,  4.68it/s]
  0%|          | 2/8857 [00:00<10:25, 14.16it/s]


  Average training loss: 0.08
  Training epoch took: 0:31:32

Running Validation...


100%|██████████| 8857/8857 [10:31<00:00, 14.02it/s]
0it [00:00, ?it/s]

  Validation Loss: 0.06
  Validation took: 0:10:32

Training...


1000it [03:32,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 1,000  of  8,857. Loss: 0.042699046432971954.   Elapsed: 0:03:33.


1001it [03:33,  2.27it/s]

0:  Laure<H> Andra singer <R> genre <T> Rhythm and blues <H> Rhythm and blues <R> music subgenre <T> Christian alternative rock <H> Rhythm and blues <R> derivative <T> Disco <H> Andra singer <R> record label <T> Rabadash Records <H> Rhythm and blues <R> stylistic origin <T> Blues = Andra performs in the genre of rhythm and blues, which is in turn, originated from blues music. He recorded with Rabadash Records. The genre of Andra is derived from disco.


2000it [07:06,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 2,000  of  8,857. Loss: 0.07294628024101257.   Elapsed: 0:07:06.


2001it [07:06,  3.22it/s]

0: ism<H> AmeriGas <R> country <T> United States <H> United States <R> ethnic group <T> Native American = AmeriGas works in the United States, a country with an ethnic group of Native American.


3000it [10:39,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 3,000  of  8,857. Loss: 0.06876874715089798.   Elapsed: 0:10:39.


3001it [10:39,  3.10it/s]

0: oun<H> AmeriGas <R> region served <T> United States <H> AmeriGas <R> city <T> King of Prussia Pennsylvania = AmeriGas serves the United States, but the company operates in the city of King of Prussia, Pennsylvania.


4000it [14:12,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 4,000  of  8,857. Loss: 0.059728216379880905.   Elapsed: 0:14:13.


4001it [14:13,  2.42it/s]

0:  election<H> AZ Alkmaar <R> manager <T> John van den Brom <H> John van den Brom <R> club <T> R.S.C. Anderlecht <H> John van den Brom <R> club <T> Netherlands national football team = John van den Brom is manager of AZ Alkmaar and plays for R.S.C. Anderlecht. He also plays for both R.S.C. Anderlecht and Netherlands National football team.


5000it [17:46,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 5,000  of  8,857. Loss: 0.05229993537068367.   Elapsed: 0:17:46.


5001it [17:46,  3.18it/s]

0:  crazy<H> Aaron Turner <R> associated band/associated musical artist <T> Twilight band <H> Aaron Turner <R> associated band/associated musical artist <T> Old Man Gloom = Aaron Turner played for Twilight, who played for Old Man Gloom.


6000it [21:19,  4.68it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 6,000  of  8,857. Loss: 0.05016343295574188.   Elapsed: 0:21:19.


6001it [21:20,  1.75it/s]

0:  bench<H> Al Asad Airbase <R> operating organisation <T> United States Air Force <H> United States Air Force <R> attack aircraft <T> Lockheed AC-130 <H> United States Air Force <R> battle <T> Korean War <H> United States Air Force <R> transport aircraft <T> Boeing C-17 Globemaster III <H> United States Air Force <R> aircraft fighter <T> McDonnell Douglas F-15 Eagle <H> United States Air Force <R> battle <T> Operation Enduring Freedom = The United States Air Force is the operating organisation for Al Asad airbase. The aircraft include Lockheed AC-130 attack aircraft and the McDonnell Douglas F-15 Eagle. The Operation Enduring Freedom is a battle involving the United States Air Force which has the transport aircraft Boeing C-17 Globemaster III transport planes. McDonnell Douglas F-15 Eagle and Operation Enduring Freedom.


7000it [24:53,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 7,000  of  8,857. Loss: 0.10574602335691452.   Elapsed: 0:24:53.


7001it [24:53,  3.52it/s]

0:  incorporated<H> Alcatraz Versus the Evil Librarians <R> media type <T> Print = Alcatraz Versus the Evil Librarians is produced in print.


8000it [28:26,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 8,000  of  8,857. Loss: 0.05633364990353584.   Elapsed: 0:28:27.


8001it [28:26,  3.36it/s]

0: Peter<H> Aleksey Chirikov icebreaker <R> builder <T> Finland = Finland is the home to the icebreaker called the icebreaker called the Aleksey Chirikov.


8857it [31:29,  4.69it/s]
  0%|          | 2/8857 [00:00<10:24, 14.17it/s]


  Average training loss: 0.06
  Training epoch took: 0:31:29

Running Validation...


100%|██████████| 8857/8857 [10:31<00:00, 14.02it/s]
0it [00:00, ?it/s]

  Validation Loss: 0.05
  Validation took: 0:10:32

Training...


1000it [03:32,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 1,000  of  8,857. Loss: 0.05574030429124832.   Elapsed: 0:03:33.


1001it [03:33,  2.19it/s]

0: uring<H> United States <R> demonym <T> Americans <H> United States <R> capital <T> Washington D.C. <H> Atlas II <R> country origin <T> United States <H> United States <R> ethnic group <T> African Americans <H> United States <R> leader title <T> President of the United States = The Atlas II originates from the United States, where Washington DC is the capital and people who live there are called Americans, and where African Americans are one of the ethnic groups. The leader of the United States is called the President of the United States.


2000it [07:06,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 2,000  of  8,857. Loss: 0.05780227109789848.   Elapsed: 0:07:07.


2001it [07:07,  2.52it/s]

0:  reproductive<H> Alex Day <R> associated band/associated musical artist <T> Chameleon Circuit band <H> Alex Day <R> associated band/associated musical artist <T> Charlie McDonnell <H> Alex Day <R> genre <T> Folk music <H> Alex Day <R> active years start year <T> 2006 = Synthpop musician Alex Day became active in 2006 and is associated with Charlie McDonnell. He plays with the band Chameleon Circuit.


3000it [10:40,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 3,000  of  8,857. Loss: 0.07194700092077255.   Elapsed: 0:10:41.


3001it [10:41,  2.72it/s]

0:  zone<H> ACM Transactions on Information Systems <R> abbreviation <T> ACM Trans. Inf. Syst. <H> ACM Transactions on Information Systems <R> issn number <T> 1558-2868 = The Acta Mathematica Hungarica is abbreviated to ACM Trans. Inf. Syst. and ISSN number is 1558-2868.


4000it [14:14,  4.68it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 4,000  of  8,857. Loss: 0.07130850851535797.   Elapsed: 0:14:14.
0:  commits<H> Asam pedas <R> country <T> Indonesia = Asam pedas is a food found in Indonesia.


5000it [17:47,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 5,000  of  8,857. Loss: 0.05967577174305916.   Elapsed: 0:17:48.


5001it [17:48,  3.01it/s]

0:  irony<H> Al Anderson NRBQ band <R> associated band/associated musical artist <T> The Wildweeds <H> Al Anderson NRBQ band <R> genre <T> Rock music = Al Anderson plays with the band NRBQ and was once a member of The Wildweeds.


6000it [21:21,  4.68it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 6,000  of  8,857. Loss: 0.050889741629362106.   Elapsed: 0:21:22.


6001it [21:22,  2.91it/s]

0:  Sah<H> A Severed Wasp <R> publisher <T> Farrar Straus and Giroux <H> Farrar Straus and Giroux <R> country <T> United States = Farrar Straus and Giroux published A Severed Wasp. The book is from the United States.


7000it [24:55,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 7,000  of  8,857. Loss: 0.06217757612466812.   Elapsed: 0:24:55.


7001it [24:56,  2.26it/s]

0:  Bryan<H> William Anders <R> selected by nasa <T> 1963 <H> William Anders <R> nationality <T> United States <H> William Anders <R> status <T> Retired <H> William Anders <R> occupation <T> Fighter pilot <H> William Anders <R> birth place <T> British Hong Kong <H> William Anders <R> mission <T> Apollo 8 = William Anders, a US national was born in British Hong Kong and was selected by NASA in 1963. He was a fighter pilot and crew member of Apollo 8. The retired.


8000it [28:29,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 8,000  of  8,857. Loss: 0.03835288807749748.   Elapsed: 0:28:29.


8001it [28:30,  1.96it/s]

0:  spirits<H> Abilene Regional Airport <R> city served <T> Abilene Texas <H> Abilene Regional Airport <R> runway length <T> 2195.0 <H> Abilene Regional Airport <R> elevation above the sea level <T> 546 <H> Abilene Regional Airport <R> icao location identifier <T> KABI <H> Abilene Regional Airport <R> runway name <T> 17L/35R = Abilene, Texas, is served by the Abilene regional airport. The ICAO Location Identifier of the airport is KABI and it is 546 metres above sea level. The airport's runway, 17L/35R, measures a length of 2195.


8857it [31:32,  4.68it/s]
  0%|          | 2/8857 [00:00<10:34, 13.96it/s]


  Average training loss: 0.05
  Training epoch took: 0:31:33

Running Validation...


100%|██████████| 8857/8857 [10:32<00:00, 14.01it/s]
0it [00:00, ?it/s]

  Validation Loss: 0.04
  Validation took: 0:10:32

Training...


1000it [03:33,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 1,000  of  8,857. Loss: 0.052169542759656906.   Elapsed: 0:03:33.


1001it [03:33,  3.04it/s]

0:  sees<H> Alfa Romeo 164 <R> related mean of transportation <T> Fiat Croma <H> Alfa Romeo 164 <R> related mean of transportation <T> Lancia Thema = The Alfa Romeo 164 and the Fiat Croma, both are similar means of transport.


2000it [07:06,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 2,000  of  8,857. Loss: 0.0425146259367466.   Elapsed: 0:07:07.


2001it [07:07,  2.22it/s]

0:  hungry<H> Aleksandra Kovac <R> genre <T> Rhythm and blues <H> Aleksandra Kovac <R> background <T> solo singer <H> Aleksandra Kovac <R> birth year <T> 1972 <H> Aleksandra Kovac <R> birth place <T> Serbia <H> Aleksandra Kovac <R> background <T> solo singer = Aleksandra Kovac was born in 1972, in Serbia. She began her musical career as a solo singer and has worked with the musical artists, Kornelije Kovac and solo singer.


3000it [10:40,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 3,000  of  8,857. Loss: 0.028636882081627846.   Elapsed: 0:10:40.


3001it [10:40,  3.61it/s]

0:  PT<H> A.S. Roma <R> manager <T> Luciano Spalletti = Luciano Spalletti manages A.S. Roma.


4000it [14:13,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 4,000  of  8,857. Loss: 0.047422487288713455.   Elapsed: 0:14:13.


4001it [14:13,  3.61it/s]

0: ü<H> A.S. Roma <R> chairman <T> James Pallotta = James Pallotta is chairman of A.S. Roma.


5000it [17:46,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 5,000  of  8,857. Loss: 0.05126702040433884.   Elapsed: 0:17:47.


5001it [17:47,  2.15it/s]

0: ruce<H> 11th Mississippi Infantry Monument <R> country <T> United States <H> 11th Mississippi Infantry Monument <R> location <T> Seminary Ridge <H> 11th Mississippi Infantry Monument <R> location <T> Adams County Pennsylvania <H> 11th Mississippi Infantry Monument <R> state <T> Pennsylvania <H> 11th Mississippi Infantry Monument <R> established <T> 2000 <H> 11th Mississippi Infantry Monument <R> category <T> Contributing property = The 11th Mississippi Infantry Monument is in the Seminary Ridge, Adams County, Pennsylvania. It was established in 2000 and falls under the category of Contributing property.


6000it [21:20,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 6,000  of  8,857. Loss: 0.06264246255159378.   Elapsed: 0:21:21.


6001it [21:21,  1.97it/s]

0:  derivatives<H> School of Business and Social Sciences at the Aarhus University <R> city <T> Aarhus <H> School of Business and Social Sciences at the Aarhus University <R> academic staff size <T> 737 <H> School of Business and Social Sciences at the Aarhus University <R> number of students <T> 16000 <H> School of Business and Social Sciences at the Aarhus University <R> country <T> Denmark <H> School of Business and Social Sciences at the Aarhus University <R> established <T> 1928 = The School of Business and Social Sciences at the Aarhus University is located in Aarhus, Denmark. It was established in 1928. It has 737 academic staff and 16,000 students.


7000it [24:54,  4.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 7,000  of  8,857. Loss: 0.05201065540313721.   Elapsed: 0:24:55.


7001it [24:55,  2.88it/s]

0: <H> Buzz Aldrin <R> birth place <T> Glen Ridge New Jersey <H> Buzz Aldrin <R> mission <T> Apollo 11 <H> Buzz Aldrin <R> status <T> Retired = Buzz Aldrin, now retired, was born in Glen Ridge New Jersey and served as a crew member of Apollo 11.


8000it [28:28,  4.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch 8,000  of  8,857. Loss: 0.05199335888028145.   Elapsed: 0:28:29.


8001it [28:29,  3.60it/s]

0:  remembering<H> Abraham A. Ribicoff <R> death place <T> New York City = Abraham A. Ribicoff died in New York City.


8857it [31:31,  4.68it/s]
  0%|          | 2/8857 [00:00<10:26, 14.14it/s]


  Average training loss: 0.04
  Training epoch took: 0:31:32

Running Validation...


100%|██████████| 8857/8857 [10:36<00:00, 13.91it/s]

  Validation Loss: 0.04
  Validation took: 0:10:37





In [19]:
!mkdir model
torch.save(model.state_dict(),'model/pytorch_model.bin')

In [22]:
!cd model
!wget https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json
!wget  https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tokenizer.json

--2021-03-19 14:27:03--  https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.134.13
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.134.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 762 [application/json]
Saving to: ‘distilgpt2-config.json’


2021-03-19 14:27:03 (20.5 MB/s) - ‘distilgpt2-config.json’ saved [762/762]

--2021-03-19 14:27:03--  https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tokenizer.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.134.13
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.134.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1355256 (1.3M) [application/json]
Saving to: ‘distilgpt2-tokenizer.json’


2021-03-19 14:27:04 (3.35 MB/s) - ‘distilgpt2-tokenizer.json’ saved [1355256/1355256]



In [23]:
!mv /content/distilgpt2-config.json /content/model/config.json
!mv /content/distilgpt2-tokenizer.json /content/model/tokenizer.json