# GPT2 Pretrained Model By HuggingFace

In [60]:
!pip install transformers
!pip install torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
import torch

In [61]:
# Load everything from the same pretrained checkpoint so that the tokenizer,
# the bare transformer and the language-modeling variant agree with each other.
checkpoint = 'gpt2'

# Transformer with a language-modeling head on top (used for text generation).
model_generation = GPT2LMHeadModel.from_pretrained(checkpoint)
# Bare transformer: returns hidden states, no task-specific head.
model = GPT2Model.from_pretrained(checkpoint)
# Tokenizer that maps text to the token ids the models expect.
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

# Tip: evaluate `model` on its own in a cell to print the architecture;
# it contains 12 transformer blocks.

In [62]:
# The tokenizer splits the text into tokens and converts them to ids; those ids
# (returned as PyTorch tensors because of return_tensors="pt") are the model input.
prompt = "Why pineapples do not belong on the pizza"
inputs = tokenizer(prompt, return_tensors="pt")

tokenizer_length = len(tokenizer)
print("length of tokenizer: ", tokenizer_length)
print("Inputs: ", inputs)

length of tokenizer:  50257
Inputs:  {'input_ids': tensor([[ 5195, 20161,  1324,   829,   466,   407,  5594,   319,   262, 14256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [63]:
# Calling the model with the tokenizer output invokes its forward() method.
# This is pure inference, so gradient tracking is switched off to avoid
# building an autograd graph that is never used.
with torch.no_grad():
    outputs = model(**inputs)

# Only the first output is of interest here. Its shape is
# (batch_size, sequence_size, embedding_size), and it can be fed to a
# language-modeling head or to other linear layers for a different purpose.
outputs[0].shape

torch.Size([1, 10, 768])

## 1. Embedding the input with GPT-2

In [64]:
# Build GPT-2's input embeddings by hand, then show that the model's own
# token-embedding lookup can be bypassed via `inputs_embeds`.

# Token embeddings only: this lookup does not take the position of the tokens
# in the input into account.
input_embed = model.get_input_embeddings()(inputs['input_ids'])

# Now take the token positions into account: slice the position-embedding table
# (model.wpe) down to the sequence length of the input.
position_embeddings = model.wpe.weight
seq_len = input_embed.size(1)
position_embed = position_embeddings[:seq_len]
print('Before unsqueeze operation shape of position_embed: ', position_embed.shape)
print('After unsqueeze operation shape of position_embed: ', position_embed.unsqueeze(0).shape)

# unsqueeze(0) adds a leading (batch) dimension so that the position embeddings
# have the same number of dimensions as the token embeddings before adding them.
# This sum is what the transformer blocks (model.h) consume as their input.
final_input_embed = position_embed.unsqueeze(0) + input_embed
print("\nfinal_input_embed.shape: ", final_input_embed.shape)

# The token-embedding layer of GPT-2 can be skipped: embedding vectors obtained
# from some other source or model can be passed directly as `inputs_embeds`.
#
# IMPORTANT: `inputs_embeds` replaces only the token-embedding lookup. The model
# still adds its own position embeddings on top of whatever is passed in, so the
# token embeddings (`input_embed`) must be passed here. Passing
# `final_input_embed` would add the position embeddings twice and silently
# produce different hidden states (with the same shape).
with torch.no_grad():  # inference only, no autograd graph needed
    outputs_ = model(inputs_embeds=input_embed)
    # Reference output from the regular path, not skipping the embedding layer.
    outputs = model(input_ids=inputs['input_ids'])
print("shape of output, generated after skipping the embedding layer: ", outputs_[0].shape)
print("shape of output, generated without skipping the embedding layer: ", outputs[0].shape)

# Equal shapes alone prove nothing; both paths must yield the same values.
assert torch.allclose(outputs_[0], outputs[0], atol=1e-5), \
    "inputs_embeds path and input_ids path disagree"
# To skip the embedding layer in practice, one needs to generate the embedding
# vectors from some other model.

Before unsqueeze operation shape of position_embed:  torch.Size([10, 768])
After unsqueeze operation shape of position_embed:  torch.Size([1, 10, 768])

final_input_embed.shape:  torch.Size([1, 10, 768])
shape of output, generated after skipping the embedding layer:  torch.Size([1, 10, 768])
shape of output, generated without skipping the embedding layer:  torch.Size([1, 10, 768])


## 2. Output from every hidden layer

In [65]:
# GPT-2's hidden layers (transformer blocks) are stored in `model.h`. Running
# them one after another gives access to the output of every hidden layer, which
# can then be fed to some other user-defined model.
#
# Iterating over `model.h` directly (instead of a hard-coded range(12)) keeps
# this cell valid for checkpoints with a different number of blocks.
hidden_states = final_input_embed  # token + position embeddings from the cell above
with torch.no_grad():  # inference only, no autograd graph needed
    for block in model.h:
        out = block(hidden_states)  # output of the current hidden layer
        hidden_states = out[0]      # first element is fed to the next layer
print("output shape of the final hidden layer:", hidden_states.shape)
# This output can be used to feed other linear layers, for some other purpose.
# NOTE(review): the full GPT2Model forward pass also applies a final LayerNorm
# (`model.ln_f`) after the last block, so this tensor is presumably not identical
# to `model(...)[0]` - confirm against the HF implementation before comparing.

output shape of the final hidden layer: torch.Size([1, 10, 768])


## 3. Text Generation

In [66]:
#contrastive decoding
#generating 15 tokens
#generate is a predefined function by transformer library used to generate text, if model is a generative model
#Based on the parameter given to generate function, it uses different sampling/decoding methods like greedy,
#nucleus, or contrastive 
output_generated = model_generation.generate(input_ids=inputs['input_ids'], max_length=15, penalty_alpha=0.6,
                                             top_k=6)
tokenizer.decode(output_generated[0]) 
#decoding the generated output from the Langugage modeling Head of the GPT2 model

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Why pineapples do not belong on the pizza menu, they are not'