In [None]:
# preprocess wikipedia_zh corpus

## load corpus

In [None]:
import numpy as np
class utils(object):
  """Character-level preprocessing helpers for a CSV text corpus.

  The corpus is read with pandas; the text is taken from the second column
  (positional index 1) and every sentence is split into single characters.
  """

  def __init__(self,path=None):
    # Default location of the CSV corpus, used when a method gets no path.
    self.path=path

  def preprocess_wiki(self, path=None):
    """Load the corpus and build the character vocabulary.

    Args:
      path: CSV file to read. Falls back to ``self.path`` when omitted.

    Returns:
      ``(sentence_arr, v2i, i2v, max_len)`` where ``sentence_arr`` is a list
      of character lists (only sentences with 5..100 characters are kept),
      ``v2i`` maps token -> index, ``i2v`` is the inverse mapping and
      ``max_len`` is the length of the longest kept sentence.

    Raises:
      ValueError: if no sentence survives the length filter.
    """
    import pandas as pd  # local import: pandas is only needed for loading

    if path is None:
      path = self.path

    # Rows containing any missing value are dropped before the text column
    # is converted to plain Python strings.
    frame = pd.read_csv(path).dropna()
    texts = frame.iloc[:, 1].astype(str).tolist()

    vocab = set()
    sentence_arr = []
    for text in texts:
      tokenized_text = list(text)  # character-level split (special for Chinese)
      # remove sentences which are too short or too long
      if 5 <= len(tokenized_text) <= 100:
        sentence_arr.append(tokenized_text)
        vocab.update(tokenized_text)

    if not sentence_arr:
      raise ValueError("no sentence with 5 to 100 characters found in %r" % (path,))

    # Index 0 is reserved for padding; real characters start at 1.
    v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)}
    v2i['<PAD>'] = 0
    # <GO> marks the start of a sequence, <SEP> the end. The total number of
    # tokens includes these special tokens: len(v2i).
    v2i["<SEP>"] = len(v2i)
    v2i["<GO>"] = len(v2i)

    i2v = {i: v for v, i in v2i.items()}
    max_len = max(len(sentence) for sentence in sentence_arr)
    return sentence_arr, v2i, i2v, max_len

  def token_to_idx(self,sentence_arr, v2i):
    """Map token lists to index lists wrapped in <GO> ... <SEP>."""
    go_id, sep_id = v2i['<GO>'], v2i['<SEP>']
    return [
      [go_id] + [v2i[item] for item in sentence] + [sep_id]
      for sentence in sentence_arr
    ]

  def pad_zero(self, seqs, max_len):
    """Right-pad index sequences with 0 into an int32 array.

    Args:
      seqs: list of index lists.
      max_len: width of the returned array.

    Returns:
      ``np.ndarray`` of shape ``(len(seqs), max_len)`` and dtype int32.

    Raises:
      ValueError: if a sequence is longer than ``max_len``.
    """
    PAD_ID = 0
    padded = np.full((len(seqs), max_len), fill_value=PAD_ID, dtype=np.int32)
    for row, seq in enumerate(seqs):
      if len(seq) > max_len:
        raise ValueError(
          "sequence %d has length %d, which exceeds max_len=%d" % (row, len(seq), max_len)
        )
      padded[row, :len(seq)] = seq
    return padded

  def get_idx_sentence(self):
    """Run the full pipeline on ``self.path``.

    Returns:
      ``(padded_sentences, v2i)`` where ``padded_sentences`` is a list of
      equal-length index lists and ``v2i`` is the token -> index mapping.
    """
    sentence_arr, v2i, i2v, max_len = self.preprocess_wiki(self.path)
    sentence_idx = self.token_to_idx(sentence_arr, v2i)
    # <GO> and <SEP> are added around every sentence, hence max_len + 2.
    sentence_idx_padded = self.pad_zero(sentence_idx, max_len + 2)

    return sentence_idx_padded.tolist(), v2i



# define the module and gpt class

In [None]:
from torch import Tensor
import torch.nn.functional as f

In [None]:
import torch
from torch import nn

class AttentionHead(nn.Module):
    """One scaled dot-product attention head with learned q/k/v projections."""

    def __init__(self, dim_in: int, dim_k: int, dim_v: int):
        super().__init__()
        # Projections are created in q, k, v order.
        self.q = nn.Linear(dim_in, dim_k)
        self.k = nn.Linear(dim_in, dim_k)
        self.v = nn.Linear(dim_in, dim_v)

    def forward(self, query: Tensor, key: Tensor, value: Tensor, mask: Tensor) -> Tensor:
        """Attend from ``query`` over ``key``/``value``.

        ``mask`` is added to the scaled scores before the softmax. The inputs
        must be 3-D because batched matrix multiplication (``bmm``) is used.
        """
        q_proj = self.q(query)
        k_proj = self.k(key)
        v_proj = self.v(value)

        # Scores are scaled by sqrt of the projected query width.
        scores = torch.bmm(q_proj, k_proj.transpose(1, 2)) / q_proj.size(-1) ** 0.5
        weights = f.softmax(scores + mask, dim=-1)
        return torch.bmm(weights, v_proj)

In [None]:
class MultiHeadAttention(nn.Module):
    """Runs several ``AttentionHead`` modules and mixes their outputs linearly."""

    def __init__(self, num_heads: int, dim_in: int, dim_k: int, dim_v: int):
        super().__init__()
        head_modules = [AttentionHead(dim_in, dim_k, dim_v) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Concatenated head outputs (num_heads * dim_v) are mapped back to dim_in.
        self.linear = nn.Linear(num_heads * dim_v, dim_in)

    def forward(self, query: Tensor, key: Tensor, value: Tensor, mask) -> Tensor:
        """Apply every head with the same ``mask`` and project the concatenation."""
        per_head = []
        for head in self.heads:
            per_head.append(head(query, key, value, mask))
        concatenated = torch.cat(per_head, dim=-1)
        return self.linear(concatenated)

In [None]:
def position_encoding(
    seq_len: int, dim_model: int, device: torch.device = torch.device("cpu"),
) -> Tensor:
    """Build the sinusoidal positional encoding table.

    Channel ``2i`` holds ``sin(pos / 10000^(2i / dim_model))`` and channel
    ``2i + 1`` holds ``cos(pos / 10000^(2i / dim_model))``.

    Args:
        seq_len: number of positions.
        dim_model: number of channels per position.
        device: device on which the table is created.

    Returns:
        Float tensor of shape ``(1, seq_len, dim_model)``.
    """
    pos = torch.arange(seq_len, dtype=torch.float, device=device).reshape(1, -1, 1)
    dim = torch.arange(dim_model, device=device).reshape(1, 1, -1)

    # True division is required here: with floor division the exponent is 0
    # for every channel and all channels would share a single frequency.
    # ``dim - dim % 2`` maps each sin/cos pair (2i, 2i + 1) to the same 2i.
    exponent = (dim - dim % 2).to(torch.float) / dim_model
    phase = pos / (1e4 ** exponent)

    return torch.where(dim % 2 == 0, torch.sin(phase), torch.cos(phase))

In [None]:
def feed_forward(dim_input: int = 512, dim_feedforward: int = 2048) -> nn.Module:
    """Return the position-wise feed-forward stack: Linear -> ReLU -> Linear.

    The first layer widens ``dim_input`` to ``dim_feedforward``; the last one
    maps back to ``dim_input``.
    """
    expand = nn.Linear(dim_input, dim_feedforward)
    activation = nn.ReLU()
    project = nn.Linear(dim_feedforward, dim_input)
    return nn.Sequential(expand, activation, project)

In [None]:
class Residual(nn.Module):
    """Residual wrapper for a sublayer called as ``sublayer(t0, t1, t2, mask)``.

    The mask is not stored at construction time; it is supplied on every
    forward call and handed straight to the sublayer.
    """

    def __init__(self, sublayer: nn.Module, dimension: int, dropout: float = 0.1):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tensors0: Tensor, tensors1: Tensor, tensors2: Tensor, mask: Tensor) -> Tensor:
        """Return ``norm(tensors0 + dropout(sublayer(...)))``.

        The skip connection uses the first tensor argument (``tensors0``).
        """
        sublayer_out = self.sublayer(tensors0, tensors1, tensors2, mask)
        skip_sum = tensors0 + self.dropout(sublayer_out)
        return self.norm(skip_sum)

In [None]:
## Residual wrapper for the feed-forward network. `Residual` needs a mask,
## the feed-forward sublayer does not, so the two wrappers are kept separate.
class Residual_ffn(nn.Module):
    """Residual wrapper for a single-input sublayer (no mask involved)."""

    def __init__(self, sublayer: nn.Module, dimension: int, dropout: float = 0.1):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tensors: Tensor) -> Tensor:
        """Return ``norm(tensors + dropout(sublayer(tensors)))``."""
        sublayer_out = self.sublayer(tensors)
        skip_sum = tensors + self.dropout(sublayer_out)
        return self.norm(skip_sum)

In [None]:
class TransformerEncoderLayer(nn.Module):
    """One encoder layer: masked self-attention followed by a feed-forward
    network, each wrapped in a residual + layer-norm block."""

    def __init__(
        self,
        dim_model: int = 512,
        num_heads: int = 6,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
    ):
        super().__init__()
        # Every head uses the same key and value width.
        head_dim = dim_model // num_heads

        self_attention = MultiHeadAttention(num_heads, dim_model, head_dim, head_dim)
        self.attention = Residual(self_attention, dimension=dim_model, dropout=dropout)

        ffn = feed_forward(dim_model, dim_feedforward)
        self.feed_forward = Residual_ffn(ffn, dimension=dim_model, dropout=dropout)

    def forward(self, src: Tensor,mask: Tensor) -> Tensor:
        """Self-attend over ``src`` with ``mask``, then apply the feed-forward block."""
        attended = self.attention(src, src, src, mask)
        return self.feed_forward(attended)

In [None]:
class TransformerEncoder(nn.Module):
    """Stack of ``TransformerEncoderLayer`` modules with sinusoidal positions.

    Args:
        num_layers: number of stacked encoder layers.
        dim_model: channel width of the input and output.
        num_heads: attention heads per layer.
        dim_feedforward: hidden width of each feed-forward block.
        dropout: dropout probability used inside every layer.
        device: stored on the instance as ``self.device``; the forward pass
            follows the device of its input instead.
    """

    def __init__(
        self,
        num_layers: int = 6,
        dim_model: int = 512,
        num_heads: int = 8,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        device: str = 'cpu'
    ):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(dim_model, num_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        self.device= device

    def forward(self, src: Tensor, mask: Tensor) -> Tensor:
        """Add positional encodings to ``src`` and run it through every layer.

        Args:
            src: input of shape ``(batch, seq_len, dim)``; sizes 1 and 2 are
                read as sequence length and channel width.
            mask: additive attention mask passed unchanged to each layer.

        Returns:
            The output of the last layer.
        """
        seq_len, dimension = src.size(1), src.size(2)
        # Build the table on the input's own device and dtype. Picking
        # "cuda if available" breaks a CPU-resident model on a CUDA machine.
        pos = position_encoding(seq_len, dimension, device=src.device).to(dtype=src.dtype)
        # Out-of-place add: the caller's tensor must not be modified.
        out = src + pos
        for layer in self.layers:
            out = layer(out, mask)

        return out

In [None]:
# start a trial of gpt model testing
class GPT(nn.Module):
    """Token embedding -> transformer encoder stack -> vocabulary logits.

    ``activation`` is accepted in the constructor but is not referenced
    anywhere in this class.
    """

    def __init__(
        self,
        num_encoder_layers: int = 4,
        dim_model: int = 512,
        num_heads: int = 8,
        dim_feedforward: int = 2048//2,
        dropout: float = 0.1,
        activation: nn.Module = nn.ReLU(),
        n_vocab: int=4,
        device: str = 'cpu'
    ):
        super().__init__()

        # Submodules are registered in the order embedding, encoder, out.
        self.embedding = nn.Embedding(n_vocab, dim_model)

        encoder_kwargs = dict(
            num_layers=num_encoder_layers,
            dim_model=dim_model,
            num_heads=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            device=device,
        )
        self.encoder = TransformerEncoder(**encoder_kwargs)

        self.out = nn.Linear(dim_model, n_vocab)

    def forward(self, src: Tensor, mask: Tensor) -> Tensor:
        """Return unnormalised logits over the vocabulary for every position.

        No softmax is applied here; the cross-entropy loss takes care of it.
        """
        embedded = self.embedding(src)
        encoded = self.encoder(embedded, mask)
        return self.out(encoded)

# train model

In [None]:
# Generates, for every sequence, a square mask in which each row allows one more token to be seen

def generate_masks(src):
    """Build additive causal + padding attention masks for a batch.

    Entry ``[b, i, j]`` is ``0.0`` when query position ``i`` of sample ``b``
    may attend to key position ``j`` and ``-1e9`` otherwise. A key is visible
    when ``j <= i`` (causal) and ``j`` is below the number of non-zero ids in
    the row. Padding is therefore assumed to use id 0 and to sit at the end
    of each row.

    Args:
        src: integer tensor of shape ``(batch, seq_len)``.

    Returns:
        Float tensor of shape ``(batch, seq_len, seq_len)`` on ``src``'s device.
    """
    batch_size, seq_len = src.size(0), src.size(1)

    # Number of real (non-pad) tokens per row.
    lengths = src.count_nonzero(dim=1)
    positions = torch.arange(seq_len, device=src.device)

    # causal[i, j] is True when key j is not in the future of query i.
    causal = positions.unsqueeze(0) <= positions.unsqueeze(1)
    # not_pad[b, j] is True when key j of sample b is a real token. Comparing
    # against the length avoids the ``mask[:, -0:]`` slice, which selects
    # every column for a row without padding.
    not_pad = positions.unsqueeze(0) < lengths.unsqueeze(1)

    # Broadcasting gives each sample its own mask, with no tensor shared
    # between samples.
    allowed = causal.unsqueeze(0) & not_pad.unsqueeze(1)

    masks = torch.zeros(batch_size, seq_len, seq_len, device=src.device)
    masks.masked_fill_(~allowed, -1e9)
    return masks

In [None]:
import time

######
def loss_masked(output, src, loss_fn):
    """Average the per-sequence loss over a batch, ignoring padded targets.

    Only the first ``k`` positions of each row contribute, where ``k`` is the
    number of non-zero target ids in that row. Padding is therefore assumed
    to use id 0 and to sit at the end of each row.

    Args:
        output: model scores of shape ``(batch, seq_len, n_classes)``.
        src: target ids of shape ``(batch, seq_len)``.
        loss_fn: callable ``loss_fn(scores, targets)`` applied per sequence.

    Returns:
        Mean of the per-sequence losses. Rows without any real target are
        skipped; if every row is empty a zero that is still attached to
        ``output``'s graph is returned.
    """
    lengths = src.count_nonzero(dim=1).tolist()

    per_sequence = [
        loss_fn(output[row, :count], src[row, :count])
        for row, count in enumerate(lengths)
        if count > 0  # a loss over zero positions would be NaN
    ]

    if not per_sequence:
        return output.sum() * 0.0

    return sum(per_sequence) / len(per_sequence)
######

def train(model, data, batch_size):
    """Train ``model`` for one pass over ``data`` on next-token prediction.

    The model is wrapped in ``nn.DataParallel`` and optimised with SGD
    (learning rate 1e-4). Training runs on ``cuda:0`` when CUDA is available
    and on the CPU otherwise. Trailing samples that do not fill a whole batch
    are not used.

    Args:
        model: module called as ``model(input_ids, masks)``.
        data: list of equal-length id sequences.
        batch_size: number of sequences per optimisation step.

    Side effects:
        Prints the loss and the step duration for every step, writes the loss
        of every 10000th step to ``./model_dir/gpt_loss_<step>.csv`` and
        saves the wrapped model to ``./model_dir/model.pkl`` at the end.
    """
    import os  # local import: only needed to create the output directory

    torch.manual_seed(0)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    model = nn.DataParallel(model)
    if use_cuda:
        torch.cuda.set_device(0)
    model.to(device)

    # define loss function (criterion) and optimizer
    loss_fn = nn.CrossEntropyLoss().to(device)  # this is for classification
    opt = torch.optim.SGD(model.parameters(), 1e-4)

    # The loss files and the final model are written here; make sure the
    # directory exists before the first write.
    os.makedirs('./model_dir', exist_ok=True)

    n = len(data) // batch_size

    for i in range(n):
        t0 = time.time()
        src = torch.tensor(data[batch_size*i:batch_size*(i+1)]).long()

        # prepare for next word prediction: inputs drop the last token,
        # targets drop the first one
        _seq = src[:, :-1]
        seq_ = src[:, 1:]
        masks = generate_masks(_seq)

        # move the batch to the training device
        _seq = _seq.to(device, non_blocking=True)
        seq_ = seq_.to(device, non_blocking=True)
        masks = masks.to(device, non_blocking=True)

        # Forward pass
        outputs = model(_seq, masks)
        loss = loss_masked(outputs, seq_, loss_fn)  # next word prediction

        opt.zero_grad()
        loss.backward()
        opt.step()

        loss_value = loss.detach().item()
        print("loss is:", loss_value)
        t1 = time.time()
        print(t1-t0)

        if i % 10000 == 0:
            np.savetxt('./model_dir/gpt_loss_%d.csv'%(i), np.array([loss_value]))

    # Save the whole (DataParallel-wrapped) model object after training.
    # NOTE(review): this pickles the object itself, so loading it requires
    # the model classes to be defined and the file to be trusted.
    torch.save(model, "./model_dir/model.pkl")
    
    
    
def train_mp(model, data, batch_size):
    """Mixed-precision variant of the training loop (one pass over ``data``).

    The model is wrapped in ``nn.DataParallel`` and optimised with SGD
    (learning rate 1e-4). With CUDA available the forward pass runs under
    autocast and gradients are scaled with ``GradScaler``; without CUDA both
    are disabled and training runs on the CPU in full precision. Trailing
    samples that do not fill a whole batch are not used.

    Args:
        model: module called as ``model(input_ids, masks)``.
        data: list of equal-length id sequences.
        batch_size: number of sequences per optimisation step.

    Side effects:
        Prints the loss and the step duration for every step. Every 10000th
        step the loss is written to ``./gpt_loss_<step>.csv`` and the wrapped
        model is saved to ``./model.pkl``; the model is saved once more when
        training has finished.
    """
    torch.manual_seed(0)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    model = nn.DataParallel(model)
    if use_cuda:
        torch.cuda.set_device(0)
    model.to(device)

    # define loss function (criterion) and optimizer
    loss_fn = nn.CrossEntropyLoss().to(device)  # this is for classification
    opt = torch.optim.SGD(model.parameters(), 1e-4)

    n = len(data) // batch_size

    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)
    for i in range(n):
        t0 = time.time()
        src = torch.tensor(data[batch_size*i:batch_size*(i+1)]).long()

        # prepare for next word prediction: inputs drop the last token,
        # targets drop the first one
        _seq = src[:, :-1]
        seq_ = src[:, 1:]
        masks = generate_masks(_seq)

        # move the batch to the training device
        _seq = _seq.to(device, non_blocking=True)
        seq_ = seq_.to(device, non_blocking=True)
        masks = masks.to(device, non_blocking=True)

        opt.zero_grad()
        # Casts operations to mixed precision
        with torch.cuda.amp.autocast(enabled=use_cuda):
            # Forward pass
            outputs = model(_seq, masks)
            loss = loss_masked(outputs, seq_, loss_fn)  # next word prediction

        # Scales the loss, and calls backward() to create scaled gradients
        scaler.scale(loss).backward()

        # Unscales gradients and calls or skips optimizer.step()
        scaler.step(opt)

        # Updates the scale for next iteration
        scaler.update()

        loss_value = loss.detach().item()
        print("loss is:", loss_value)
        t1 = time.time()
        print(t1-t0)

        if i % 10000 == 0:
            np.savetxt('./gpt_loss_%d.csv'%(i), np.array([loss_value]))
            # periodic checkpoint of the whole wrapped model
            torch.save(model, "./model.pkl")

    # Save once more after the last step so the updates made since the most
    # recent checkpoint are not lost.
    torch.save(model, "./model.pkl")

In [None]:
# the entry to start training the model, with data, with specified parameter
if __name__ == '__main__':
    # pick the GPU when one is visible, otherwise fall back to the CPU
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # model hyper-parameters
    MODEL_DIM, N_LAYER, N_HEAD = 256, 4 * 3, 8

    # training argument
    batch_size = 2

    # load the corpus and turn it into padded index sequences
    ut = utils(path='./file00')
    d, v2i = ut.get_idx_sentence()
    n_vocab = len(v2i)

    # build the model with the vocabulary size taken from the data
    m = GPT(
        num_encoder_layers=N_LAYER,
        dim_model=MODEL_DIM,
        num_heads=N_HEAD,
        n_vocab=n_vocab,
        device=device,
    )

    # start training
    train(m, d, batch_size)

In [None]:
# clear gpu and cpu memory
import gc
# Collect unreachable Python objects first so that any tensors they hold are
# released, then return the freed blocks from PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

# load saved model

In [None]:
# Load the model object written by torch.save(model, ...) during training.
# The whole object was pickled, so the model classes must already be defined
# in this session before this cell runs.
# NOTE(review): torch.load unpickles the file; only load files you trust.
# NOTE(review): recent PyTorch releases may default to weights_only=True, which
# would reject a fully pickled model; confirm against the installed version.
import torch
PATH = './model_dir/model.pkl'
model = torch.load(PATH)
# Switch to inference mode before evaluating (currently disabled):
#model.eval()

In [None]:
#model.parameters()