In [1]:
# Notebook-wide imports and device selection.
import torch
from torch import nn
# NOTE(review): the star import is presumably what brings `data` and `datasets`
# into scope for the data-loading cell; an explicit
# `from torchtext import data, datasets` would make that dependency visible.
from torchtext import *
import spacy
import string
import torchtext
import math
import copy
from torch import optim
import torch.nn.functional as F
# Record the library versions this notebook was executed with.
print(f"Torch version is {torch.__version__} Torchtext version is {torchtext.__version__}")
from pytorch_transformers import OpenAIGPTModel,OpenAIGPTConfig,OpenAIGPTPreTrainedModel
# Batches and the model are both moved to this device by later cells.
device = torch.device("cuda")
# NOTE(review): duplicate of the `import torch` at the top of this cell (harmless).
import torch
import json
from tqdm import tqdm


Torch version is 1.2.0 Torchtext version is 0.4.0


In [2]:
# tokenizer_func = lambda s :s.split()
# Field describing how raw text is tokenized (spaCy) and batched (batch dimension first).
TEXT = data.Field(tokenize="spacy", batch_first=True)

# WikiText-2 language-modelling splits; the vocabulary is built from the training split only.
train_data, val_data, test_data = datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_data)
print(f"The vocab size is {len(TEXT.vocab.stoi)}")

# BPTT iterators yield batches with `.text` and `.target` attributes (consumed by the
# training / evaluation loops below), 50 tokens per sequence.
# NOTE(review): 64*8 looks like 64 sequences for each of the 8 GPUs used later — confirm.
train_iter = data.BPTTIterator(
    train_data,
    batch_size=64 * 8,
    shuffle=True,
    bptt_len=50,
    train=True,
    device=device,
)
val_iter = data.BPTTIterator(
    val_data,
    batch_size=64 * 8,
    shuffle=True,
    bptt_len=50,
    train=False,
    device=device,
)

The vocab size is 33243


In [3]:
# Size the GPT embedding / output layer from the vocabulary itself.
# `len(TEXT.vocab)` is used instead of the length of `TEXT.vocab.stoi`: the
# string-to-index mapping can gain entries when unknown tokens are looked up
# (presumably a defaultdict in this torchtext version — TODO confirm), which
# would make a re-run of this cell disagree with the trained embedding size.
config = OpenAIGPTConfig(vocab_size_or_config_json_file=len(TEXT.vocab))

In [4]:
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    """GPT transformer followed by a bias-free linear language-modelling head.

    The head projects hidden states of width ``config.n_embd`` onto
    ``config.vocab_size`` logits and shares (or clones) its weights with the
    transformer's token embedding.
    """

    def __init__(self, config):
        super().__init__(config)
        self.transformer = OpenAIGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()
        self.tie_weights()

    def tie_weights(self):
        """Share the input embedding matrix with the output projection.

        Delegates to ``_tie_or_clone_weights``; per the original note, TorchScript
        export cannot handle parameter sharing, so the weights are cloned there.
        """
        self._tie_or_clone_weights(self.lm_head, self.transformer.tokens_embed)

    def forward(self, input_ids, labels=None):
        """Run the transformer and the LM head, optionally computing the loss.

        Args:
            input_ids: token ids fed to the transformer.
            labels: optional target ids. No shifting is applied here, so the
                caller must pass targets already aligned with the logits
                (position ``i`` of ``labels`` is the token to predict at
                position ``i``). Entries equal to ``-1`` are ignored by the loss.

        Returns:
            A tuple ``(lm_logits, *extras)`` where ``extras`` are the remaining
            transformer outputs; when ``labels`` is given the cross-entropy loss
            is prepended: ``(loss, lm_logits, *extras)``.
        """
        transformer_outputs = self.transformer(input_ids)
        lm_logits = self.lm_head(transformer_outputs[0])
        extras = transformer_outputs[1:]

        if labels is None:
            return (lm_logits,) + extras

        # Flatten every position into one row so the loss sees (N, vocab) vs (N,).
        vocab_dim = lm_logits.size(-1)
        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        loss = criterion(lm_logits.view(-1, vocab_dim), labels.view(-1))
        return (loss, lm_logits) + extras
# Build the language model and replicate it across GPUs.
# NOTE(review): device ids 0-7 and the "cuda:1" output device are hard-coded, so this
# cell presumably requires an 8-GPU machine — confirm before running elsewhere.
model = OpenAIGPTLMHeadModel(config)
model = nn.DataParallel(
    model,
    device_ids=list(range(8)),
    output_device=torch.device("cuda:1"),
)
model = model.to(device)

In [5]:
import numpy as np

class Stepper():
    """Walk a value from `start` to `end` over `n_iter` calls to `step`.

    `val` is a `(start, end)` pair and `func(start, end, pct)` is the annealing
    function that maps the completed fraction `pct` to the scheduled value.
    """

    def __init__(self, val, n_iter:int, func):
        first, last = val
        self.start = first
        self.end = last
        # At least one iteration, so the fraction in `step` never divides by zero.
        self.n_iter = max(1, n_iter)
        self.func = func
        self.n = 0

    def step(self):
        """Advance the counter by one and return the scheduled value for it."""
        self.n += 1
        fraction_done = self.n / self.n_iter
        return self.func(self.start, self.end, fraction_done)

    @property
    def is_done(self):
        """`True` once `step` has been called at least `n_iter` times."""
        return self.n_iter <= self.n
    
# Annealing functions
def annealing_no(start, end, pct):
    """Constant schedule: ignore `end` and `pct` and hand back `start` unchanged."""
    constant_value = start
    return constant_value
  
def annealing_linear(start, end, pct):
    """Straight-line interpolation: `start` at pct=0.0, `end` at pct=1.0."""
    span = end - start
    return start + pct * span
  
def annealing_exp(start, end, pct):
    """Geometric interpolation: `start` at pct=0.0, `end` at pct=1.0."""
    ratio = end / start
    return start * ratio ** pct

def annealing_cos(start, end, pct):
    """Half-cosine interpolation: `start` at pct=0.0, easing into `end` at pct=1.0."""
    # cos(pi * pct) + 1 falls from 2 to 0 as pct goes from 0 to 1.
    cos_out = np.cos(np.pi * pct) + 1
    half_range = (start - end) / 2
    return end + half_range * cos_out


In [6]:
class OneCyclePolicy:
    """Train a model with a two-phase ("one cycle") learning-rate / momentum schedule.

    With ``n = num_iteration * epochs`` optimizer steps in total:

    * phase 1 (``int(n * pct_start)`` steps, linear): the learning rate rises from
      ``max_lr / div_factor`` to ``max_lr`` while momentum moves from
      ``momentum[0]`` to ``momentum[1]``;
    * phase 2 (remaining steps, cosine): the learning rate falls from ``max_lr``
      to ``max_lr / div_factor / 10`` while momentum moves back to ``momentum[0]``.

    Args:
        model: module called as ``model(text, labels=labels)`` whose first output
            is the loss (one value per replica under ``DataParallel``).
        optimizer: optimizer whose ``lr`` and momentum / beta1 are driven by the
            schedule; any initial values are overwritten in the constructor.
        num_iteration: number of batches per epoch.
        momentum: ``(high, low)`` momentum pair.
        div_factor: ratio between ``max_lr`` and the starting learning rate.
        pct_start: fraction of all steps spent in phase 1.
        save_model_filename: where the best (lowest validation loss) weights go.
        log_file_name: text file that every logged line is appended to.
        epochs: number of epochs run by ``train_evaluate``.
        max_lr: peak learning rate.
    """

    def __init__(self,model , optimizer  ,num_iteration, momentum = (0.95,0.85) , div_factor=25 ,
                 pct_start=0.4,
                 save_model_filename= "hindi_openai_lm_saved_model_weights.pt" ,
                 log_file_name="hindi_openai_lm_train_log.txt",epochs=100,max_lr=0.01):

        self.model = model
        self.optimizer = optimizer
        # Not used by this class (the model returns its own loss); kept as a
        # public attribute in case external code reads it.
        self.criterion=nn.CrossEntropyLoss()
        self.save_model_filename = save_model_filename
        self.log_file_name = log_file_name
        self.num_epochs = epochs

        # Split the total number of optimizer steps between the two phases.
        n = num_iteration * self.num_epochs
        a1 = int(n*pct_start)
        a2 = n-a1
        self.phases = ((a1 , annealing_linear) , (a2 , annealing_cos))
        min_lr = max_lr/div_factor
        self.lr_scheds = self.steps((min_lr,max_lr) , (max_lr,min_lr/1e1))
        self.mom_scheds =self.steps(momentum , momentum[::-1])
        self.idx_s = 0  # index of the phase currently being stepped
        self.lr ,self.mom=None,None
        self.update_lr_mom(self.lr_scheds[0].start,self.mom_scheds[0].start)
        self.best_valid_loss = float('inf')

    def logger(self,*args,**kwargs):
        """Print to stdout and append the same line to ``self.log_file_name``."""
        print(*args,**kwargs)
        with open(self.log_file_name,"a") as f:  # appends to file and closes it when finished
            print(*args, file=f, **kwargs)

    def steps(self, *steps):
        "Build one `Stepper` per phase for the given `(start, end)` pairs."
        return [Stepper(step, n_iter, func=func)for (step,(n_iter,func)) in zip(steps, self.phases)]

    def train_evaluate(self, trainLoader , validLoader ):
        """Run ``self.num_epochs`` epochs of training followed by validation.

        After each epoch the weights are saved to ``self.save_model_filename``
        whenever the validation loss improves on the best seen so far.
        """
        for epoch in tqdm(range(self.num_epochs),desc="Epochs"):
            self.logger(f"For Epoch [{epoch}|{self.num_epochs}] : ")
            self.logger(f"\nLearning Rate is {self.lr} momentum is {self.mom}")
            train_loss = self.train(trainLoader)
            self.logger(f'\nTrain Loss: {train_loss:.3f}')

            valid_loss = evaluate(self.model, validLoader)
            self.logger(f'\nVal. Loss: {valid_loss:.3f}' )

            if valid_loss < self.best_valid_loss:
                self.best_valid_loss = valid_loss
                # Save the wrapped module's weights when the model is wrapped
                # (e.g. DataParallel), otherwise the model's own.
                to_save = getattr(self.model, "module", self.model)
                torch.save(to_save.state_dict(), self.save_model_filename)

    def update_lr_mom(self,lr=0.001,mom=0.99):
        """Write `lr` and `mom` into every parameter group of the optimizer.

        For Adam / Adamax `mom` replaces beta1 and the group's existing beta2 is
        kept; for SGD it replaces ``momentum``. Other optimizers only get `lr`.
        """
        adam_like = isinstance(self.optimizer , ( torch.optim.Adamax,torch.optim.Adam))
        sgd_like = isinstance(self.optimizer, torch.optim.SGD)
        for group in self.optimizer.param_groups:
            group["lr"] = lr
            if adam_like:
                # Keep the user's beta2 instead of forcing it back to 0.999.
                group["betas"] = (mom, group["betas"][1])
            elif sgd_like:
                group["momentum"] = mom
        self.lr, self.mom = lr, mom

    def _advance_schedule(self):
        "Step the current phase once; hold the last values when every phase is finished."
        if self.idx_s >= len(self.lr_scheds):
            return
        lr_sched = self.lr_scheds[self.idx_s]
        mom_sched = self.mom_scheds[self.idx_s]
        self.update_lr_mom(lr_sched.step(), mom_sched.step())
        if lr_sched.is_done:
            self.idx_s += 1

    def train(self, iterator):
        """Train for one pass over `iterator` and return the mean batch loss.

        Each batch must expose ``.text`` and ``.target``. The schedule is advanced
        once after every optimizer step.
        """
        epoch_loss = 0

        self.model.train()

        for batch in iterator:
            self.optimizer.zero_grad()

            output = self.model(batch.text, labels=batch.target)[0]
            # Under DataParallel the loss comes back as one value per replica.
            loss = output.mean()
            loss.backward()
            self.optimizer.step()
            self._advance_schedule()

            epoch_loss += loss.item()
        return epoch_loss / len(iterator)
                                
def evaluate(model, iterator):
    """Return the mean loss of `model` over every batch in `iterator`.

    The model is switched to eval mode (and left there) and gradients are
    disabled. Each batch must expose ``.text`` and ``.target``; the model is
    called as ``model(text, labels=target)`` and its first output is taken as
    the loss, averaged because ``DataParallel`` returns one value per replica.

    Raises:
        ZeroDivisionError: if `iterator` has length zero.
    """
    epoch_loss = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            output = model(batch.text, labels=batch.target)[0]
            epoch_loss += output.mean().item()

    return epoch_loss / len(iterator)

In [8]:
# SGD with Nesterov momentum. `momentum` must be a float; the previous
# `momentum=True` only worked because True == 1.0. Both `lr` and `momentum`
# given here are placeholders: OneCyclePolicy overwrites them in its constructor
# with the first values of its schedule (momentum starts at 0.95 by default).
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.95, nesterov=True)
fit_one_cycle = OneCyclePolicy(model,
                               optimizer,
                               num_iteration=len(train_iter),
                               epochs=100,
                               pct_start=0.3,
                               max_lr=0.03,
                               div_factor=20)
fit_one_cycle.train_evaluate(train_iter, val_iter)


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

For Epoch [0|100] : 

Learning Rate is 0.0015 momentum is 0.95





Train Loss: 8.116

Val. Loss: 6.704


Epochs:   1%|          | 1/100 [03:29<5:45:18, 209.28s/it]

For Epoch [1|100] : 

Learning Rate is 0.00245 momentum is 0.9466666666666667

Train Loss: 6.947

Val. Loss: 6.244


Epochs:   2%|▏         | 2/100 [06:39<5:32:27, 203.55s/it]

For Epoch [2|100] : 

Learning Rate is 0.0034 momentum is 0.9433333333333332

Train Loss: 6.497

Val. Loss: 5.941


Epochs:   4%|▍         | 4/100 [12:59<5:14:43, 196.70s/it]

For Epoch [4|100] : 

Learning Rate is 0.005299999999999999 momentum is 0.9366666666666666

Train Loss: 6.383

Val. Loss: 5.838


Epochs:   5%|▌         | 5/100 [16:09<5:08:17, 194.71s/it]

For Epoch [5|100] : 

Learning Rate is 0.006249999999999999 momentum is 0.9333333333333333

Train Loss: 6.285

Val. Loss: 5.746


Epochs:   6%|▌         | 6/100 [19:19<5:02:52, 193.32s/it]

For Epoch [6|100] : 

Learning Rate is 0.0072 momentum is 0.9299999999999999

Train Loss: 6.197

Val. Loss: 5.665


Epochs:   7%|▋         | 7/100 [22:30<4:58:14, 192.41s/it]

For Epoch [7|100] : 

Learning Rate is 0.00815 momentum is 0.9266666666666666

Train Loss: 6.121

Val. Loss: 5.596


Epochs:   8%|▊         | 8/100 [25:40<4:54:01, 191.76s/it]

For Epoch [8|100] : 

Learning Rate is 0.009099999999999999 momentum is 0.9233333333333333

Train Loss: 6.051

Val. Loss: 5.534


Epochs:   9%|▉         | 9/100 [28:50<4:50:12, 191.35s/it]

For Epoch [9|100] : 

Learning Rate is 0.010049999999999998 momentum is 0.9199999999999999

Train Loss: 5.988

Val. Loss: 5.483


Epochs:  10%|█         | 10/100 [32:00<4:46:28, 190.98s/it]

For Epoch [10|100] : 

Learning Rate is 0.010999999999999998 momentum is 0.9166666666666666

Train Loss: 5.931

Val. Loss: 5.433


Epochs:  11%|█         | 11/100 [35:10<4:42:52, 190.70s/it]

For Epoch [11|100] : 

Learning Rate is 0.011949999999999999 momentum is 0.9133333333333333

Train Loss: 5.877

Val. Loss: 5.393


Epochs:  12%|█▏        | 12/100 [38:21<4:39:31, 190.59s/it]

For Epoch [12|100] : 

Learning Rate is 0.0129 momentum is 0.9099999999999999

Train Loss: 5.828

Val. Loss: 5.376


Epochs:  13%|█▎        | 13/100 [41:31<4:36:10, 190.46s/it]

For Epoch [13|100] : 

Learning Rate is 0.01385 momentum is 0.9066666666666666

Train Loss: 5.781

Val. Loss: 5.315


Epochs:  14%|█▍        | 14/100 [44:41<4:32:50, 190.36s/it]

For Epoch [14|100] : 

Learning Rate is 0.014799999999999999 momentum is 0.9033333333333333

Train Loss: 5.738

Val. Loss: 5.289


Epochs:  15%|█▌        | 15/100 [47:51<4:29:34, 190.29s/it]

For Epoch [15|100] : 

Learning Rate is 0.01575 momentum is 0.8999999999999999

Train Loss: 5.699

Val. Loss: 5.258


Epochs:  16%|█▌        | 16/100 [51:01<4:26:14, 190.17s/it]

For Epoch [16|100] : 

Learning Rate is 0.0167 momentum is 0.8966666666666666

Train Loss: 5.661

Val. Loss: 5.243


Epochs:  17%|█▋        | 17/100 [54:11<4:23:02, 190.15s/it]

For Epoch [17|100] : 

Learning Rate is 0.01765 momentum is 0.8933333333333333

Train Loss: 5.622

Val. Loss: 5.201


Epochs:  18%|█▊        | 18/100 [57:21<4:19:47, 190.09s/it]

For Epoch [18|100] : 

Learning Rate is 0.0186 momentum is 0.89

Train Loss: 5.588

Val. Loss: 5.163


Epochs:  19%|█▉        | 19/100 [1:00:31<4:16:27, 189.97s/it]

For Epoch [19|100] : 

Learning Rate is 0.019549999999999998 momentum is 0.8866666666666666

Train Loss: 5.556

Val. Loss: 5.148


Epochs:  20%|██        | 20/100 [1:03:40<4:13:09, 189.87s/it]

For Epoch [20|100] : 

Learning Rate is 0.020499999999999997 momentum is 0.8833333333333333

Train Loss: 5.523

Val. Loss: 5.122


Epochs:  21%|██        | 21/100 [1:06:50<4:09:57, 189.84s/it]

For Epoch [21|100] : 

Learning Rate is 0.021449999999999997 momentum is 0.88

Train Loss: 5.491

Val. Loss: 5.118


Epochs:  22%|██▏       | 22/100 [1:10:00<4:06:56, 189.95s/it]

For Epoch [22|100] : 

Learning Rate is 0.0224 momentum is 0.8766666666666667

Train Loss: 5.461

Val. Loss: 5.094


Epochs:  23%|██▎       | 23/100 [1:13:11<4:03:48, 189.98s/it]

For Epoch [23|100] : 

Learning Rate is 0.02335 momentum is 0.8733333333333333

Train Loss: 5.435

Val. Loss: 5.057


Epochs:  24%|██▍       | 24/100 [1:16:21<4:00:40, 190.01s/it]

For Epoch [24|100] : 

Learning Rate is 0.024300000000000002 momentum is 0.87

Train Loss: 5.402

Val. Loss: 5.035


Epochs:  25%|██▌       | 25/100 [1:19:31<3:57:35, 190.07s/it]

For Epoch [25|100] : 

Learning Rate is 0.02525 momentum is 0.8666666666666667

Train Loss: 5.378

Val. Loss: 5.032


Epochs:  26%|██▌       | 26/100 [1:22:41<3:54:26, 190.09s/it]

For Epoch [26|100] : 

Learning Rate is 0.0262 momentum is 0.8633333333333333

Train Loss: 5.349

Val. Loss: 5.001


Epochs:  27%|██▋       | 27/100 [1:25:51<3:51:22, 190.17s/it]

For Epoch [27|100] : 

Learning Rate is 0.02715 momentum is 0.86

Train Loss: 5.322

Val. Loss: 4.987


Epochs:  28%|██▊       | 28/100 [1:29:01<3:48:12, 190.17s/it]

For Epoch [28|100] : 

Learning Rate is 0.0281 momentum is 0.8566666666666667

Train Loss: 5.296

Val. Loss: 4.974


Epochs:  29%|██▉       | 29/100 [1:32:12<3:45:02, 190.17s/it]

For Epoch [29|100] : 

Learning Rate is 0.02905 momentum is 0.8533333333333333

Train Loss: 5.273

Val. Loss: 4.958


Epochs:  30%|███       | 30/100 [1:35:22<3:41:50, 190.15s/it]

For Epoch [30|100] : 

Learning Rate is 0.03 momentum is 0.85

Train Loss: 5.237

Val. Loss: 4.928


Epochs:  31%|███       | 31/100 [1:38:32<3:38:39, 190.14s/it]

For Epoch [31|100] : 

Learning Rate is 0.029984971518129122 momentum is 0.8500503466729342

Train Loss: 5.213

Val. Loss: 4.926


Epochs:  32%|███▏      | 32/100 [1:41:42<3:35:25, 190.08s/it]

For Epoch [32|100] : 

Learning Rate is 0.029939916337878944 momentum is 0.850201285300238

Train Loss: 5.184

Val. Loss: 4.888


Epochs:  33%|███▎      | 33/100 [1:44:52<3:32:11, 190.03s/it]

For Epoch [33|100] : 

Learning Rate is 0.029864925194386428 momentum is 0.8504525119116032

Train Loss: 5.157

Val. Loss: 4.865


Epochs:  34%|███▍      | 34/100 [1:48:02<3:29:01, 190.02s/it]

For Epoch [34|100] : 

Learning Rate is 0.029760149109834547 momentum is 0.8508035205700685

Train Loss: 5.132


Epochs:  35%|███▌      | 35/100 [1:51:09<3:25:03, 189.28s/it]


Val. Loss: 4.868
For Epoch [35|100] : 

Learning Rate is 0.029625799089313714 momentum is 0.8512536043909088

Train Loss: 5.107

Val. Loss: 4.840


Epochs:  36%|███▌      | 36/100 [1:54:19<3:22:05, 189.46s/it]

For Epoch [36|100] : 

Learning Rate is 0.029462145695885608 momentum is 0.8518018569652073

Train Loss: 5.084


Epochs:  37%|███▋      | 37/100 [1:57:27<3:18:18, 188.86s/it]


Val. Loss: 4.841
For Epoch [37|100] : 

Learning Rate is 0.029269518505705167 momentum is 0.8524471741852423

Train Loss: 5.057

Val. Loss: 4.817


Epochs:  38%|███▊      | 38/100 [2:00:37<3:15:30, 189.20s/it]

For Epoch [38|100] : 

Learning Rate is 0.029048305444298077 momentum is 0.8531882564680131

Train Loss: 5.033

Val. Loss: 4.800


Epochs:  39%|███▉      | 39/100 [2:03:47<3:12:40, 189.51s/it]

For Epoch [39|100] : 

Learning Rate is 0.028798952005330402 momentum is 0.8540236113724274

Train Loss: 5.011

Val. Loss: 4.774


Epochs:  40%|████      | 40/100 [2:06:57<3:09:42, 189.71s/it]

For Epoch [40|100] : 

Learning Rate is 0.028521960353443603 momentum is 0.854951556604879

Train Loss: 4.988

Val. Loss: 4.769


Epochs:  41%|████      | 41/100 [2:10:07<3:06:45, 189.93s/it]

For Epoch [41|100] : 

Learning Rate is 0.028217888312961813 momentum is 0.8559702234071631

Train Loss: 4.963

Val. Loss: 4.759


Epochs:  42%|████▏     | 42/100 [2:13:18<3:03:41, 190.02s/it]

For Epoch [42|100] : 

Learning Rate is 0.02788734824450785 momentum is 0.8570775603199067

Train Loss: 4.941

Val. Loss: 4.749


Epochs:  43%|████▎     | 43/100 [2:16:28<3:00:38, 190.15s/it]

For Epoch [43|100] : 

Learning Rate is 0.02753100581179044 momentum is 0.8582713373139348

Train Loss: 4.921

Val. Loss: 4.730


Epochs:  44%|████▍     | 44/100 [2:19:38<2:57:31, 190.20s/it]

For Epoch [44|100] : 

Learning Rate is 0.02714957864104609 momentum is 0.8595491502812526

Train Loss: 4.900

Val. Loss: 4.729


Epochs:  45%|████▌     | 45/100 [2:22:49<2:54:20, 190.19s/it]

For Epoch [45|100] : 

Learning Rate is 0.026743834875835346 momentum is 0.8609084258765984

Train Loss: 4.879

Val. Loss: 4.702


Epochs:  46%|████▌     | 46/100 [2:25:59<2:51:12, 190.23s/it]

For Epoch [46|100] : 

Learning Rate is 0.026314591630103894 momentum is 0.8623464266998194

Train Loss: 4.855

Val. Loss: 4.693


Epochs:  47%|████▋     | 47/100 [2:29:09<2:48:02, 190.23s/it]

For Epoch [47|100] : 

Learning Rate is 0.025862713342623817 momentum is 0.8638602568086304

Train Loss: 4.836

Val. Loss: 4.684


Epochs:  48%|████▊     | 48/100 [2:32:19<2:44:50, 190.20s/it]

For Epoch [48|100] : 

Learning Rate is 0.025389110036128957 momentum is 0.8654468675506567

Train Loss: 4.816

Val. Loss: 4.674


Epochs:  49%|████▉     | 49/100 [2:35:29<2:41:38, 190.17s/it]

For Epoch [49|100] : 

Learning Rate is 0.02489473548465021 momentum is 0.8671030637030144

Train Loss: 4.796

Val. Loss: 4.659


Epochs:  50%|█████     | 50/100 [2:38:39<2:38:26, 190.12s/it]

For Epoch [50|100] : 

Learning Rate is 0.024380585292741598 momentum is 0.8688255099070633

Train Loss: 4.777

Val. Loss: 4.652


Epochs:  51%|█████     | 51/100 [2:41:50<2:35:16, 190.13s/it]

For Epoch [51|100] : 

Learning Rate is 0.023847694890465163 momentum is 0.8706107373853763

Train Loss: 4.756

Val. Loss: 4.637


Epochs:  52%|█████▏    | 52/100 [2:45:00<2:32:03, 190.08s/it]

For Epoch [52|100] : 

Learning Rate is 0.02329713744817263 momentum is 0.8724551509273948

Train Loss: 4.737

Val. Loss: 4.628


Epochs:  53%|█████▎    | 53/100 [2:48:10<2:28:57, 190.16s/it]

For Epoch [53|100] : 

Learning Rate is 0.02273002171528315 momentum is 0.8743550361297047

Train Loss: 4.720

Val. Loss: 4.619


Epochs:  54%|█████▍    | 54/100 [2:51:20<2:25:47, 190.17s/it]

For Epoch [54|100] : 

Learning Rate is 0.022147489787409505 momentum is 0.87630656687635

Train Loss: 4.702

Val. Loss: 4.611


Epochs:  55%|█████▌    | 55/100 [2:54:30<2:22:35, 190.11s/it]

For Epoch [55|100] : 

Learning Rate is 0.021550714806329554 momentum is 0.878305813044122

Train Loss: 4.684

Val. Loss: 4.605


Epochs:  56%|█████▌    | 56/100 [2:57:40<2:19:23, 190.08s/it]

For Epoch [56|100] : 

Learning Rate is 0.02094089859743481 momentum is 0.8803487484173038

Train Loss: 4.667

Val. Loss: 4.597


Epochs:  57%|█████▋    | 57/100 [3:00:50<2:16:15, 190.13s/it]

For Epoch [57|100] : 

Learning Rate is 0.020319269249414042 momentum is 0.8824312587959329

Train Loss: 4.649

Val. Loss: 4.583


Epochs:  58%|█████▊    | 58/100 [3:04:01<2:13:09, 190.22s/it]

For Epoch [58|100] : 

Learning Rate is 0.01968707864104609 momentum is 0.8845491502812526

Train Loss: 4.616

Val. Loss: 4.572


Epochs:  60%|██████    | 60/100 [3:10:21<2:06:46, 190.16s/it]

For Epoch [60|100] : 

Learning Rate is 0.01839612493929799 momentum is 0.8888739533021842

Train Loss: 4.600

Val. Loss: 4.567


Epochs:  61%|██████    | 61/100 [3:13:31<2:03:35, 190.13s/it]

For Epoch [61|100] : 

Learning Rate is 0.017739961654869654 momentum is 0.8910721552600681

Train Loss: 4.585

Val. Loss: 4.559


Epochs:  62%|██████▏   | 62/100 [3:16:41<2:00:25, 190.14s/it]

For Epoch [62|100] : 

Learning Rate is 0.01707843149232851 momentum is 0.8932883367091172

Train Loss: 4.569

Val. Loss: 4.553


Epochs:  63%|██████▎   | 63/100 [3:19:51<1:57:15, 190.14s/it]

For Epoch [63|100] : 

Learning Rate is 0.016412866685383744 momentum is 0.8955180345548283

Train Loss: 4.555

Val. Loss: 4.546


Epochs:  64%|██████▍   | 64/100 [3:23:01<1:54:01, 190.05s/it]

For Epoch [64|100] : 

Learning Rate is 0.015744607592981436 momentum is 0.8977567584824742

Train Loss: 4.540

Val. Loss: 4.540


Epochs:  65%|██████▌   | 65/100 [3:26:11<1:50:52, 190.07s/it]

For Epoch [65|100] : 

Learning Rate is 0.015075 momentum is 0.8999999999999999

Train Loss: 4.527

Val. Loss: 4.535


Epochs:  66%|██████▌   | 66/100 [3:29:21<1:47:41, 190.05s/it]

For Epoch [66|100] : 

Learning Rate is 0.01440539240701857 momentum is 0.9022432415175257

Train Loss: 4.513

Val. Loss: 4.530


Epochs:  67%|██████▋   | 67/100 [3:32:31<1:44:27, 189.94s/it]

For Epoch [67|100] : 

Learning Rate is 0.013737133314616257 momentum is 0.9044819654451717

Train Loss: 4.499

Val. Loss: 4.525


Epochs:  68%|██████▊   | 68/100 [3:35:41<1:41:19, 190.00s/it]

For Epoch [68|100] : 

Learning Rate is 0.013071568507671497 momentum is 0.9067116632908827

Train Loss: 4.487

Val. Loss: 4.521


Epochs:  69%|██████▉   | 69/100 [3:38:51<1:38:09, 189.99s/it]

For Epoch [69|100] : 

Learning Rate is 0.01241003834513035 momentum is 0.9089278447399318

Train Loss: 4.474

Val. Loss: 4.516


Epochs:  70%|███████   | 70/100 [3:42:01<1:35:00, 190.01s/it]

For Epoch [70|100] : 

Learning Rate is 0.011753875060702008 momentum is 0.9111260466978157

Train Loss: 4.463

Val. Loss: 4.510


Epochs:  71%|███████   | 71/100 [3:45:11<1:31:50, 190.02s/it]

For Epoch [71|100] : 

Learning Rate is 0.011104400079917375 momentum is 0.9133018422783337

Train Loss: 4.451

Val. Loss: 4.507


Epochs:  72%|███████▏  | 72/100 [3:48:21<1:28:40, 190.01s/it]

For Epoch [72|100] : 

Learning Rate is 0.010462921358953912 momentum is 0.9154508497187474

Train Loss: 4.440

Val. Loss: 4.502


Epochs:  73%|███████▎  | 73/100 [3:51:31<1:25:31, 190.05s/it]

For Epoch [73|100] : 

Learning Rate is 0.009830730750585959 momentum is 0.9175687412040671

Train Loss: 4.429

Val. Loss: 4.499


Epochs:  74%|███████▍  | 74/100 [3:54:41<1:22:20, 190.01s/it]

For Epoch [74|100] : 

Learning Rate is 0.009209101402565192 momentum is 0.9196512515826961

Train Loss: 4.418

Val. Loss: 4.495


Epochs:  75%|███████▌  | 75/100 [3:57:51<1:19:08, 189.96s/it]

For Epoch [75|100] : 

Learning Rate is 0.008599285193670446 momentum is 0.9216941869558779

Train Loss: 4.408

Val. Loss: 4.492


Epochs:  76%|███████▌  | 76/100 [4:01:01<1:16:00, 190.01s/it]

For Epoch [76|100] : 

Learning Rate is 0.008002510212590496 momentum is 0.9236934331236499

Train Loss: 4.399

Val. Loss: 4.488


Epochs:  77%|███████▋  | 77/100 [4:04:11<1:12:49, 189.97s/it]

For Epoch [77|100] : 

Learning Rate is 0.007419978284716851 momentum is 0.9256449638702953

Train Loss: 4.389

Val. Loss: 4.486


Epochs:  78%|███████▊  | 78/100 [4:07:21<1:09:40, 190.00s/it]

For Epoch [78|100] : 

Learning Rate is 0.006852862551827371 momentum is 0.927544849072605

Train Loss: 4.381

Val. Loss: 4.484


Epochs:  79%|███████▉  | 79/100 [4:10:31<1:06:29, 189.98s/it]

For Epoch [79|100] : 

Learning Rate is 0.00630230510953484 momentum is 0.9293892626146236

Train Loss: 4.372

Val. Loss: 4.479


Epochs:  80%|████████  | 80/100 [4:13:41<1:03:20, 190.00s/it]

For Epoch [80|100] : 

Learning Rate is 0.0057694147072584025 momentum is 0.9311744900929366

Train Loss: 4.365

Val. Loss: 4.478


Epochs:  81%|████████  | 81/100 [4:16:51<1:00:10, 190.02s/it]

For Epoch [81|100] : 

Learning Rate is 0.005255264515349789 momentum is 0.9328969362969856

Train Loss: 4.357

Val. Loss: 4.476


Epochs:  82%|████████▏ | 82/100 [4:20:01<57:00, 190.03s/it]  

For Epoch [82|100] : 

Learning Rate is 0.0047608899638710445 momentum is 0.9345531324493432

Train Loss: 4.350

Val. Loss: 4.473


Epochs:  83%|████████▎ | 83/100 [4:23:11<53:51, 190.06s/it]

For Epoch [83|100] : 

Learning Rate is 0.0042872866573761825 momentum is 0.9361397431913695

Train Loss: 4.344

Val. Loss: 4.473


Epochs:  84%|████████▍ | 84/100 [4:26:21<50:40, 190.05s/it]

For Epoch [84|100] : 

Learning Rate is 0.003835408369896108 momentum is 0.9376535733001805

Train Loss: 4.338

Val. Loss: 4.470


Epochs:  85%|████████▌ | 85/100 [4:29:32<47:31, 190.13s/it]

For Epoch [85|100] : 

Learning Rate is 0.0034061651241646547 momentum is 0.9390915741234015

Train Loss: 4.332

Val. Loss: 4.469


Epochs:  86%|████████▌ | 86/100 [4:32:42<44:21, 190.13s/it]

For Epoch [86|100] : 

Learning Rate is 0.0030004213589539105 momentum is 0.9404508497187474

Train Loss: 4.327

Val. Loss: 4.468


Epochs:  87%|████████▋ | 87/100 [4:35:52<41:11, 190.14s/it]

For Epoch [87|100] : 

Learning Rate is 0.0026189941882095577 momentum is 0.9417286626860651

Train Loss: 4.322

Val. Loss: 4.467


Epochs:  88%|████████▊ | 88/100 [4:39:02<38:01, 190.14s/it]

For Epoch [88|100] : 

Learning Rate is 0.002262651755492148 momentum is 0.9429224396800933

Train Loss: 4.319

Val. Loss: 4.465


Epochs:  89%|████████▉ | 89/100 [4:42:12<34:51, 190.16s/it]

For Epoch [89|100] : 

Learning Rate is 0.0019321116870381853 momentum is 0.9440297765928368

Train Loss: 4.315

Val. Loss: 4.463


Epochs:  90%|█████████ | 90/100 [4:45:23<31:41, 190.17s/it]

For Epoch [90|100] : 

Learning Rate is 0.0016280396465563958 momentum is 0.945048443395121

Train Loss: 4.312

Val. Loss: 4.462


Epochs:  91%|█████████ | 91/100 [4:48:33<28:31, 190.14s/it]

For Epoch [91|100] : 

Learning Rate is 0.0013510479946696001 momentum is 0.9459763886275725

Train Loss: 4.309


Epochs:  92%|█████████▏| 92/100 [4:51:40<25:14, 189.31s/it]


Val. Loss: 4.462
For Epoch [92|100] : 

Learning Rate is 0.0011016945557019223 momentum is 0.9468117435319868

Train Loss: 4.307


Epochs:  93%|█████████▎| 93/100 [4:54:47<22:00, 188.71s/it]


Val. Loss: 4.462
For Epoch [93|100] : 

Learning Rate is 0.0008804814942948336 momentum is 0.9475528258147576

Train Loss: 4.305

Val. Loss: 4.462


Epochs:  94%|█████████▍| 94/100 [4:57:57<18:54, 189.09s/it]

For Epoch [94|100] : 

Learning Rate is 0.0006878543041143905 momentum is 0.9481981430347927

Train Loss: 4.303

Val. Loss: 4.462


Epochs:  95%|█████████▌| 95/100 [5:01:07<15:47, 189.40s/it]

For Epoch [95|100] : 

Learning Rate is 0.0005242009106862825 momentum is 0.9487463956090911

Train Loss: 4.302

Val. Loss: 4.462


Epochs:  96%|█████████▌| 96/100 [5:04:17<12:38, 189.54s/it]

For Epoch [96|100] : 

Learning Rate is 0.0003898508901654522 momentum is 0.9491964794299315

Train Loss: 4.300


Epochs:  97%|█████████▋| 97/100 [5:07:24<09:26, 188.83s/it]


Val. Loss: 4.462
For Epoch [97|100] : 

Learning Rate is 0.0002850748056135746 momentum is 0.9495474880883967

Train Loss: 4.300


Epochs:  98%|█████████▊| 98/100 [5:10:32<06:16, 188.34s/it]


Val. Loss: 4.462
For Epoch [98|100] : 

Learning Rate is 0.00021008366212105692 momentum is 0.949798714699762

Train Loss: 4.299


Epochs:  99%|█████████▉| 99/100 [5:13:39<03:07, 187.95s/it]


Val. Loss: 4.462
For Epoch [99|100] : 

Learning Rate is 0.00016502848187087977 momentum is 0.9499496533270657

Train Loss: 4.299

Val. Loss: 4.461


Epochs: 100%|██████████| 100/100 [5:16:49<00:00, 188.53s/it]


In [None]:
# model.module.load_state_dict(torch.load("hindi_openai_lm_saved_model_weights.pt"))

In [30]:
# NOTE(review): `spacy` is already imported in the first cell; this repeat is harmless.
import spacy
# English spaCy pipeline; only its tokenizer is used, to split prompts in `greedy_decode`.
# NOTE(review): presumably the same tokenization as `data.Field(tokenize="spacy")` — confirm.
nlp =spacy.load("en")

In [31]:
def greedy_decode(text,n=20):
    """Extend `text` by `n` tokens, always picking the most likely next token.

    Relies on the notebook globals `TEXT` (vocabulary), `nlp` (spaCy tokenizer),
    `model` and `device`.

    Args:
        text: prompt string; it is tokenized with `nlp.tokenizer` and each token is
            looked up in `TEXT.vocab.stoi` (out-of-vocabulary tokens presumably map
            to the unknown-token index — TODO confirm).
        n: number of tokens to generate.

    Returns:
        The prompt tokens followed by the generated tokens, joined by spaces.

    Raises:
        ValueError: if the prompt contains no tokens.
    """
    token_ids = [TEXT.vocab.stoi[tok.text] for tok in nlp.tokenizer(text)]
    if not token_ids:
        raise ValueError("greedy_decode needs a prompt with at least one token")
    word_idx = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Decode with the bare module (no multi-GPU scatter for a single sequence).
    lm = getattr(model, "module", model)
    was_training = lm.training
    lm.eval()  # dropout must be off for deterministic greedy decoding
    try:
        with torch.no_grad():
            for _ in range(n):
                # Logits at the last position only: shape (batch, vocab).
                next_logits = lm(word_idx)[0][:, -1]
                next_id = torch.argmax(next_logits, dim=1, keepdim=True)
                word_idx = torch.cat([word_idx, next_id], dim=1)
    finally:
        # Leave the model in whatever mode the caller had it in.
        lm.train(was_training)

    return " ".join(TEXT.vocab.itos[i] for i in word_idx.view(-1).tolist())


In [53]:
greedy_decode("""His greatest ambition was to serve his country """)

'His greatest ambition was to serve his country in the United States . He was a member of the University of London , and was the first of'