In [25]:
import sentencepiece as spm
import torchtext
import numpy  as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import trange
from torch import autograd
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"torch version is {torch.__version__} device is {device}")

torch version is 1.2.0 device is cuda


# Training a SentencePiece Tokenizer for Hindi

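Train your SentencePiece model on the Hindi corpus and load it into a `SentencePieceProcessor` named `sp` (for example, create it with `spm.SentencePieceProcessor()` and load your trained `.model` file into it). The cells below rely on `sp` for tokenizing the text and for decoding generated pieces, so it must be defined before they are run.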

In [4]:
# Read the raw Hindi corpus into a list with one entry per line (trailing
# newlines are kept by readlines()).
# The encoding is pinned to UTF-8 so the Devanagari text is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open("data/hindi_updated.txt", encoding="utf-8") as f:
    data = f.readlines()


# Preparing data 

In [5]:
# Hold out the last `val_fraction` of the lines for validation and keep the
# rest for training; the split is positional (no shuffling).
val_fraction = 0.2
n_val = int(val_fraction * len(data))
# Slice from an explicit start index so that n_val == 0 yields an empty list.
val_data = data[len(data) - n_val:]
train_data = data[:len(data) - len(val_data)]

# Write both splits explicitly as UTF-8: torchtext's LanguageModelingDataset
# reads files as UTF-8 by default, and relying on the locale's default encoding
# here can fail or corrupt the Devanagari text on non-UTF-8 platforms.
with open("data/hindi_train.txt", "w", encoding="utf-8") as t, \
        open("data/hindi_val.txt", "w", encoding="utf-8") as v:
    t.writelines(train_data)
    v.writelines(val_data)

# Creating Dataset and Data iterator 

In [6]:
# Field that tokenizes raw text into SentencePiece pieces and produces
# batch-first tensors. `sp` must already be a loaded SentencePiece processor.
TEXT = torchtext.data.Field(
    tokenize=sp.EncodeAsPieces,
    batch_first=True,
)

# Treat each split file as one continuous token stream; no <eos> token is
# appended at line breaks (newline_eos=False).
train_set = torchtext.datasets.LanguageModelingDataset(
    "data/hindi_train.txt",
    TEXT,
    newline_eos=False,
)
val_set = torchtext.datasets.LanguageModelingDataset(
    "data/hindi_val.txt",
    TEXT,
    newline_eos=False,
)

In [7]:
# Build the piece <-> index vocabulary from the pieces seen in both splits.
vocab_sources = (train_set, val_set)
TEXT.build_vocab(*vocab_sources)

In [8]:
# Number of parallel token streams per batch, and how many time steps each
# batch covers for truncated back-propagation through time.
batch_size = 128
bptt_len = 100

train_iter = torchtext.data.BPTTIterator(
    train_set,
    batch_size,
    bptt_len,
    train=True,
    device=device,
)
val_iter = torchtext.data.BPTTIterator(
    val_set,
    batch_size,
    bptt_len,
    train=False,
    device=device,
)

In [9]:
# Peek at one training batch to check the shapes of .text and .target.
first_batch = next(iter(train_iter))
first_batch


[torchtext.data.batch.Batch of size 128]
	[.text]:[torch.cuda.LongTensor of size 128x100 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 128x100 (GPU 0)]

# Defining Simple Model
![](https://i.stack.imgur.com/TtfMs.jpg)

In [21]:
class HindiLSTM(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        input_dim: Vocabulary size; used both as the number of embeddings and as
            the size of the output layer (one logit per vocabulary entry).
        emb_dim: Dimensionality of the token embeddings.
        n_hidden: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, applied between LSTM layers and on the
            LSTM output before the final linear layer.
        lr: Unused by the module; kept only so existing callers that pass it
            keep working.
    """

    def __init__(self, input_dim, emb_dim, n_hidden=256, n_layers=2, drop_prob=0.25, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden

        # Embedding layer: token index -> dense vector.
        self.embed = nn.Embedding(input_dim, emb_dim)
        # Stacked LSTM working on batch-first input.
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        # Projection from the hidden state to vocabulary logits.
        self.fc = nn.Linear(n_hidden, input_dim)

    def forward(self, x, hidden=None):
        """Run one forward pass.

        Args:
            x: Tensor of token indices; the LSTM is batch-first, so the leading
                dimension is the batch.
            hidden: Tuple ``(h, c)`` of LSTM hidden and cell states, e.g. from
                ``init_hidden``. ``None`` lets the LSTM start from zero states.

        Returns:
            ``(out, hidden)`` where ``out`` holds the logits with all leading
            dimensions flattened into one (last dimension is the vocabulary
            size) and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        embedded = self.embed(x)
        r_output, hidden = self.lstm(embedded, hidden)

        out = self.dropout(r_output)

        # Flatten every time step of every sequence into one row so the linear
        # layer (and a flat cross-entropy target) can consume it directly.
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Create zeroed LSTM hidden and cell states.

        The tensors are allocated with the same dtype and on the same device as
        the model's own parameters, so the result always matches the model no
        matter which device it has been moved to.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple ``(h, c)`` of zero tensors, each of shape
            ``(n_layers, batch_size, n_hidden)``.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))


In [22]:
# One output logit per vocabulary entry, 256-dim embeddings, 300 hidden units.
vocab_size = len(TEXT.vocab)
model = HindiLSTM(vocab_size, 256, n_hidden=300)
model = model.to(device)
model

HindiLSTM(
  (embed): Embedding(8061, 256)
  (lstm): LSTM(256, 300, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=300, out_features=8061, bias=True)
)

# Hyperparameters

In [30]:
# Training schedule.
epochs = 20
lr = 0.002
# Maximum gradient norm used for clipping during training.
clip = 5
# Divisor applied to the logits when sampling text.
temperature = 1.0

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training 

In [26]:
# Autograd anomaly detection makes every backward pass noticeably slower, so it
# is off for normal runs. Set this to True only while debugging NaN/inf
# gradients.
detect_anomaly = False

for epoch in trange(epochs):
    train_losses = []
    val_losses = []

    # ---- training pass ----
    model.train()
    h = model.init_hidden(batch_size)
    with autograd.set_detect_anomaly(detect_anomaly):
        for tr in train_iter:
            inputs, targets = tr.text, tr.target
            # Truncated BPTT: carry the state values over to the next batch
            # but cut the graph so gradients do not flow into earlier batches.
            h = tuple(state.detach() for state in h)
            optimizer.zero_grad()
            output, h = model(inputs, h)
            # The model returns flattened logits, so flatten the targets too.
            loss = criterion(output, targets.view(-1).long())
            loss.backward()
            # Clip the gradient norm to keep LSTM updates from exploding.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            train_losses.append(loss.item())

    # ---- validation pass (no gradients, dropout disabled) ----
    with torch.no_grad():
        model.eval()
        val_h = model.init_hidden(batch_size)
        for vl in val_iter:
            inputs, targets = vl.text, vl.target
            val_h = tuple(state.detach() for state in val_h)
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output, targets.view(-1).long())
            val_losses.append(val_loss.item())

    # Report the mean of the per-batch losses for this epoch.
    print(f"Train loss is {np.mean(train_losses)} val loss is {np.mean(val_losses)}")

  5%|▌         | 1/20 [00:24<07:45, 24.52s/it]

Train loss is 7.357830519425241 val loss is 7.1380105257034305


 10%|█         | 2/20 [00:49<07:21, 24.52s/it]

Train loss is 6.8942562203658255 val loss is 6.672746467590332


 15%|█▌        | 3/20 [01:13<06:57, 24.53s/it]

Train loss is 6.484894024698358 val loss is 6.375603985786438


 20%|██        | 4/20 [01:37<06:31, 24.49s/it]

Train loss is 6.2033439234683385 val loss is 6.206889057159424


 25%|██▌       | 5/20 [02:02<06:06, 24.44s/it]

Train loss is 5.997840399491159 val loss is 6.091543221473694


 30%|███       | 6/20 [02:26<05:42, 24.44s/it]

Train loss is 5.828500762738679 val loss is 5.986026501655578


 35%|███▌      | 7/20 [02:51<05:17, 24.42s/it]

Train loss is 5.707831683911775 val loss is 5.929713726043701


 40%|████      | 8/20 [03:15<04:52, 24.40s/it]

Train loss is 5.61639596035606 val loss is 5.873499655723572


 45%|████▌     | 9/20 [03:39<04:28, 24.40s/it]

Train loss is 5.538553498920641 val loss is 5.838024306297302


 50%|█████     | 10/20 [04:04<04:04, 24.40s/it]

Train loss is 5.473059727016248 val loss is 5.8063994407653805


 55%|█████▌    | 11/20 [04:28<03:39, 24.38s/it]

Train loss is 5.413133847086053 val loss is 5.784366488456726


 60%|██████    | 12/20 [04:53<03:15, 24.39s/it]

Train loss is 5.3569102588452795 val loss is 5.755792593955993


 65%|██████▌   | 13/20 [05:17<02:50, 24.38s/it]

Train loss is 5.3037426471710205 val loss is 5.729305171966553


 70%|███████   | 14/20 [05:41<02:26, 24.37s/it]

Train loss is 5.2547503120020815 val loss is 5.7168717861175535


 75%|███████▌  | 15/20 [06:06<02:01, 24.39s/it]

Train loss is 5.207925412529393 val loss is 5.706015944480896


 80%|████████  | 16/20 [06:30<01:37, 24.39s/it]

Train loss is 5.165990161895752 val loss is 5.688183736801148


 85%|████████▌ | 17/20 [06:54<01:13, 24.37s/it]

Train loss is 5.124493847395245 val loss is 5.6801965713500975


 90%|█████████ | 18/20 [07:19<00:48, 24.40s/it]

Train loss is 5.084050517333181 val loss is 5.666971516609192


 95%|█████████▌| 19/20 [07:43<00:24, 24.38s/it]

Train loss is 5.051133073003668 val loss is 5.6619257688522335


100%|██████████| 20/20 [08:08<00:00, 24.36s/it]

Train loss is 5.017613338169299 val loss is 5.659657549858093





# Sample Prediction

In [34]:
def predict(net, char, h=None):
    """Feed one piece through the network and sample the next piece.

    Args:
        net: Model called as ``net(inputs, hidden)`` and returning
            ``(logits, hidden)``; it must also provide ``init_hidden``.
        char: The current piece (token string); it is looked up in
            ``TEXT.vocab.stoi``.
        h: Tuple of hidden-state tensors from the previous step, or ``None`` to
            start from a fresh zero state for a batch of one.

    Returns:
        ``(next_piece, h)``: the sampled piece as a string and the updated
        hidden state.
    """
    # A 1x1 batch holding the vocabulary index of the current piece.
    x = torch.tensor([[TEXT.vocab.stoi[char]]])
    inputs = x.to(device)

    # The signature allows h=None, so create a fresh state in that case.
    if h is None:
        h = net.init_hidden(1)
    # Detach the hidden state from its history.
    h = tuple(each.detach() for each in h)

    out, h = net(inputs, h)

    # Softmax of the temperature-scaled logits. This is the same distribution
    # as normalising exp(logits / temperature), but it cannot overflow to inf
    # for large logits, which would make torch.multinomial fail.
    p = F.softmax(out.squeeze().div(temperature), dim=-1).cpu()

    # Draw one vocabulary index and convert it to a plain int for the lookup.
    next_index = torch.multinomial(p, 1)[0].item()
    return TEXT.vocab.itos[next_index], h

In [37]:
def sample(net, size, prime='पुलिस का कहना है'):
    """Generate text by sampling from the model, starting from a prompt.

    The prompt is tokenized with ``sp`` and fed piece by piece to warm up the
    hidden state. The prediction made after the last prompt piece becomes the
    first generated piece, and ``size`` further pieces are then sampled one at
    a time, each conditioned on the previous one.

    Args:
        net: Model to sample from; it is switched to eval mode.
        size: Number of additional pieces to generate after the first one.
        prime: Prompt text that seeds the generation.

    Returns:
        The prompt followed by the generated text, decoded to a string.

    Raises:
        ValueError: If ``prime`` tokenizes to no pieces, because there is then
            nothing to condition the first prediction on.
    """
    prime_pieces = sp.EncodeAsPieces(prime)
    if not prime_pieces:
        raise ValueError("prime must contain at least one token to start sampling")

    with torch.no_grad():
        net.eval()  # eval mode
        chars = []
        h = net.init_hidden(1)

        # Run the prompt through the network; only the prediction that follows
        # the final prompt piece is kept.
        for ch in prime_pieces:
            chars.append(ch)
            char, h = predict(net, ch, h)
        chars.append(char)

        # Now pass in the previous piece and get a new one.
        for _ in range(size):
            char, h = predict(net, chars[-1], h)
            chars.append(char)

        return sp.decode_pieces(chars)

In [39]:
# Generate 200 more pieces after the prompt with the trained model.
sample(model, 200, prime="हमलावर नजदीकी")

"हमलावर नजदीकी समस्या जल्दी जाने 'अनजाने सुरक्षा विभाग में लगे हमलों का एक और्दा बताया गया है। बीबीसी के पूर्व मोंग्राअस्पताल में 2008 में अमरीकी विदेशमंत्री मिश्रण ने करीब जो दूरी हासिल कर दिया है। इन तीनों मीबैन (एनआईएलडो) के निदेशक हमें भ्रष्टाचार की संख्या तेजी से कोई कम बम हैं। इसदौरान आधिकारिन और इटहों में 1948किलोमीटर-मन में कुल भर जाएगा जब प्रोजेक्ट के परिसर में लिस्टेट वालीसॉटफों के तहत विफल करवाबॉ सके प्रकटकर दी जाएगी। पर अभीतक मोतियाबिंद का कहना है कि बेगओन ने शुक्रवार को 753 रनों में ऑस्ट्रेलिया एक इमारत बढ़ाने का अमेरिकी अपबोज कर लिया। हवाईअड्डे में श्रीलंका की टीम ने राज को सार्वजनिक करने और 1986 को बैठे बल्लेबाजियों पर रूस में पहले प्रदर्शन हासिल करने के बाद इसे बचाते हैं। उन्होंने बताया कि मेक्सिको के साथ बॉल बनाए हुए खंडड़ा सहित श्रीलंका का हिस्सा तिमाहियों से गिरफ्तार किया जाना होगा।"