In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torch import nn,tensor
from torch.utils.data import Dataset,DataLoader
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
import re
import math
import spacy
import io

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")
def removealphanumeric(string):
    """Replace every character that is not a letter or a digit with a space.

    Despite its name, the function keeps the alphanumeric characters and
    blanks out everything else (punctuation, underscores, tabs, ...). Each
    replaced character becomes exactly one space, so the length of the text
    is preserved.

    Accented letters such as the French "é" or "ç" are letters and are kept.
    The previous ASCII-only character class turned them into spaces, which
    split French words into fragments before tokenisation.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text.
    """
    # For str patterns `\W` is Unicode-aware; `_` counts as a word character,
    # so it is listed explicitly to keep replacing it as before.
    return re.sub(r'[\W_]', ' ', string)

In [None]:
def txttolist(path):
    """Read a text file into a list of cleaned sentences, one per non-blank line.

    Each non-blank line has its trailing newline removed and is passed through
    ``removealphanumeric``. Blank (whitespace-only) lines are dropped.

    Args:
        path: Path of the text file to read.

    Returns:
        List of cleaned sentences in file order.
    """
    listofsentence=[]
    # Decode explicitly as UTF-8 so accented characters are read the same way
    # regardless of the platform's default locale.
    with io.open(path,encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # NOTE(review): blank lines are dropped per file, so a blank line
                # present in only one side of a parallel corpus shifts the pairing.
                continue
            # Only strip the newline; the earlier `splitlines()[0]` also cut the
            # sentence at any other Unicode line-break character inside the line.
            listofsentence.append(removealphanumeric(line.rstrip('\n')))
    return listofsentence

In [None]:
# English side of the training split, one cleaned sentence per entry.
strr_english_train = txttolist(path='/kaggle/input/translation-text/train.en')

In [None]:
# English side of the test split, one cleaned sentence per entry.
strr_english_test = txttolist(path='/kaggle/input/translation-text/test.en')

In [None]:
# Remaining splits: English dev, then the three French files.
strr_english_dev = txttolist(path='/kaggle/input/translation-text/dev.en')
strr_french_train = txttolist(path='/kaggle/input/translation-text/train.fr')
strr_french_test = txttolist(path='/kaggle/input/translation-text/test.fr')
strr_french_dev = txttolist(path='/kaggle/input/translation-text/dev.fr')
# Sentence counts for the English dev / test / train splits.
tuple(map(len, (strr_english_dev, strr_english_test, strr_english_train)))

In [None]:
# Collect every English word as its own one-token list; the vocabulary
# builder consumes an iterable of token lists.
tok_en = []
for sentence in tqdm(strr_english_train + strr_english_dev + strr_english_test):
    sentence = sentence.strip()
    # Skip blank sentences and lines that begin with '='.
    if not sentence or sentence.startswith('='):
        continue
    tok_en.extend([word] for word in sentence.split(' '))

In [None]:
# English vocabulary: words seen at least 3 times, plus the four special tokens
# (which receive indices 0..3 in the order listed).
vocab_english=build_vocab_from_iterator(tok_en,min_freq=3,specials=['<EOS>','<SOS>','<UNK>','<PAD>'])
# Out-of-vocabulary words must map to <UNK>. The default index used to be
# <SOS>, which encoded every rare word as a start-of-sequence marker.
vocab_english.set_default_index(vocab_english['<UNK>'])

# Same procedure for French: one single-token list per word.
tok_fr=[]
for x in tqdm(strr_french_train+strr_french_dev+strr_french_test):
    x=x.strip()
    # Skip blank sentences and lines that begin with '='.
    if len(x)==0 or x[0]=='=':continue
    tok_fr+=[[w] for w in x.split(' ')]
vocab_french=build_vocab_from_iterator(tok_fr,min_freq=3,specials=['<EOS>','<SOS>','<UNK>','<PAD>'])
vocab_french.set_default_index(vocab_french['<UNK>'])
# NOTE(review): both vocabularies are built from train+dev+test, so test-set
# words influence the vocabulary — confirm this is intended.

In [None]:
# Length (in space-separated pieces) of every English sentence across all splits.
mxlen_eng = [
    len(sentence.split(' '))
    for sentence in tqdm(strr_english_train + strr_english_dev + strr_english_test)
]
# Longest English sentence.
max(mxlen_eng)

In [None]:
# The 15 largest English sentence lengths, longest first.
sorted(mxlen_eng)[::-1][:15]

In [None]:
# Length (in space-separated pieces) of every French sentence across all splits.
mxlen_french = [
    len(sentence.split(' '))
    for sentence in tqdm(strr_french_train + strr_french_dev + strr_french_test)
]

In [None]:
# The 15 largest French sentence lengths, longest first.
sorted(mxlen_french)[::-1][:15]

In [None]:
class Datafetch(Dataset):
    """Parallel English/French sentences encoded as fixed-length index tensors.

    Every sentence becomes ``<SOS> w1 ... wk <EOS> <PAD> ... <PAD>`` with
    exactly ``max_tokens + 2`` entries; sentences longer than ``max_tokens``
    words are truncated. Words are looked up in the module-level
    ``vocab_english`` / ``vocab_french`` vocabularies.

    Changes from the earlier version:
      * A pair is kept or dropped as a whole. Filtering each language on its
        own could drop a sentence on one side only and shift every later pair.
      * ``<EOS>`` follows the last word directly; it used to sit after the
        padding, so it never marked where a sentence ends.
      * Tensors stay on the CPU (the old ``.to(device)`` calls discarded their
        result); callers move each batch to the device.

    Args:
        split_eng: English sentences.
        split_fr: French sentences, aligned index-by-index with ``split_eng``.
        max_tokens: Maximum number of words kept per sentence (default 210).
    """
    def __init__(self,split_eng,split_fr,max_tokens=210):
        # NOTE(review): zip stops at the shorter list; confirm both sides have
        # the same number of sentences upstream.
        pairs=[(eng,fr) for eng,fr in zip(split_eng,split_fr)
               if self._usable(eng) and self._usable(fr)]
        data=[self._encode(eng,vocab_english,max_tokens)
              for eng,_ in tqdm(pairs,desc='English  Sentences are as follows')]
        labels=[self._encode(fr,vocab_french,max_tokens)
                for _,fr in tqdm(pairs,desc=' French Sentences are as follows')]
        self.data=tensor(data)
        self.labels=tensor(labels)

    @staticmethod
    def _usable(sentence):
        """Return True unless the sentence is shorter than 2 characters or starts with '='."""
        return len(sentence)>=2 and sentence[0]!='='

    @staticmethod
    def _encode(sentence,vocab,max_tokens):
        """Turn one sentence into a list of ``max_tokens + 2`` vocabulary indices."""
        indices=[vocab[word] for word in sentence.strip().split(' ')][:max_tokens]
        padding=(max_tokens-len(indices))*[vocab['<PAD>']]
        return [vocab['<SOS>']]+indices+[vocab['<EOS>']]+padding

    def __len__(self)->int:
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self,index:int):
        """Return the (English indices, French indices) tensors for one pair."""
        return self.data[index],self.labels[index]
        
        

In [None]:
# Encode the training sentence pairs into index tensors.
train_dataset = Datafetch(split_eng=strr_english_train, split_fr=strr_french_train)

In [None]:
# Shuffled mini-batches of 128 training pairs.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

In [None]:
# Validation pairs, served in shuffled mini-batches of 128.
val_dataset = Datafetch(split_eng=strr_english_dev, split_fr=strr_french_dev)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=True)

In [None]:
# Test pairs, served one sentence at a time (shuffled).
test_dataset = Datafetch(split_eng=strr_english_test, split_fr=strr_french_test)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=True)

In [None]:
## Every encoded sequence is 212 tokens long: 210 word slots plus <SOS> and <EOS>

In [None]:
class PE(nn.Module):
    """Sinusoidal positional encoding followed by dropout.

    The table follows the standard transformer formulation:
    ``PE[pos, 2i] = sin(pos / 10000**(2i/dim))`` and
    ``PE[pos, 2i+1] = cos(pos / 10000**(2i/dim))``.

    The earlier loop multiplied the already-even column index by 2 again
    (exponent 4i/dim), which compressed the frequency range, and it indexed
    past the last column when ``dim`` was odd.

    Args:
        dim: Embedding size.
        max_len_inp: Largest sequence length the table covers.
        dropout: Dropout probability applied after adding the encoding
            (default 0.5, the value previously hard-coded).
    """
    def __init__(self,dim,max_len_inp,dropout=0.5):
        super(PE,self).__init__()
        self.dropout=nn.Dropout(dropout)
        position=torch.arange(max_len_inp,dtype=torch.float32).unsqueeze(1)
        even_dims=torch.arange(0,dim,2,dtype=torch.float32)
        # 1 / 10000**(2i/dim), computed in log space for numerical stability.
        inv_freq=torch.exp(even_dims*(-math.log(10000.0)/dim))
        angles=position*inv_freq
        pos_enc=torch.zeros(size=(max_len_inp,dim))
        pos_enc[:,0::2]=torch.sin(angles)
        # For odd `dim` there is one fewer odd column than even columns.
        pos_enc[:,1::2]=torch.cos(angles[:,:dim//2])
        # A buffer follows the module across devices and is saved with it,
        # but is not a trainable parameter.
        self.register_buffer('pos_enc',pos_enc.unsqueeze(0))
    def forward(self,x):
        """Add the encoding for the first ``x.size(1)`` positions to ``x``, then apply dropout.

        Assumes ``x`` is (batch, seq_len, dim) with seq_len <= max_len_inp.
        """
        return self.dropout(x+self.pos_enc[:,:x.size(1)])

In [None]:
class MHA(nn.Module):
    """Multi-head scaled dot-product attention with an optional causal mask.

    Changes from the earlier version:
      * the causal mask is sliced to the actual sequence lengths instead of a
        hard-coded 212, so inputs of any length work;
      * scores are scaled by sqrt(head_dim) rather than sqrt(dim) — models
        trained with the old scaling need retraining;
      * the debug ``print`` calls on the score tensor are gone;
      * masking no longer modifies the score tensor in place.

    Args:
        dim: Model (embedding) size; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        max_len: Size of the pre-computed causal mask (default 212). Longer
            sequences still work; their mask is built on the fly.
    """
    def __init__(self,dim,n_heads=4,max_len=212):
        super(MHA,self).__init__()
        assert dim%n_heads==0
        self.n_heads=n_heads
        self.dim=dim
        self.headdim=dim//n_heads
        self.Q=nn.Linear(self.dim,self.dim)
        self.K=nn.Linear(self.dim,self.dim)
        self.V=nn.Linear(self.dim,self.dim)
        self.combined_head_out=nn.Linear(self.dim,self.dim)
        # Lower-triangular ones: entry (i, j) is 1 when query i may see key j.
        self.register_buffer("attn_bias",torch.tril(torch.ones(max_len,max_len)).view(1,1,max_len,max_len))
    def _split_heads(self,x):
        """Reshape (batch, seq, dim) into (batch, heads, seq, head_dim)."""
        return x.view(x.shape[0],x.shape[1],self.n_heads,self.headdim).transpose(1,2)
    def forward(self,key,query,value,mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Note the argument order: key first, then query, then value.

        Args:
            key: (batch, key_len, dim) tensor.
            query: (batch, query_len, dim) tensor.
            value: (batch, key_len, dim) tensor.
            mask: Any value other than None enables the causal mask
                (query i only sees keys 0..i).

        Returns:
            (batch, query_len, dim) tensor.
        """
        k=self._split_heads(self.K(key))
        q=self._split_heads(self.Q(query))
        v=self._split_heads(self.V(value))
        # (batch, heads, query_len, key_len) attention scores.
        prod=(q@k.transpose(-2,-1))*(1.0/math.sqrt(self.headdim))
        if mask is not None:
            q_len,k_len=prod.shape[-2],prod.shape[-1]
            if q_len<=self.attn_bias.shape[-2] and k_len<=self.attn_bias.shape[-1]:
                visible=self.attn_bias[:,:,:q_len,:k_len]
            else:
                visible=torch.tril(torch.ones(q_len,k_len,device=prod.device)).view(1,1,q_len,k_len)
            # -inf scores receive zero weight after the softmax.
            prod=prod.masked_fill(visible==0,float('-inf'))
        prod_softmax=nn.functional.softmax(prod,dim=-1)
        prod_final=prod_softmax@v
        # Merge the heads back into one (batch, query_len, dim) tensor.
        final_tensor=prod_final.transpose(1,2).contiguous().view(query.shape[0],query.shape[1],self.dim)
        return self.combined_head_out(final_tensor)

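Shape trace for multi-head attention (batch 64, sequence length 212, dim 100, 4 heads of size 25):
(212,100) ==> (64,212,4,25) ==> (64,4,212,25) @ (64,4,25,212) ==> (64,4,212,212) @ (64,4,212,25) ==> (64,4,212,25) ==> (64,212,4,25) ==> (64,212,100)

Causal mask, row by row (1 = visible, 0 = masked):
[1,0,0,0,0,0,0]  ==> attention weights [remain,0,0,0]        ==> weighted values [remain*k1,0,0,0,0]
[1,1,0,0,0,0,0]  ==> attention weights [remain,remain,0,0,0] ==> weighted values [remain*k1,remain*k2,0,0,0,0]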


In [None]:
class Block(nn.Module):
    """One transformer block, used as encoder ('enc') or decoder ('dec') layer.

    Each sub-layer is applied as ``x + LayerNorm(sublayer(x))``. A decoder
    block runs causal self-attention, then cross-attention over the encoder
    output, then the feed-forward network; an encoder block runs unmasked
    self-attention and the feed-forward network.

    Args:
        dim: Model (embedding) size.
        block_state: 'dec' for a decoder block; anything else is treated as
            an encoder block.
    """
    def __init__(self,dim,block_state):
        super().__init__()
        self.l1=nn.LayerNorm(dim)
        self.block_state=block_state
        # NOTE(review): the decoder uses this one attention module for both
        # self- and cross-attention, so the two share projection weights.
        # Splitting them would change the saved-model layout.
        self.attn=MHA(dim,4)
        self.l2=nn.LayerNorm(dim)
        self.l3=nn.LayerNorm(dim)
        self.linear1=nn.Linear(dim,dim*2)
        self.linear2=nn.Linear(dim*2,dim)
        self.dropout=nn.Dropout()
        self.gleu=nn.GELU()
    def forward(self,x,enc_op=None):
        """Apply the block to ``x``.

        Args:
            x: Input activations.
            enc_op: Encoder output; required when ``block_state == 'dec'``.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If a decoder block is called without ``enc_op``.
        """
        if (self.block_state=='dec'):
            if enc_op is None:
                raise ValueError("a 'dec' block needs the encoder output (enc_op)")
            # Causal self-attention over the decoder input.
            x=x+self.l1(self.attn(x,x,x,True))
            # Cross-attention. MHA.forward takes (key, query, value): queries
            # come from the decoder, keys and values from the encoder. The
            # arguments used to be (x, enc_op, enc_op), i.e. decoder states as
            # keys and encoder states as queries.
            x=x+self.l2(self.attn(enc_op,x,enc_op,None))
        else:
            x=x+self.l1(self.attn(x,x,x,None))
        # Position-wise feed-forward network: dim -> 2*dim -> dim.
        x=x+self.l3(self.linear2(self.dropout(self.gleu(self.linear1(x)))))
        return x

In [None]:
class Encoder(nn.Module):
    """Token embedding + positional encoding + two passes through one encoder block.

    Args:
        vocab_size: Number of rows in the embedding table.
        dim: Embedding / model size.
        max_seq_length: Longest sequence the positional encoding covers.
    """
    def __init__(self,vocab_size,dim,max_seq_length):
        super().__init__()
        self.emb_layer=nn.Embedding(num_embeddings=vocab_size,embedding_dim=dim,device=device)
        self.pe=PE(dim,max_seq_length)
        self.block=Block(dim,'enc')
    def forward(self,inp):
        """Encode a batch of token indices and return the final activations."""
        hidden=self.pe(self.emb_layer(inp))
        # The same block instance is applied twice, so both passes share weights.
        for _ in range(2):
            hidden=self.block(hidden)
        return hidden

In [None]:
class Decoder(nn.Module):
    """Token embedding + positional encoding + two passes through one decoder block.

    Args:
        vocab_size: Number of rows in the embedding table.
        dim: Embedding / model size.
        max_seq_length: Longest sequence the positional encoding covers.
    """
    def __init__(self,vocab_size,dim,max_seq_length):
        super(Decoder,self).__init__()
        self.emb_layer=nn.Embedding(vocab_size,dim,device=device)
        self.pe=PE(dim,max_seq_length)
        self.block=Block(dim,'dec')
    def forward(self,inp,enc_op=None):
        """Decode a batch of target token indices against the encoder output.

        Args:
            inp: Target-side token indices.
            enc_op: Encoder output that the block attends over.

        Returns:
            Decoder activations, one vector per target position.
        """
        emb=self.emb_layer(inp)
        pos=self.pe(emb)
        # The unused 212x100 triangular tensor that was rebuilt on every call
        # is gone; causal masking happens inside the attention module.
        # The same block instance is applied twice, so both passes share weights.
        for _ in range(2):
            pos=self.block(pos,enc_op)
        return pos

In [None]:
# Scratch 212x100 lower-triangular matrix of ones.
ltr_mat = torch.ones(212, 100).tril()

In [None]:
class ScratchTransformer(nn.Module):
    """Encoder-decoder transformer with a linear projection to target-vocabulary logits.

    Args:
        envocabsize: Source (encoder) vocabulary size.
        devocabsize: Target (decoder) vocabulary size.
        dim: Model (embedding) size.
        max_seq_length: Longest sequence the positional encodings cover.
        ignore_index: Target index excluded from the loss (default -1, i.e.
            nothing is excluded). Pass the padding index to skip padded positions.
    """
    def __init__(self,envocabsize,devocabsize,dim,max_seq_length,ignore_index=-1):
        super(ScratchTransformer,self).__init__()
        self.envocabsize=envocabsize
        self.devocabsize=devocabsize
        self.dim=dim
        self.max_seq_length=max_seq_length
        self.ignore_index=ignore_index
        self.Encoder=Encoder(self.envocabsize,self.dim,self.max_seq_length)
        self.Decoder=Decoder(self.devocabsize,self.dim,self.max_seq_length)
        self.Linear=nn.Linear(dim,self.devocabsize,bias=False)
    def forward(self,inp,op=None):
        """Run a teacher-forced pass and compute the next-token loss.

        Args:
            inp: Source token indices, (batch, src_len).
            op: Target token indices, (batch, tgt_len). Required; for decoding
                without targets use ``encoder_out`` and ``decoder_out``.

        Returns:
            ``(logits, loss)``: logits for every target position and the
            cross-entropy between the logits at position i and token i+1.

        Raises:
            ValueError: If ``op`` is None (this used to fail later with an
                unrelated error).
        """
        if op is None:
            raise ValueError("forward() needs target indices `op`; use encoder_out()/decoder_out() for inference")
        enc_out=self.Encoder(inp)
        dec_out=self.Decoder(op,enc_out)
        logits=self.Linear(dec_out)
        # Next-token objective: position i predicts token i+1. Comparing
        # position i with token i (the earlier behaviour) only teaches the
        # decoder to copy its own input.
        next_token_logits=logits[:,:-1,:]
        targets=op[:,1:]
        loss=nn.functional.cross_entropy(
            next_token_logits.reshape(-1,logits.size(2)),
            targets.reshape(-1),
            # getattr keeps models pickled before `ignore_index` existed working.
            ignore_index=getattr(self,'ignore_index',-1))
        return logits,loss
    def encoder_out(self,inp):
        """Return the encoder activations for source indices ``inp``."""
        enc_out=self.Encoder(inp)
        return enc_out
    def decoder_out(self,enc_out,dec_mock_inp):
        """Return target-vocabulary logits for decoder input ``dec_mock_inp`` given ``enc_out``."""
        dec_out=self.Decoder(dec_mock_inp,enc_out)
        logit=self.Linear(dec_out)
        return logit

In [None]:
# Model: 100-dimensional embeddings, sequences of up to 212 tokens.
transformer = ScratchTransformer(
    envocabsize=len(vocab_english),
    devocabsize=len(vocab_french),
    dim=100,
    max_seq_length=212,
)

In [None]:
# Move all parameters and buffers to the selected device (the cell displays the module).
transformer.to(device=device)

In [None]:
# Adam optimiser over every model parameter, learning rate 5e-5.
opt = torch.optim.Adam(params=transformer.parameters(), lr=5e-5)

In [None]:
# Pull a single (inputs, targets) batch from the training loader for inspection.
(d, l) = next(iter(train_dataloader))

In [None]:
# Drop a stale `val_loss` left over from an earlier run, if there is one.
# A bare `del` raised NameError on a fresh kernel, where the name does not exist yet.
if 'val_loss' in globals():
    del val_loss

In [None]:
# Train for 5 epochs; after each epoch print the summed training loss and the
# summed validation loss, and record the training total in `epoch_loss`.
epoch_loss=[]
for epoch in range(5):
    train_loss,val_loss=0,0

    # ---- training pass ----
    transformer.train()
    for batch in tqdm(train_dataloader,desc="training"):
        x,y=batch
        opt.zero_grad()
        logits,loss=transformer(x.to(device),y.to(device))
        loss.backward()
        opt.step()
        train_loss=train_loss+loss.item()
    print(train_loss)

    # ---- validation pass (no gradients) ----
    transformer.eval()
    with torch.no_grad():
        for batch in tqdm(val_dataloader,desc='Validation'):
            x,y=batch
            logits,loss=transformer(x.to(device),y.to(device))
            val_loss=val_loss+loss.item()
    print(val_loss)

    epoch_loss.append(train_loss)

In [None]:
import nltk

In [None]:
#Ble=nltk.translate.bleu_score.sentence_bleu([],[])

In [None]:
# Display the final projection layer (model dim -> target vocabulary size).
getattr(transformer, 'Linear')

In [None]:
# Persist the whole model object (pickled module, not just a state_dict).
torch.save(obj=transformer, f='/kaggle/working/transformer')

In [None]:
# Reload a previously saved model. `map_location` remaps the stored tensors to
# the device chosen for this session, so a checkpoint saved on a GPU also loads
# on a CPU-only machine.
# SECURITY: this unpickles a whole Python object, which can run arbitrary code.
# Only load files you produced yourself.
transformer=torch.load('/kaggle/input/modelbinnfortransformers/transformer',map_location=device)

In [None]:
# Deleting the model frees its memory, but the evaluation cell below calls
# `transformer.eval()`. Unconditional deletion made Restart & Run All fail
# there, so it is now opt-in.
FREE_MODEL_MEMORY = False
if FREE_MODEL_MEMORY:
    del transformer

In [None]:
# Release the last batch's loss and logits so their memory can be reclaimed.
# Guarded: a bare `del` raised NameError when the training cell had not run
# or when this cell was executed twice.
if 'loss' in globals():
    del loss
if 'logits' in globals():
    del logits

In [None]:
# Hand cached, currently unused GPU memory back to the driver.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import gc

# Two rounds of "collect Python garbage, then release cached GPU memory".
for _ in range(2):
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Drop `logitss` from an earlier run, if present. On a fresh kernel the name is
# only created by the evaluation cell below, so a bare `del` raised NameError.
if 'logitss' in globals():
    del logitss

In [None]:
# Greedy decoding over the test set.
#
# For each test sentence the decoder input starts as <SOS> + 210 x <PAD> + <EOS>;
# at step i the most likely token at position i is written to position i+1,
# until <EOS> is produced or the buffer is full. The leftover debug `break`
# statements, which stopped after one decoder call on one batch and made the
# `labels`/`data` bookkeeping unreachable, are gone.
transformer.eval()
test_loss=0
# Stays empty: raw logits per sentence are not kept; the decoded token ids
# are stored in `test_sentences_logits` instead.
logitss=[]
labels=[]
data=[]
test_sentences_logits=[]
sos_idx=vocab_french['<SOS>']
pad_idx=vocab_french['<PAD>']
eos_idx=vocab_french['<EOS>']
with torch.no_grad():
    for batch in test_dataloader:
        x,y=batch
        # The flat argmax below assumes one sentence per batch
        # (test_dataloader is built with batch_size=1).
        dec_inp=torch.tensor([sos_idx]+210*[pad_idx]+[eos_idx]).unsqueeze(0).to(device)
        enc_out=transformer.encoder_out(x.to(device))
        for i in range(enc_out.shape[1]-1):
            max_logit=transformer.decoder_out(enc_out,dec_inp)
            z2=torch.argmax(max_logit[:,i,:])
            dec_inp[:,i+1]=z2
            if z2==eos_idx:break
        test_sentences_logits.append(dec_inp.cpu())
        labels.append(y)
        data.append(x)

In [None]:
# Decoder start buffer: <SOS> followed by 211 <PAD> tokens (212 entries in total).
dec_inp = torch.tensor(
    [vocab_french['<SOS>']] + [vocab_french['<PAD>'] for _ in range(211)]
)

In [None]:
# Shape of the buffer after adding a leading batch axis.
dec_inp[None, ...].shape

In [None]:
# Display the accumulated test loss (the evaluation cell initialises it to 0).
test_loss

In [None]:
# Number of entries collected in `logitss`.
len(logitss)

In [None]:
## [Start,<PAD>,<PAD>,<PAD>]

In [None]:
# `torch.argmax()` without an input always raises TypeError. Minimal working
# example: the index of the largest entry, here tensor(1).
torch.argmax(torch.tensor([0.1, 0.7, 0.2]))

In [None]:
# Tokens stored at the first three French vocabulary indices.
vocab_french.lookup_tokens(list(range(3)))

In [None]:
# Number of entries collected in `logitss`.
len(logitss)

In [None]:
# Turn the model output for every test sentence into a space-joined token string.
#
# When `logitss` holds per-position logits, the predicted token is the argmax
# over the vocabulary axis. The evaluation cell leaves `logitss` empty and keeps
# decoded token ids in `test_sentences_logits`, so those are used as a fallback.
# (The inner loop also used to reuse the outer loop's variable `i`.)
sentences=[]
if logitss:
    predicted_batches=[torch.argmax(batch_logits,dim=2) for batch_logits in logitss]
else:
    predicted_batches=test_sentences_logits
for arg in predicted_batches:
    # .tolist() yields plain ints, the type lookup_tokens is declared to take.
    for token_ids in arg.tolist():
        sentences.append(" ".join(vocab_french.lookup_tokens(token_ids)))

In [None]:
# Number of decoded prediction sentences.
len(sentences)

In [None]:
# Reference French sentences: map each row of label indices back to tokens
# and join them with single spaces.
real_sentences = [
    " ".join(vocab_french.lookup_tokens(token_ids))
    for label_batch in labels
    for token_ids in label_batch.tolist()
]

In [None]:
# English source sentences: map each row of input indices back to tokens.
# The result goes into `x_inp`. It used to be appended to `real_sentences`,
# which left `x_inp` empty and mixed English inputs into the French references.
x_inp=[]
for input_batch in data:
    for token_ids in input_batch.tolist():
        x_inp.append(" ".join(vocab_english.lookup_tokens(token_ids)))

In [None]:
# Sentence-level BLEU of every prediction against its own reference.
#
# Fixes: `Bleu_score_list[]` was a syntax error, and sentence_bleu was called
# with the two complete sentence lists. It expects a list of tokenised
# references followed by ONE tokenised hypothesis.
Bleu_score_list=[]
# Markers that carry no translation content are left out of the comparison.
special_tokens={'<SOS>','<EOS>','<PAD>'}
for hypothesis,reference in zip(sentences,real_sentences):
    hyp_tokens=[tok for tok in hypothesis.split(' ') if tok not in special_tokens]
    ref_tokens=[tok for tok in reference.split(' ') if tok not in special_tokens]
    Ble=nltk.translate.bleu_score.sentence_bleu([ref_tokens],hyp_tokens)
    Bleu_score_list.append(Ble)

In [None]:
# One single-column frame each for the inputs, the predictions and the BLEU
# scores, placed side by side and written to the working directory.
z1, z2, z3 = (pd.DataFrame(column) for column in (x_inp, sentences, Bleu_score_list))
combined = pd.concat([z1, z2, z3], axis=1)
# to_csv returns None when given a path, so zz1 ends up as None.
zz1 = combined.to_csv('/kaggle/working/bleu_score')