#### Steps and Challenges & Possible solution: 
1. Text Preprocessing and Tokenization: Using the pretrained BERT Model ( multilingual cased)
2. Model Training and Fine-tuning
3. Fine-tune the inferencing time

Link: https://arxiv.org/pdf/2309.13222 

Transformer Paper Link: https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb

In [1]:
## import important libraries
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
from transformers import AutoTokenizer  ## Hugging Face 
import tqdm as notebook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model = torch.load('weights/tmodel_00.pt')

In [1]:
def causal_mask(size):
    mask = torch.triu(torch.ones(1,size,size), diagonal=1).type(torch.int)
    return mask == 0 

In [3]:
import torch
causal_mask(20)

tensor([[[ True, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True,  True,  True, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True,  True,  True,  True, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True,  True,  True,  True,  True, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False],
         [ True,  True,  True,  Tr

In [23]:

encoder_input = torch.tensor([
    [[i for i in range(0,8)],[i for i in range(0,8)]],
    [[i for i in range(0,8)],[i for i in range(0,8)]],
     [[i for i in range(0,8)],[i for i in range(0,8)]]
])

In [51]:
encoder=torch.randint(0, 20, (5, 10))


In [63]:
file=open("data/train.en")

In [67]:
file.buffer?

[0;31mType:[0m        BufferedReader
[0;31mString form:[0m <_io.BufferedReader name='data/train.en'>
[0;31mDocstring:[0m   Create a new buffered reader using the given readable raw IO object.

In [56]:
pad.shape

torch.Size([5, 5])

In [3]:
# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
    print("MPS is available")

MPS is available


In [68]:
from transformers import AutoTokenizer  ## Hugging Face 
tokenizer=AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

In [69]:
tokenizer.decode(tokenizer.encode("चोरी नहीं हत्या करना था मकसद, नफरत से भरे थे हत्‍यारे; हिरासत में चार संदिग्ध"))

'[CLS] चोरी नहीं हत्या करना था मकसद, नफरत से भरे थे हत्यारे ; हिरासत में चार संदिग्ध [SEP]'

#### Data Processing

In [14]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention,self).__init__()
        assert d_model % num_heads ==0 , "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(self.d_k)
        if mask is not None: 
            attn_scores = attn_scores.masked_fill(mask == 0 , -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
    def split_heads(self,x):
        batch_size, _, seq_length, d_k = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1,2)
    def combine_heads(self,x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1,2).contiguous().view(batch_size, seq_length, self.d_model)
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [15]:
x=MultiHeadAttention(64,4)

In [19]:
seq_len=512
d_model=64
pe = torch.zeros(seq_len, d_model)

        # creata a vector of shape (seq_len)
position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/d_model))
        # Apply the sin to even position 
pe[:, 0::2] = torch.sin(position*div_term)
pe[:,1::2] = torch.cos(position*div_term)


In [47]:
torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1).shape

torch.Size([512, 1])

In [43]:
torch.arange(0, seq_len, dtype=torch.float).shape

torch.Size([512])

In [39]:
(pe.unsqueeze(0)).shape

torch.Size([1, 512, 64])

In [5]:
f=open("data/train.en",'r')
count=0
for i in f.readlines():
    print(i)
    if count>10:
        break
    count=count+1
f.close()

However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles

Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.

The value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.

Mithali To Anchor Indian Team Against Australia in ODIs

After the assent of the Honble President on 8thSeptember, 2016, the 101thConstitutional Amendment Act, 2016 came into existence

The court has fixed a hearing for February 12

Please select the position where the track should be split.

As per police, armys 22RR, special operation Group (SOG) of police and the Central Reserve Police Force (CRPF) cordoned the village and launched search operation in the area.

Jharkhand chief minister Hemant Soren

Arvind Kumar, SHO of the sector 55/56 police station, said a case has been re

In [6]:
f=open("data/train.hi",'r')
count=0
for i in f.readlines():
    print(i)
    if count>10:
        break
    count=count+1
f.close()

आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।

और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है

जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प्रजातियों की समृद्धि के मामले में उनकी संख्या अन्य जीव समूहों से ज़्यादा है।

आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को

8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍वीकृति मिलने के बाद 101वां संविधान संशोधन अधिनियम, 2016 अस्तित्‍व में आया

अदालत ने इस मामले में आगे की सुनवाई के लिए एक फरवरी की तारीख़ तय की

जहाँ पर ट्रैक को विभाजित किया जाना है, कृपया वह स्थान चुनें.

इसके तुरंत बाद सेना की 22 राष्ट्रीय राइफल्स (आरआर), सीआरपीएफ और पुलिस के स्पेशल ऑपरेशन ग्रुप (एसओजी) के जवानों द्वारा इलाके की घेराबंदी कर तलाशी अभियान चलाया।

झारखंड के मुख्यमंत्री हेमंत सोरेन (फोटोः पीटीआई)

सेक्टर 55/56 के एसएचओ अरविंद कुमार ने बताया कि इस मामले में आ

In [50]:
raw_text="As per police, armys 22RR, special operation Group (SOG) of police and the Central Reserve Police Force (CRPF) cordoned the village and launched search operation in the area."

In [140]:
class data_preprocessing():
    def __init__(self):
        pass
    def preprocess(self,text):
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?।' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text.lower())]
        return ''.join(out)
    def tokenize(self,text):
        sequence=['<SOS>']
        sequence.extend(text.split())
        sequence.append('<EOS>')
        return sequence
    def seq_to_idx(self,sequence,vocab):
        indexed_sequence=[]
        for i in sequence:
            if i in vocab:
                indexed_sequence.append(vocab[i])
            else:
                indexed_sequence.append(vocab['<UNK>'])
        return indexed_sequence

In [141]:
a="8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍वीकृति मिलने के बाद 101वां संविधान संशोधन अधिनियम, 2016 अस्तित्‍व में आया"
dat=data_preprocessing()
#cln=dat.preprocess(a)
seq=dat.tokenize(a)
idx_seq=dat.seq_to_idx(seq,hin_vocab)
idx_seq

[2, 75, 1, 77, 74, 78, 79, 50, 80, 81, 5, 82, 83, 84, 85, 1, 77, 87, 14, 88, 3]

In [75]:
# Create Vocab
eng_vocab={'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
eng_vocab_transform={}
hin_vocab={'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
hin_vocab_transform={}   

In [116]:
### Hindi ( First 1000 sentence)
f=open("data/train.hi",'r')
hin_vocab={'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
count=0
hindi=[]
count=0
data_prep=data_preprocessing()
for text in f.readlines():
    text=data_prep.preprocess(text)
    tokens=data_prep.tokenize(text)
    for token in tokens:
        if token in hin_vocab:
            continue
        else:
            hin_vocab[token]=len(hin_vocab)  
    if count>1000:
        break
    count=count+1
f.close()

In [122]:
### Hindi ( First 1000 sentence)
f=open("data/train.en",'r')
count=0
data_prep=data_preprocessing()
for text in f.readlines():
    text=data_prep.preprocess(text)
    tokens=data_prep.tokenize(text)
    for token in tokens:
        if token in eng_vocab:
            continue
        else:
            eng_vocab[token]=len(eng_vocab)  
    if count>1000:
        break
    count=count+1
f.close()

In [164]:
x=PositionalEncoding(64)

In [170]:
position = torch.arange(100).unsqueeze(1)

In [171]:
position.shape

torch.Size([100, 1])

In [106]:
from transformers import pipeline

In [110]:
unmasker = pipeline('fill-mask', model='bert-base-cased')
unmasker("Hello I'm a [MASK] model.")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.09019210189580917,
  'token': 4633,
  'token_str': 'fashion',
  'sequence': "Hello I'm a fashion model."},
 {'score': 0.06350002437829971,
  'token': 1207,
  'token_str': 'new',
  'sequence': "Hello I'm a new model."},
 {'score': 0.06228209286928177,
  'token': 2581,
  'token_str': 'male',
  'sequence': "Hello I'm a male model."},
 {'score': 0.04417283087968826,
  'token': 1848,
  'token_str': 'professional',
  'sequence': "Hello I'm a professional model."},
 {'score': 0.03326152637600899,
  'token': 7688,
  'token_str': 'super',
  'sequence': "Hello I'm a super model."}]

In [None]:
unmasker("Hello I'm a [MASK] model.")

In [175]:
pe = torch.zeros(64,1,100)

In [184]:
emb_size = 64
max_len = 100
div_term = torch.exp(torch.arange(0,emb_size,2) * (-math.log(10000.0)/emb_size))
div_term.shape

torch.Size([32])

In [185]:
position = torch.arange(max_len).reshape(max_len,1)

In [191]:
div_term

tensor([1.0000e+00, 7.4989e-01, 5.6234e-01, 4.2170e-01, 3.1623e-01, 2.3714e-01,
        1.7783e-01, 1.3335e-01, 1.0000e-01, 7.4989e-02, 5.6234e-02, 4.2170e-02,
        3.1623e-02, 2.3714e-02, 1.7783e-02, 1.3335e-02, 1.0000e-02, 7.4989e-03,
        5.6234e-03, 4.2170e-03, 3.1623e-03, 2.3714e-03, 1.7783e-03, 1.3335e-03,
        1.0000e-03, 7.4989e-04, 5.6234e-04, 4.2170e-04, 3.1623e-04, 2.3714e-04,
        1.7783e-04, 1.3335e-04])

In [190]:
y=position*div_term
y.shape

torch.Size([100, 32])

In [119]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float = 0.1, max_len : int = 100):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).reshape(max_len,1)
        div_term = torch.exp(torch.arange(0,emb_size,2) * (-math.log(10000.0)/emb_size))
        pe = torch.zeros(max_len, 1, emb_size)
        pe[:, 0, 0::2] = torch.sin(position* div_term)
        pe[:, 0, 1::2] = torch.cos(position* div_term)
        self.register_buffer('pe',pe)
    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
           x: Tensor, shape ``[seq_len, batch_size ,embedding_dim]``
        """
        x = x+self.pe[:x.size(0)]
        return self.dropout(x)

In [120]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,emb_size)
        self.emb_size = emb_size
    def forward(self,tokens: Tensor):
        return self.embedding(tokens.long())*math.sqrt(self.emb_size)

In [192]:
vocab_size=100
emb_size=64
e=nn.Embedding(vocab_size,emb_size)

In [199]:
e.weight

Parameter containing:
tensor([[ 0.2787, -0.5009, -0.6734,  ..., -0.3532,  0.2240,  0.9667],
        [ 2.4530,  1.0345, -1.7734,  ...,  1.3107, -0.1280, -1.8023],
        [ 1.3118, -2.0575,  0.8235,  ..., -1.1269,  0.2530, -0.7042],
        ...,
        [ 0.1233, -0.3748, -1.9727,  ..., -1.1965, -1.1139, -1.9265],
        [ 0.4807, -0.3565, -0.9671,  ..., -0.0933, -0.9233, -0.7316],
        [-0.7822, -1.1744, -0.4744,  ...,  2.6805,  0.7909, -0.1247]],
       requires_grad=True)

In [105]:
tokenizer.vocab_size*128*4/(1024*1024)

58.37255859375

In [112]:
# Sequence to Sequence Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self, 
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int, 
                 nhead: int, 
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        self.transformer = Transformer(d_model = emb_size,
                                       nhead = nhead,
                                       num_encoder_layers = num_encoder_layers,
                                       num_decoder_layers = num_decoder_layers,
                                       dim_feedforward = dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size,tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size,dropout=dropout)

    def forward(self, 
                src: Tensor,
                tgt: Tensor, 
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask,
                                memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        x = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(x, src_mask)
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        x = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(x, memory, tgt_mask)
               

In [121]:
num_encoder_layers=10
num_decoder_layers=20
emb_size=64
nhead=8
src_vocab_size=10000
tgt_vocab_size=10000
dim_feedforward=128
dropout=0.2
mod=Seq2SeqTransformer(num_encoder_layers,
                 num_decoder_layers,
                 emb_size, 
                 nhead, 
                 src_vocab_size,
                 tgt_vocab_size,
                 dim_feedforward,
                 dropout)

In [123]:
mod

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-9): 10 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=128, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=128, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
      )
      (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-19): 20 x TransformerDecoderLayer(
          (self_attn): MultiheadAttention(

In [30]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz,sz), device = DEVICE) == 1)).transpose(0,1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)
    src_padding_mask = (src == PAD_IDX).transpose(0,1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0,1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [31]:
torch.manual_seed(0)
src_vocab_size = 100
tgt_vocab_size = 200
emb_size = 64
nhead = 4
ffn_hid_dim = 128
batch_size = 32
num_encoder_layers = 2
num_decoder_layers = 3
transformer = Seq2SeqTransformer(num_encoder_layers, num_decoder_layers, emb_size, nhead, src_vocab_size, tgt_vocab_size, ffn_hid_dim)

In [36]:
for p in transformer.parameters():
    if p.dim()>1:
        nn.init.xavier_uniform_(p)
transformer = transformer.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9,0.98), eps=1e-9)

### Collation

In [40]:
from torch.nn.utils.rnn import pad_sequence

In [41]:
# Helper function to club together sequential operations
def sequential_transform(*transforms):
    def func(txt_input):
        for transform in transforms: 
            txt_input = transform(txt_input)
        return txt_input
    return func

In [42]:
# Function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    x1 = torch.tensor([BOS_IDX])
    x2 = torch.tensor(token_ids)
    x3 = torch.tensor([EOS_IDX])
    x = torch.cat((x1,x2,x3))
    return x

In [None]:
# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices 
text_transform = {}
for ln in [src_lang, tgt_lang]:
    text_transform[ln] = sequential_transforms(token_transform[ln], # Tokenization
                                               vocab_transform[ln], # Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor 

# Function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    

In [157]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [8]:
from transformers import pipeline

In [16]:
tokenizer.convert_tokens_to_ids("fake")

8406

In [163]:
tokenizer=AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

In [155]:
txt="टी20 वर्ल्ड कप जीतकर लौटने वाली टीम इंडिया का मुंबई में काफी शानदार स्वागत किया गया। इस दौरान मुंबई की सड़कों पर जमकर भीड़ देखने को मिली। फैंस टीम इंडिया के सितारों की एक झलक पाने के लिए उतावले थे। इस दौरान फैंस ने कुछ ऐसा कर दिया जिससे उन्हें खुद काफी नुकसान हो सकता था। फैंस ने सितारों की झलक पाने के लिए अपनी जान जोखिम में डाली"

In [158]:
x=tokenizer.encode(txt)

In [159]:
x

[101,
 100,
 100,
 607,
 28519,
 610,
 28533,
 28515,
 28510,
 28524,
 100,
 623,
 28531,
 28525,
 28533,
 100,
 100,
 607,
 28531,
 100,
 619,
 28535,
 28508,
 100,
 624,
 28531,
 28518,
 28516,
 28531,
 28524,
 626,
 28537,
 28526,
 28531,
 28511,
 28515,
 607,
 28532,
 28523,
 28531,
 608,
 28523,
 28531,
 635,
 100,
 100,
 100,
 607,
 28533,
 100,
 616,
 28524,
 610,
 28522,
 28510,
 28524,
 100,
 100,
 607,
 28536,
 619,
 28532,
 28525,
 28533,
 635,
 100,
 100,
 100,
 607,
 28535,
 626,
 28532,
 28515,
 28531,
 28524,
 28536,
 28508,
 607,
 28533,
 100,
 100,
 616,
 28531,
 28518,
 28535,
 607,
 28535,
 100,
 100,
 100,
 635,
 100,
 100,
 100,
 615,
 28535,
 100,
 100,
 607,
 28524,
 613,
 28532,
 28523,
 28531,
 610,
 28532,
 28529,
 28529,
 28535,
 100,
 100,
 100,
 615,
 28534,
 28510,
 28529,
 28531,
 28518,
 627,
 28536,
 626,
 28510,
 28515,
 28531,
 100,
 635,
 100,
 615,
 28535,
 626,
 28532,
 28515,
 28531,
 28524,
 28536,
 28508,
 607,
 28533,
 100,
 616,
 28531,
 28518

In [164]:
tokenizer.decode(tokenizer.encode(txt))

'[CLS] टी20 वर्ल्ड कप जीतकर लौटने वाली टीम इंडिया का मुंबई में काफी शानदार स्वागत किया गया । इस दौरान मुंबई की सड़कों पर जमकर भीड़ देखने को मिली । फैंस टीम इंडिया के सितारों की एक झलक पाने के लिए उतावले थे । इस दौरान फैंस ने कुछ ऐसा कर दिया जिससे उन्हें खुद काफी नुकसान हो सकता था । फैंस ने सितारों की झलक पाने के लिए अपनी जान जोखिम में डाली [SEP]'

In [71]:
hin.encode(

['टी',
 '##20',
 'व',
 '##र',
 '##्ल',
 '##्ड',
 'क',
 '##प',
 'जी',
 '##त',
 '##कर',
 'ल',
 '##ौ',
 '##टन',
 '##े',
 'वाली',
 'टीम',
 'इंडिया',
 'का',
 'मुंबई',
 'में',
 'काफी',
 'श',
 '##ान',
 '##दार',
 'स',
 '##्व',
 '##ाग',
 '##त',
 'किया',
 'गया',
 '।',
 'इस',
 'दौरान',
 'मुंबई',
 'की',
 'स',
 '##ड़',
 '##कों',
 'पर',
 'ज',
 '##म',
 '##कर',
 'भी',
 '##ड़',
 'दे',
 '##खने',
 'को',
 'म',
 '##िली',
 '।',
 'फ',
 '##ैं',
 '##स',
 'टीम',
 'इंडिया',
 'के',
 'स',
 '##िता',
 '##रों',
 'की',
 'एक',
 'झ',
 '##ल',
 '##क',
 'प',
 '##ाने',
 'के',
 'लिए',
 'उ',
 '##ता',
 '##वले',
 'थे',
 '।',
 'इस',
 'दौरान',
 'फ',
 '##ैं',
 '##स',
 'ने',
 'कुछ',
 'ऐसा',
 'कर',
 'दिया',
 'जिससे',
 'उन्हें',
 'ख',
 '##ु',
 '##द',
 'काफी',
 'न',
 '##ुक',
 '##सा',
 '##न',
 'हो',
 'सकता',
 'था',
 '।',
 'फ',
 '##ैं',
 '##स',
 'ने',
 'स',
 '##िता',
 '##रों',
 'की',
 'झ',
 '##ल',
 '##क',
 'प',
 '##ाने',
 'के',
 'लिए',
 'अपनी',
 'जा',
 '##न',
 'जो',
 '##ख',
 '##िम',
 'में',
 'ड',
 '##ाली']

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

# Push the tokenizer to your namespace with the name "my-finetuned-bert".
tokenizer.push_to_hub("my-finetuned-bert")

# Push the tokenizer to an organization with the name "my-finetuned-bert".
tokenizer.push_to_hub("huggingface/my-finetuned-bert")

In [19]:
tokenizer.tokenize("टी20 वर्ल्ड कप जीतकर लौटने वाली टीम इंडिया का मुंबई में काफी शानदार स्वागत किया गया। इस दौरान मुंबई की सड़कों पर जमकर भीड़ देखने को मिली। फैंस टीम इंडिया के सितारों की एक झलक पाने के लिए उतावले थे। इस दौरान फैंस ने कुछ ऐसा कर दिया जिससे उन्हें खुद काफी नुकसान हो सकता था। फैंस ने सितारों की झलक पाने के लिए अपनी जान जोखिम में डाली")

['[UNK]',
 '[UNK]',
 'क',
 '##प',
 'ज',
 '##ी',
 '##त',
 '##क',
 '##र',
 '[UNK]',
 'व',
 '##ा',
 '##ल',
 '##ी',
 '[UNK]',
 '[UNK]',
 'क',
 '##ा',
 '[UNK]',
 'म',
 '##े',
 '##ं',
 '[UNK]',
 'श',
 '##ा',
 '##न',
 '##द',
 '##ा',
 '##र',
 'स',
 '##्',
 '##व',
 '##ा',
 '##ग',
 '##त',
 'क',
 '##ि',
 '##य',
 '##ा',
 'ग',
 '##य',
 '##ा',
 '।',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'क',
 '##ी',
 '[UNK]',
 'प',
 '##र',
 'ज',
 '##म',
 '##क',
 '##र',
 '[UNK]',
 '[UNK]',
 'क',
 '##ो',
 'म',
 '##ि',
 '##ल',
 '##ी',
 '।',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'क',
 '##े',
 'स',
 '##ि',
 '##त',
 '##ा',
 '##र',
 '##ो',
 '##ं',
 'क',
 '##ी',
 '[UNK]',
 '[UNK]',
 'प',
 '##ा',
 '##न',
 '##े',
 'क',
 '##े',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '।',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'न',
 '##े',
 '[UNK]',
 '[UNK]',
 'क',
 '##र',
 'द',
 '##ि',
 '##य',
 '##ा',
 'ज',
 '##ि',
 '##स',
 '##स',
 '##े',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'न',
 '##ु',
 '##क',
 '##स',
 '##ा',
 '##न',
 'ह',
 '##ो',
 'स',
 '##क',
 '##त',
 '##ा',
 '[UNK]',
 '।',


In [21]:
len(tokenizer.vocab)

28996

In [22]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 

In [23]:
tokenizer.vocab_size

28996

In [25]:
tokenizer.vocab_files_names

{'vocab_file': 'vocab.txt', 'tokenizer_file': 'tokenizer.json'}

In [48]:
torch.ones(1)

tensor([1.])

In [49]:
torch.zeros(1)

tensor([0.])

In [50]:
import torch

# Create a random tensor of size (32, 512, 64) with values from a uniform distribution
random_tensor = torch.rand(32, 512, 64)

print(random_tensor)


tensor([[[2.4389e-01, 9.9934e-01, 5.8120e-01,  ..., 3.2883e-02,
          3.8624e-01, 2.2267e-01],
         [4.4363e-01, 1.3324e-01, 6.2764e-01,  ..., 7.6594e-01,
          4.0209e-01, 1.9251e-01],
         [4.3578e-01, 1.7083e-01, 7.6945e-01,  ..., 6.6378e-01,
          6.0378e-01, 5.4441e-01],
         ...,
         [3.7827e-01, 8.2754e-01, 7.2267e-01,  ..., 2.1232e-01,
          7.3114e-01, 8.4807e-01],
         [9.5113e-02, 6.7128e-01, 1.9636e-01,  ..., 7.5465e-01,
          1.5344e-01, 5.9019e-01],
         [3.0075e-02, 7.9916e-01, 4.9934e-01,  ..., 8.5592e-01,
          6.6930e-01, 3.5631e-01]],

        [[8.0079e-01, 7.2917e-01, 2.2525e-02,  ..., 9.4114e-01,
          9.9348e-01, 5.4295e-02],
         [5.2764e-01, 8.0727e-01, 6.2938e-01,  ..., 6.8766e-01,
          8.5788e-01, 7.7981e-01],
         [1.0472e-01, 4.3805e-01, 6.8010e-01,  ..., 6.3962e-01,
          6.6393e-01, 1.1014e-01],
         ...,
         [9.2923e-01, 9.7174e-01, 6.2213e-01,  ..., 2.3156e-01,
          5.551

In [51]:
random_tensor.shape

torch.Size([32, 512, 64])

In [66]:
random_tensor.mean(dim=-1)

tensor([[0.4557, 0.4555, 0.4623,  ..., 0.5218, 0.5288, 0.5052],
        [0.5422, 0.4675, 0.5210,  ..., 0.5028, 0.4792, 0.4816],
        [0.5538, 0.4657, 0.4767,  ..., 0.5809, 0.4994, 0.4191],
        ...,
        [0.4831, 0.5221, 0.4784,  ..., 0.5474, 0.4724, 0.4819],
        [0.5093, 0.4988, 0.4657,  ..., 0.5025, 0.4853, 0.4703],
        [0.4731, 0.4723, 0.4725,  ..., 0.5648, 0.4904, 0.5949]])

In [67]:
list_data=[[1,2,3],[4,5,6]]
x=torch.tensor(list_data).float()

In [75]:
x.mean(dim=0, keepdim=True)

tensor([[2.5000, 3.5000, 4.5000]])

In [74]:
x.mean(dim=-1).shape

torch.Size([2])

In [72]:
x.mean(dim=-1)

tensor([2., 5.])

In [83]:
class A:
    def __init__(self, a, b): 
        self.a = a
        self.b = b
    @staticmethod
    def hello():
        print("How are you")

In [85]:
A.hello()

How are you


In [1]:
import torch 
import torch.nn as nn
import math


# Reference Link: https://www.youtube.com/watch?v=ISNdQcPhsts 
# Github: https://github.com/hkproj/pytorch-transformer 



# Embedding
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size 
        self.embedding = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embedding(x)*math.sqrt(self.d_model)
                              


# Positional Embedding 
class PositionalEncoding(nn.Module):

    def __init__(self,d_model: int, seq_len: int, dropout: float) -> None: 
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len 
        self.dropout = nn.Dropout(dropout)

        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        # creata a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/d_model))
        # Apply the sin to even position 
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

        self.register_buffer('pe',pe) # Saved along with the model  (Along with the file)
    
    def forward(self,x):
        x = x + (self.pe[:, x.shape[1],:]).requries_grad_(False)  # It is not as a Trainable parameter
        return self.dropout(x)
    



 # Layer Normalization 
class LayerNormalization(nn.Module):
    def __init__(self, eps:float=10**-6) -> None:
        """
        X = X_normalized*gamma + bias
        """
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(1))  # Make it learnable parameter 
        self.bias = nn.Parameter(torch.zeros(1)) # Added ( This is bias term) 

    def forward(self,x):
        mean = x.mean(dim = -1, keepdim=True)
        std = x.std(dim = -1, keepdim= True)
        return self.gamma*(x-mean)/(std+self.eps) + self.bias
     
# Feed Forward Block 
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float ) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1 and B1 
        self.dropout = nn.Dropout(dropout)
        self.Linear_2=nn.Linear(d_ff, d_model) # W2 and B2 

    def forward(self,x):
        # (Batch, Seq_Leln, d_model) --> (Batch, Seq_Len, d_ff) ---> (Batch, Seq_Len, d_model)
        x = self.linear_1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        return x 


# MultiHead Attention: 
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self,d_model:int, h: int, dropout: float)->None:
        super().__init__()
        self.d_model = d_model 
        self.h = h 
        assert d_model % h ==0, "d_model is not divisible by h i.e number of heads"

        self.d_k = d_model//h 
        self.w_q = nn.Linear(d_model, d_model) # Wq
        self.w_k = nn.Linear(d_model, d_model) # Wk
        self.w_v = nn.Linear(d_model, d_model) # Wv
        self.w_o = nn.Linear(d_model, d_model) # Wo
        self.dropout = nn.Dropout(dropout)


    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # ( Batch, h, seq_len, d_k) --> (Batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2,-1))
        if mask is not None: 
            attention_scores.masked_fill_(mask ==0, -1e9)
        attention_scores = attention_scores.softmax(dim = -1 )  # (Batch, h, seq_len, seq_len)
        if dropout is not None: 
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value), attention_scores


    def forward(self, q, k, v, mask):
        """ 
        If we don't want any word to interact with other word then we can mask them
        """
        query = self.w_q(q)  #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)
        key = self.w_k(k)  #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)
        value = self.w_v(v) #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)

        # (Batch, Seq_Len, d_model) --View--> (Batch, Seq_Len, h, d_k) ---Transpose-> (Batch, h, Seq_Len, d_k)
        query = query.view(query.shape[0],query.shape[1],self.h, self.d_k).transpose(1,2)
        key = query.view(query.shape[0],query.shape[1],self.h, self.d_k).transpose(1,2)
        value = query.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (Batch, h, seq_len, d_k) --Transpose--> (Batch, Seq_Len, h, d_k) ---> (Batch, seq_len, d_model)

        x = x.transpose(1,2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (Batch, Seq_Len, d_model) --> (Batch, Seq_Len, d_model)

        return self.w_o(x)

# Build Residual Connection 

class ResidualConnection(nn.Module):
    
    def __init__(self, dropout:float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()
    
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))




class EncoderBlock(nn.Module):

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout:float ) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block 
        self.residual_coonnections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])
    

    def forward(self, x, src_mask):
        """
        src_mask: Hide the interaction of padding words with other words"
        """
        x = self.residual_coonnections[0](x, lambda x: self.self_attention_block(x,x,x, src_mask))
        x = self.residual_coonnections[1](x, lambda x: self.feed_forward_block)
        return x



class Encoder(nn.Module):

    def __init__(self, layers: nn.ModuleList) ->None:
        super().__init__()
        self.layers = layers 
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x) 
    



## ----Decoder Block -------------

class DecoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block:FeedForwardBlock, dropout:float) -> None: 
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block 
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])


    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x,x,x,tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x,encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x 
    


## Repeat Decoder block n-times 

class Decoder(nn.Module): 

    def __init__(self, layers: nn.ModuleList) ->  None: 
        super().__init__()
        self.layers = layers 
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)


# Projection Layear 
class ProjectionLayer(nn.Module):
    def __init__(self, d_model:int, vocab_size: int) -> None: 
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)
    
    def forward(self, x):
        # (Batch, seq_len, d_model) -->(Batch, seq_len, vocab_size) 

        return torch.log_softmax(self.proj(x),dim = -1)  # log_softmax for numerical stability 
    



# Define Transformer Block 

class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos:PositionalEncoding, tgt_pos:PositionalEncoding, projection_layer:ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder 
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos 
        self.tgt_pos = tgt_pos 
        self.projection_layer = projection_layer
    
    def encode(self, src, src_mask):
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_ouptut, src_mask, tgt, tgt_mask):
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_ouptut, src_mask, tgt_mask)

    def project(self, x): 
        return self.projection_layer(x)

# Initialize the transformer
def build_transformer(src_vocab_size:int, tgt_vocab_size: int, src_seq_len:int, tgt_seq_len:int,d_model: int =512, N: int =6 , h: int = 8, dropout:float = 0.1, d_ff: int =2048):
    # Create the embdeeing layers 
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the Positional Encoding 
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks =[]

    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks 
    decoder_blocks = [] 
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(decoder_self_attention_block,decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)
    
    # Create Encoer and the Decoder Block 

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Create the projection Layer 

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer 
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the parameters 
    for p in transformer.parameters():
        if p.dim() > 1: 
            nn.init.xavier_uniform_(p) 
    return transformer




In [2]:
src_vocab_size=100000
tgt_vocab_size=280000
src_seq_len=100
tgt_seq_len=150
d_model=64
N=6 
h= 8
dropout = 0.1
d_ff=2048
trans=build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len,d_model, N , h, dropout, d_ff)

In [5]:
trans

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=64, out_features=64, bias=True)
          (w_k): Linear(in_features=64, out_features=64, bias=True)
          (w_v): Linear(in_features=64, out_features=64, bias=True)
          (w_o): Linear(in_features=64, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (Linear_2): Linear(in_features=2048, out_features=64, bias=True)
        )
        (residual_coonnections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization()
  )
  (dec

In [10]:
trans.parameters

<bound method Module.parameters of Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=64, out_features=64, bias=True)
          (w_k): Linear(in_features=64, out_features=64, bias=True)
          (w_v): Linear(in_features=64, out_features=64, bias=True)
          (w_o): Linear(in_features=64, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (Linear_2): Linear(in_features=2048, out_features=64, bias=True)
        )
        (residual_coonnections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (nor

In [13]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size 
        self.embedding = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embedding(x)*math.sqrt(self.d_model)
                              

In [15]:
emb=InputEmbeddings(d_model=64,vocab_size=10000)

In [33]:
x=torch.tensor(1)

In [34]:
emb(x).shape

torch.Size([64])

In [43]:
emb(torch.tensor([[10],[20],[3]])).shape

torch.Size([3, 1, 64])

In [44]:
x=torch.tensor([[10],[20],[3]])

In [46]:
x.shape

torch.Size([3, 1])

In [47]:
x

tensor([[10],
        [20],
        [ 3]])

In [68]:
x=[[1,2,3],[4,5,6],[7,8,9],[10,5,6]]

In [69]:
x=torch.tensor(x).float()

In [70]:
x.shape

torch.Size([4, 3])

In [75]:
x.mean(dim = 0, keepdim=True).shape

torch.Size([1, 3])

In [74]:
x.mean(dim = -1, keepdim=True)

tensor([[2.],
        [5.],
        [8.],
        [7.]])

In [94]:
d_model =3
a=nn.Linear(d_model, d_ff)

In [95]:
a.weight.shape

torch.Size([2048, 3])

In [97]:
x=torch.tensor([[[1,2,3],[4,5,6],[7,8,9]], [[1,2,3],[4,5,6],[7,8,9]]]).float()

In [99]:
a(x).shape

torch.Size([2, 3, 2048])

In [101]:
from transformers import AutoTokenizer, BertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("AkshatSurolia/ICD-10-Code-Prediction")
model = BertForSequenceClassification.from_pretrained("AkshatSurolia/ICD-10-Code-Prediction")
config = model.config

In [109]:
text = "IMMUNIZATION ADMINISTRATION (INCLUDES PERCUTANEOUS, INTRADERMAL, SUBCUTANEOUS,OR INTRAMUSCULAR INJECTIONS); 1 VACCINE (SINGLE OR COMBINATION VACCINE/TOXOID)"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)


In [110]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2510,  0.2822, -0.9486,  ..., -0.2305, -0.3414, -0.6066]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [111]:
results = output.logits.detach().cpu().numpy()[0].argsort()[::-1][:5]

In [112]:
results

array([ 9734,  2692, 12591,  9656,  9874])

In [113]:
[config.id2label[ids] for ids in results]

['S00.86', 'G40.00', 'T45.93', 'R89.4', 'S05.51']

In [29]:
# MultiHead Attention: 
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self,d_model:int, h: int, dropout: float)->None:
        super().__init__()
        self.d_model = d_model 
        self.h = h 
        assert d_model % h ==0, "d_model is not divisible by h i.e number of heads"

        self.d_k = d_model//h 
        self.w_q = nn.Linear(d_model, d_model) # Wq
        self.w_k = nn.Linear(d_model, d_model) # Wk
        self.w_v = nn.Linear(d_model, d_model) # Wv
        self.w_o = nn.Linear(d_model, d_model) # Wo
        self.dropout = nn.Dropout(dropout)


    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # ( Batch, h, seq_len, d_k) --> (Batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2,-1))
        if mask is not None: 
            attention_scores.masked_fill_(mask ==0, -1e9)
        attention_scores = attention_scores.softmax(dim = -1 )  # (Batch, h, seq_len, seq_len)
        if dropout is not None: 
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value), attention_scores


    def forward(self, q, k, v, mask):
        """ 
        If we don't want any word to interact with other word then we can mask them
        """
        query = self.w_q(q)  #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)
        key = self.w_k(k)  #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)
        value = self.w_v(v) #  (Batch, Seq_Len, d_model ) --> (Batch, Seq_Len, d_model)

        # (Batch, Seq_Len, d_model) --View--> (Batch, Seq_Len, h, d_k) ---Transpose-> (Batch, h, Seq_Len, d_k)
        query = query.view(query.shape[0],query.shape[1],self.h, self.d_k).transpose(1,2)
        key = key.view(key.shape[0],key.shape[1],self.h, self.d_k).transpose(1,2)
        value = value.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (Batch, h, seq_len, d_k) --Transpose--> (Batch, Seq_Len, h, d_k) ---> (Batch, seq_len, d_model)

        x = x.transpose(1,2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (Batch, Seq_Len, d_model) --> (Batch, Seq_Len, d_model)

        return self.w_o(x)

In [30]:
h=MultiHeadAttentionBlock(128,4,0.1)

In [31]:
x=[[[i for i in range(0,128)],[i for i in range(0,128)],[i for i in range(0,128)]]]

In [32]:
x=torch.tensor(x).float()

In [33]:
h.forward(x,x,x,mask=None)

tensor([[[-2.8514e+01,  1.3590e+01,  1.9115e+01, -8.1927e+00,  2.2603e+01,
          -8.1634e+00,  1.5603e+01, -1.4543e+01,  4.2188e+01,  1.9583e+01,
           4.2932e+01,  1.4161e+01,  3.6501e+01, -3.6716e+01, -3.1962e+01,
          -2.6261e+01,  3.7146e+00, -2.5213e+01, -4.0339e+01,  1.4971e+01,
           5.2246e+01,  3.8348e+01, -1.9493e+00,  9.3151e+00,  3.8483e+01,
          -8.2403e+00,  2.2796e+01, -1.9584e+01, -9.6835e+00,  3.2506e+00,
           5.6873e+00, -1.1017e+01,  2.5132e+01, -3.7777e+00,  1.0076e+01,
          -2.5283e+01,  2.3744e+01,  7.1734e+00,  4.6779e+01,  3.0813e+01,
          -2.1130e+01, -5.6546e+01, -3.7198e+00,  8.2197e+00,  2.9527e+01,
           2.3327e+01, -1.7878e+01, -3.4118e+01, -1.5041e+01, -7.0320e+00,
           1.4487e+01, -2.9288e+01,  4.8180e-01,  2.7899e+01, -1.0182e+01,
           3.6482e+01,  4.7782e+00, -1.5621e+01,  1.8825e+01,  2.9894e+01,
           2.7437e+00,  3.4271e+01, -2.5421e+01,  3.1532e+01,  9.2379e+00,
           6.1850e+01, -4

In [53]:
h.attention_scores[0][1]

tensor([[0.3704, 0.3704, 0.3704],
        [0.0000, 0.3704, 0.3704],
        [0.0000, 0.3704, 0.3704]], grad_fn=<SelectBackward0>)

In [23]:
16*32

512

In [35]:
x.shape

torch.Size([1, 3, 128])

In [45]:
x.transpose(0,1).is_contiguous()

True

In [46]:
x.transpose(0,1).contiguous().view(1,3,128)

tensor([[[  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
           11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
           22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
           33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
           44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
           55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
           66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
           77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
           88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
           99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
          110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
          121., 122., 123., 124., 125., 126., 127.],
         [  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
           11.,  12.,  13.,  14.,  

In [57]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float ) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1 and B1 
        self.dropout = nn.Dropout(dropout)
        self.linear_2=nn.Linear(d_ff, d_model) # W2 and B2 

    def forward(self,x):
        # (Batch, Seq_Leln, d_model) --> (Batch, Seq_Len, d_ff) ---> (Batch, Seq_Len, d_model)
        x = self.linear_1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        return x 

In [58]:
f=FeedForwardBlock(128,256,0.1)

In [60]:
f.forward(x).shape

torch.Size([1, 3, 128])

In [91]:
class A: 
    def __init__(self,a,b):
        self.a=a
        self.b=b
        self.ds=[1,2,3,4,5,5,34,3,4,6]
    def su_(self,c):
        return self.a+self.b+self.c
    def __len__(self):
        return len(self.ds)

In [89]:
A(3,4).len()

AttributeError: 'A' object has no attribute 'len'

In [92]:
len(A(3,4))

10

In [76]:
dir(B)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'new_sum']

In [83]:
def iterator(a):
    for i in range(a):
        yield i

In [85]:
for i in iterator(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [93]:
def causal_mask(size):
    mask = torch.triu(torch.ones(1,size,size), diagonal=1).type(torch.int)

    return mask == 0 

In [94]:
causal_mask(25)

tensor([[[ True, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False],
         [ True,  True,  True,  True, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False],
         [ True,  True,  True,  True,  True, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False,

In [102]:
size=4
mask = torch.triu(torch.ones(1,size,size), diagonal=1).type(torch.int)

In [103]:
mask

tensor([[[0, 1, 1, 1],
         [0, 0, 1, 1],
         [0, 0, 0, 1],
         [0, 0, 0, 0]]], dtype=torch.int32)

In [106]:
import torch 
import torch.nn as nn 
from torch.utils.data import Dataset

class BilingualDataset(Dataset):

    def __init__(self, ds, tokenizer_src, tokenizer_tgt,src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt 
        self.src_lang = src_lang 
        self.tgt_lang = src_lang 

        self.sos_token = torch.Tensor([tokenizer_src.token_to_id(['[SOS]'])], dtype=torch.int64)
        self.eos_token = torch.Tensor([tokenizer_src.token_to_id(['[EOS]'])], dtype=torch.int64)
        self.pad_token = torch.Tensor([tokenizer_src.token_to_id(['[PAD]'])], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, index):
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens <0: 
            raise ValueError('sentence is too long') 
        
        # Add SOS and EOS to the source text
        encoder_input = torch.cat(
            [self.sos_token,
             torch.tensor(enc_input_tokens, dtype=torch.int64),
             self.eos_token,
             torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)
             ]
        )
        # Add SOS to the decoder input
        decoder_input = torch.cat(
            [ 
                self.sos_token, 
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)

            ]
        )

        # Add EOS to the label (What we expect as output from the decoder)

        label = torch.cat(

            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)


            ]
        )

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input":encoder_input, # (Seq_Len)
            "decoder_input":decoder_input , # (Seq_Len)
            "encoder_mask":(encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1,1, Seq_Len)
            "decoder_mask":(decoder_input != self.pad_token).unseueeze(0).unseueeze(0).int() & causal_mask(decoder_input.size(0))  # (1, Seq_Len)  &  (1,Seq_Len, Seq_Len)

        } 

def causal_mask(size):
    mask = torch.triu(torch.ones(1,size,size), diagonal=1).type(torch.int)

    return mask == 0 



In [3]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader, random_split

from datasets import load_dataset
from tokenizers import Tokenizer 
from tokenizers.models import WordLevel 
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace 
from dataset import causal_mask
from model import build_transformer

from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import warnings
from config import get_config, get_weights_file_path


In [4]:
import warnings

In [116]:
def get_config():
    return {
        "batch_size":8, 
        "num_epochs":20,
        "lr":10**-4, 
        "seq_len":350, 
        "d_model":512,
        "lang_src": "en",
        "lang_tgt":"it",
        "model_folder":"weights",
        "model_filename":"tmodel_",
        "preload":None,
        "tokenizer_file":"tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"

    }

def get_weights_file_path(config,epoch: str):
    model_folder = config['model_folder']
    model_basename = config['model_basename']
    model_filename = f"{model_basename}{epoch}.pt"
    return str(path('.') / model_folder / model_filename)



In [4]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float ) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1 and B1 
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2 

    def forward(self,x):
        # (Batch, Seq_Leln, d_model) --> (Batch, Seq_Len, d_ff) ---> (Batch, Seq_Len, d_model)
        #x = self.linear_1(x)
        #x = torch.relu(x)
        #x = self.dropout(x)
        #x = self.linear_2(x)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x)))) 
class ResidualConnection(nn.Module):
    
    def __init__(self, dropout:float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()
    
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))



In [19]:
model = FeedForwardBlock(64, 2048, 0.1)

In [20]:
x = [[[i for i in range(0,64)],[i for i in range(0,64)],[i for i in range(0,64)]]]

In [21]:
x=torch.tensor(x).float()

In [87]:
df = pd.DataFrame()

In [72]:
df["hindi"]

In [103]:
f1=open("data/train.en")
f2=open("data/train.hi")
hindi=[]
english=[]
count=0

for eng in f1.readlines():
    #eng=eng.strip()
    english.append(eng)
    if count>50000:
        break
    count=count+1
count=0
for hin in f2.readlines():
    #hin=hin.strip()
    hindi.append(hin)
    if count>50000:
        break
    count=count+1

In [107]:
df.to_csv("data/train.csv",index=False)

In [108]:
df=pd.read_csv("data/train.csv")

In [110]:

df.head(20)

Unnamed: 0,english,hindi
0,"However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles",आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।
1,"Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.",और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है
2,The value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.,"जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प्रजातियों की समृद्धि के मामले में उनकी संख्या अन्य जीव समूहों से ज़्यादा है।"
3,Mithali To Anchor Indian Team Against Australia in ODIs,आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को
4,"After the assent of the Honble President on 8thSeptember, 2016, the 101thConstitutional Amendment Act, 2016 came into existence","8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍वीकृति मिलने के बाद 101वां संविधान संशोधन अधिनियम, 2016 अस्तित्‍व में आया"
5,The court has fixed a hearing for February 12,अदालत ने इस मामले में आगे की सुनवाई के लिए एक फरवरी की तारीख़ तय की
6,Please select the position where the track should be split.,"जहाँ पर ट्रैक को विभाजित किया जाना है, कृपया वह स्थान चुनें."
7,"As per police, armys 22RR, special operation Group (SOG) of police and the Central Reserve Police Force (CRPF) cordoned the village and launched search operation in the area.","इसके तुरंत बाद सेना की 22 राष्ट्रीय राइफल्स (आरआर), सीआरपीएफ और पुलिस के स्पेशल ऑपरेशन ग्रुप (एसओजी) के जवानों द्वारा इलाके की घेराबंदी कर तलाशी अभियान चलाया।"
8,Jharkhand chief minister Hemant Soren,झारखंड के मुख्यमंत्री हेमंत सोरेन (फोटोः पीटीआई)
9,"Arvind Kumar, SHO of the sector 55/56 police station, said a case has been registered under section 376-D (gang rape) of the Indian Penal Code.",सेक्टर 55/56 के एसएचओ अरविंद कुमार ने बताया कि इस मामले में आईपीसी की धारा 376-डी (गैंगरेप) के तहत मामला दर्ज कर लिया गया है।


In [132]:
def x(file):
    for i in file.readlines():
        yield i.strip()

In [108]:
f1=open("data/train.en")
f2=open("data/train_sample.en","w")
count=0
for i in f1.readlines():
    count=count+1
    f2.write(i)
    if count>5000:
        break
f1.close()
f2.close()

In [22]:
f1=open("data/train.hi")
count=0
for i in f1.readlines():
    count=count+1
print(count)

10125706


In [6]:

from tokenizers import Tokenizer 
from tokenizers.models import WordLevel 
from tokenizers.trainers import WordLevelTrainer
from pathlib import Path
from tokenizers.pre_tokenizers import Whitespace


def get_all_sentences(ds,lang):
    file=open(ds,'r')
    for i in file.readlines():
        yield i.strip()

def get_or_build_tokenizer(ds,lang):
    tokenizer_path = Path("tokenizer_{}.json".format(lang))
    if not Path.exists(tokenizer_path):
        tokenizer =Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[SOS]","[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds,lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    
    return tokenizer 

In [7]:
lang="english"
ds="data/train_sample.en"
get_or_build_tokenizer(ds,lang)

<tokenizers.Tokenizer at 0x34d359630>

In [191]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],min_frequency=2,vocab_size=10000)
tokenizer_path = Path("bpe_tokenizer_{}.json".format(lang))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(get_all_sentences(ds,lang), trainer=trainer)
tokenizer.save(str(tokenizer_path))







In [201]:
x=tokenizer.encode("If the sequence has length n, there are n−1 possible pairs.").ids

In [202]:
tokenizer.decode(x)

'If the sequence has length n , there are n 1 possible pair s .'

In [188]:
BpeTrainer?

[0;31mInit signature:[0m [0mBpeTrainer[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Trainer capable of training a BPE model

Args:
    vocab_size (:obj:`int`, `optional`):
        The size of the final vocabulary, including all tokens and alphabet.

    min_frequency (:obj:`int`, `optional`):
        The minimum frequency a pair should have in order to be merged.

    show_progress (:obj:`bool`, `optional`):
        Whether to show progress bars while training.

    special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
        A list of special tokens the model should know of.

    limit_alphabet (:obj:`int`, `optional`):
        The maximum different characters to keep in the alphabet.

    initial_alphabet (:obj:`List[str]`, `optional`):
        A list of characters to include in the initial alphabet, even
        if not 

In [8]:
from config import get_config

In [9]:
x=get_config()

In [10]:
x

{'batch_size': 8,
 'num_epochs': 20,
 'lr': 0.0001,
 'seq_len': 350,
 'd_model': 8,
 'head': 8,
 'encoder': 6,
 'decoder': 6,
 'datasource': 'data',
 'lang_src': 'en',
 'lang_tgt': 'hi',
 'model_folder': 'weights',
 'model_basename': 'tmodel_',
 'preload': 'latest',
 'tokenizer_file': 'tokenizer_{0}.json',
 'experiment_name': 'runs/tmodel'}

In [12]:
from utils import bpe_tokenizer

In [17]:
from config import get_config as config

In [21]:
config()

{'batch_size': 8,
 'num_epochs': 20,
 'lr': 0.0001,
 'seq_len': 350,
 'd_model': 8,
 'head': 8,
 'encoder': 6,
 'decoder': 6,
 'datasource': 'data',
 'lang_src': 'en',
 'lang_tgt': 'hi',
 'model_folder': 'weights',
 'model_basename': 'tmodel_',
 'preload': 'latest',
 'tokenizer_file': 'tokenizer_{0}.json',
 'experiment_name': 'runs/tmodel'}

In [42]:
from tokenizer import bpe_tokenizer

In [3]:

tokenizer_path="bpe_tokenizer_en.json"
tokenizer = Tokenizer.from_file(str(tokenizer_path))

NameError: name 'Tokenizer' is not defined

[4448,
 3084,
 4148,
 3091,
 5110,
 16,
 5110,
 9335,
 7221,
 5065,
 15,
 12137,
 16,
 9632,
 3091,
 4899,
 9323,
 22088,
 3148,
 10599,
 15,
 22161,
 8343,
 3088,
 3155,
 9199]

In [1]:
from tokenizer import bpe_tokenizer   #,get_all_sentences
from config import get_config

In [2]:
config = get_config() 
lang="en"
data_path =config['datasource'].format(lang)
ds = open(data_path,'r')
bpe_tokenizer(config, ds, lang)






<tokenizers.Tokenizer at 0x110e41030>

In [65]:
from pathlib import Path

In [71]:
Path(config['datasource'].format("lang")

SyntaxError: incomplete input (3357829416.py, line 1)

In [75]:
config

{'batch_size': 8,
 'num_epochs': 20,
 'lr': 0.0001,
 'seq_len': 350,
 'd_model': 8,
 'head': 8,
 'encoder': 6,
 'decoder': 6,
 'datasource': 'data',
 'lang_src': 'en',
 'lang_tgt': 'hi',
 'model_folder': 'weights',
 'model_basename': 'tmodel_',
 'preload': 'latest',
 'tokenizer_file': 'tokenizer_{0}.json',
 'experiment_name': 'runs/tmodel'}

In [7]:
from datasets import load_dataset 

In [9]:

lang_src="en"
lang_tgt="it"
ds_raw = load_dataset("opus_books",f'{"en"}-{"it"}', split='train')

In [10]:
type(ds_raw)

datasets.arrow_dataset.Dataset

In [12]:
dir(ds_raw)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguo

In [13]:
import os
import pandas as pd
from torchvision.io import read_image
from pathlib import PATH

class dataset(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        self.file = PATH("data/train_sample.en")
    def __len__(self):
        return len(self.file)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

(32332, 2)

In [31]:
import os
import pandas as pd
from torch.utils.data import Dataset

class dataset(Dataset):
    def __init__(self):
        self.file = pd.read_csv("data/train.csv")
    def __len__(self):
        return len(self.file)
    def __getitem__(self,idx):
        src = self.file.iloc[idx]['english']
        tgt = self.file.iloc[idx]['hindi']
        return src,tgt
        

In [97]:
import os
import pandas as pd
from torch.utils.data import Dataset

class dataset(Dataset):
    def __init__(self):
        
        self.file = open("data/train.en").readlines()
    def __len__(self):
        return len(self.file.readlines())
    def __getitem__(self,idx):
        return self.file.readlines()[idx].strip()

In [101]:
file = open("data/train.en")

In [43]:
len(f.readlines())

50001

In [59]:
import random

In [104]:
a=[i for i in range(0,1000)]

In [105]:
a,b=a[:700],a[700:]

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [19]:
import requests

# URL of the file
url = "https://drive.google.com/uc?export=download&id=1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Open a local file with write-binary mode
    with open("data/sample_new.en", "wb") as file:
        # Write the content of the response (the file) to the local file
        file.write(response.content)
    print("File downloaded successfully.")
else:
    print("Failed to download the file. Status code:", response.status_code)


File downloaded successfully.


In [23]:
import pandas as pd

In [24]:
f=pd.read_csv("https://drive.google.com/uc?export=download&id=1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7")

Unnamed: 0,"<!DOCTYPE html><html><head><title>Google Drive - Virus scan warning</title><meta http-equiv=""content-type"" content=""text/html; charset=utf-8""/><style nonce=""M6ccpFyNkyZIZzZSx29_dw"">.goog-link-button{position:relative;color:#15c;text-decoration:underline;cursor:pointer}.goog-link-button-disabled{color:#ccc;text-decoration:none;cursor:default}body{color:#222;font:normal 13px/1.4 arial",sans-serif;margin:0}.grecaptcha-badge{visibility:hidden}.uc-main{padding-top:50px;text-align:center}#uc-dl-icon{display:inline-block;margin-top:16px;padding-right:1em;vertical-align:top}#uc-text{display:inline-block;max-width:68ex;text-align:left}.uc-error-caption,".uc-warning-caption{color:#222;font-size:16px}#uc-download-link{text-decoration:none}.uc-name-size a{color:#15c;text-decoration:none}.uc-name-size a:visited{color:#61c;text-decoration:none}.uc-name-size a:active{color:#d14836;text-decoration:none}.uc-footer{color:#777;font-size:11px;padding-bottom:5ex;padding-top:5ex;text-align:center}.uc-footer a{color:#15c}.uc-footer a:visited{color:#61c}.uc-footer a:active{color:#d14836}.uc-footer-divider{color:#ccc;width:100%}.goog-inline-block{position:relative;display:-moz-inline-box;display:inline-block}* html .goog-inline-block{display:inline}*:first-child+html .goog-inline-block{display:inline}sentinel{}</style><link rel=""icon"" href=""//ssl.gstatic.com/docs/doclist/images/drive_2022q3_32dp.png""/></head><body><div class=""uc-main""><div id=""uc-dl-icon"" class=""image-container""><div class=""drive-sprite-aux-download-file""></div></div><div id=""uc-text""><p class=""uc-warning-caption"">Google Drive can't scan this file for viruses.</p><p class=""uc-warning-subcaption""><span class=""uc-name-size""><a href=""/open?id=1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7"">train.en</a> (954M)</span> is too large for Google to scan for viruses. Would you still like to download this file?</p><form id=""download-form"" action=""https://drive.usercontent.google.com/download"" method=""get""><input type=""submit"" id=""uc-download-link"" class=""goog-inline-block jfk-button jfk-button-action"" value=""Download anyway""/><input type=""hidden"" name=""id"" value=""1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7""><input type=""hidden"" name=""export"" value=""download""><input type=""hidden"" name=""confirm"" value=""t""><input type=""hidden"" name=""uuid"" value=""8f8d43f2-7ccf-41f7-bfe2-c699ab864b26""></form></div></div><div class=""uc-footer""><hr class=""uc-footer-divider""></div></body></html>"


In [18]:
import requests
from tqdm import tqdm

# URL of the file
url = "https://drive.google.com/uc?export=download&id=1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7"

# Start a session
session = requests.Session()

# Get the initial response to handle redirections and fetch cookies
response = session.get(url, stream=True)

# Find the confirm token in the response text
for key, value in response.cookies.items():
    if key.startswith('download_warning'):
        confirm_token = value
        break
else:
    confirm_token = None

if confirm_token:
    params = {'id': '1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7', 'confirm': confirm_token}
    response = session.get(url, params=params, stream=True)

# Total file size in bytes
file_size = int(response.headers.get('Content-Length', 0))

# Download the file in chunks and display progress
chunk_size = 10240
with open("data/downloaded_file.en", "wb") as file:
    for data in tqdm(response.iter_content(chunk_size=chunk_size), total=file_size // chunk_size, unit='MB'):
        file.write(data)

print("File downloaded successfully.")


1MB [00:00, 1940.91MB/s]

File downloaded successfully.





In [12]:
import requests 
import time 
from multiprocessing import cpu_count 
from multiprocessing.pool import ThreadPool


In [26]:
import requests
from tqdm import tqdm

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def download_file_from_google_drive(file_id, destination):
    URL = "https://drive.google.com/uc?export=download"
    
    with requests.Session() as session:
        response = session.get(URL, params={'id': file_id}, stream=True)
        token = get_confirm_token(response)

        if token:
            params = {'id': file_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
        
        save_response_content(response, destination)

def save_response_content(response, destination):
    chunk_size = 32 * 1024
    total_size = int(response.headers.get('content-length', 0))
    with open(destination, "wb") as f:
        for chunk in tqdm(response.iter_content(chunk_size), total=total_size // chunk_size, unit='KB'):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

file_id = '1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7'
destination = 'downloaded_file'

download_file_from_google_drive(file_id, destination)

print("File downloaded successfully.")


1KB [00:00, 1154.82KB/s]

File downloaded successfully.





In [28]:
#taken from this StackOverflow answer: https://stackoverflow.com/a/39225039
import requests

def download_file_from_google_drive(id, destination):
    URL = "https://drive.google.com/uc?export=download&id=1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF7"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in tqdm(response.iter_content(CHUNK_SIZE)):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

In [29]:
file_id = '1Tit_Fz9pfQhQxlSGGOkGRgw59TuVtpF'
destination = 'data/myfile.en'
download_file_from_google_drive(file_id, destination)

1it [00:00, 1911.72it/s]


In [2]:
from datasets import load_dataset

ds = load_dataset("cfilt/iitb-english-hindi")

In [6]:
x=ds['train'][:10]

In [7]:
x

{'translation': [{'en': 'Give your application an accessibility workout',
   'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
  {'en': 'Accerciser Accessibility Explorer',
   'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
  {'en': 'The default plugin layout for the bottom panel',
   'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
  {'en': 'The default plugin layout for the top panel',
   'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
  {'en': 'A list of plugins that are disabled by default',
   'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'},
  {'en': 'Highlight duration', 'hi': 'अवधि को हाइलाइट रकें'},
  {'en': 'The duration of the highlight box when selecting accessible nodes',
   'hi': 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि'},
  {'en': 'Highlight border color',
   'hi': 'सीमांत (बोर्डर) के रंग को हाइलाइट करें'},
  {'en': 'The color and opacity of the highlight border.',
   'hi': 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। '},
  {'en

In [17]:
x=ds['train'][:50]

In [18]:
x

{'translation': [{'en': 'Give your application an accessibility workout',
   'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
  {'en': 'Accerciser Accessibility Explorer',
   'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
  {'en': 'The default plugin layout for the bottom panel',
   'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
  {'en': 'The default plugin layout for the top panel',
   'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
  {'en': 'A list of plugins that are disabled by default',
   'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'},
  {'en': 'Highlight duration', 'hi': 'अवधि को हाइलाइट रकें'},
  {'en': 'The duration of the highlight box when selecting accessible nodes',
   'hi': 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि'},
  {'en': 'Highlight border color',
   'hi': 'सीमांत (बोर्डर) के रंग को हाइलाइट करें'},
  {'en': 'The color and opacity of the highlight border.',
   'hi': 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। '},
  {'en

In [43]:
from config import get_config
config=get_config()
ds_raw = load_dataset(config["datasource"])['train']

In [49]:
from tokenizer import bpe_tokenizer

In [58]:
tokenizer_src = bpe_tokenizer(config, ds_raw, config['lang_tgt'])

In [65]:
tokenizer_src.encode("चोरी नहीं हत्या करना था मकसद, नफरत से भरे थे हत्‍यारे; हिरासत में चार संदिग्ध").ids

[6781,
 594,
 4370,
 769,
 572,
 13978,
 17,
 11890,
 534,
 4248,
 700,
 369,
 8949,
 642,
 32,
 21983,
 531,
 764,
 11070]

In [62]:
print(x)

चोरी नहीं हत्या करना था मकसद , नफरत से भरे थे ह त्‍य ारे ; हिरासत में चार संदिग्ध
