In [1]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer that belongs to the Bangla BERT checkpoint. Ending the
# cell on the bare name displays the tokenizer's repr.
bnbert_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='sagorsarker/bangla-bert-base',
)

bnbert_tokenizer

PreTrainedTokenizerFast(name_or_path='sagorsarker/bangla-bert-base', vocab_size=101975, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [2]:
# Split one short Bangla sentence into the tokenizer's pieces; the list of
# pieces is the cell's displayed value.
text = "আমি বাংলায় গান গাই।"
bnbert_tokenizer.tokenize(text=text)

['আমি', 'বাংলা', '##য', 'গান', 'গাই', '।']

In [3]:
from transformers import MarianTokenizer, MarianMTModel
from typing import List

In [4]:
# Language pair for the translation checkpoint: source code first, target second.
src, trg = 'bn', 'en'

# Sentence reused by the later cells as the translation input.
sample_text = "আমি বাংলায় গান গাই।"

# Checkpoint identifier assembled from the language pair.
model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'

model_name

'Helsinki-NLP/opus-mt-bn-en'

In [5]:
# Load the Marian translation model for the checkpoint named in `model_name`.
# Ending the cell on the bare name prints the module tree.
model = MarianMTModel.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(63597, 512, padding_idx=63596)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(63597, 512, padding_idx=63596)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [6]:
try:
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    print(tokenizer)
except Exception as e:
    print(e)
    print('installing')
    !pip install sentencepiece
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    print(tokenizer)

Downloading:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

PreTrainedTokenizer(name_or_path='Helsinki-NLP/opus-mt-bn-en', vocab_size=63597, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})


In [7]:
# Encode the sample sentence as a one-element batch of PyTorch tensors and
# display the resulting encoding.
batch = tokenizer(
    [sample_text],
    return_tensors="pt",
)

batch

{'input_ids': tensor([[   51, 48920,  2622, 18393,    12,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [8]:
# Generate target-language token ids from the encoded batch. The two entries
# of `batch` are passed by name instead of being unpacked with **.
gen = model.generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
)

gen

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


tensor([[63596,    35,  9517,    15, 18428,     4,     0]])

In [9]:
# Turn the generated ids back into text, leaving out special tokens.
tokenizer.batch_decode(
    gen,
    skip_special_tokens=True,
)


['I sing in Bangla.']

In [10]:
# Write the tokenizer files into a directory named after `model_name`; the
# returned tuple of written paths is the cell's displayed value.
# NOTE(review): the directory name is identical to the Hub id, so a later
# from_pretrained(model_name) run from this working directory presumably
# resolves to this local copy — confirm that is intended.
tokenizer.save_pretrained(save_directory=model_name)

('Helsinki-NLP/opus-mt-bn-en\\tokenizer_config.json',
 'Helsinki-NLP/opus-mt-bn-en\\special_tokens_map.json',
 WindowsPath('Helsinki-NLP/opus-mt-bn-en/source_spm'),
 WindowsPath('Helsinki-NLP/opus-mt-bn-en/target_spm'),
 WindowsPath('Helsinki-NLP/opus-mt-bn-en/vocab'),
 WindowsPath('Helsinki-NLP/opus-mt-bn-en/tokenizer_config_file'),
 'Helsinki-NLP/opus-mt-bn-en\\added_tokens.json')

In [11]:
# Write the model into a directory named after `model_name`.
# NOTE(review): the directory name is identical to the Hub id, so a later
# from_pretrained(model_name) run from this working directory presumably
# resolves to this local copy — confirm that is intended.
model.save_pretrained(save_directory=model_name)

In [12]:
def translate(texts: List[str]) -> List[str]:
    """Translate a batch of sentences with the loaded Marian model.

    Wraps the tokenize -> generate -> decode steps so they are not
    copy-pasted every time a new sentence is tried.

    Args:
        texts: Sentences in the source language.

    Returns:
        One decoded translation per input sentence, special tokens removed.
        An empty input yields an empty list without calling the model.
    """
    if not texts:
        return []
    # padding=True lets sentences of different lengths share one tensor; for a
    # single sentence there is nothing to pad, so the encoding is unchanged.
    encoded = tokenizer(texts, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


sample_text = "আমি বাংলায় গান গাই।"
translate([sample_text])

['I sing in Bangla.']