In [1]:
!pip install transformers datasets evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacrebleu, evaluate
Successfully installed evaluate-0.4.0 sacrebleu-2.3.1
[0m

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

Global imports

In [3]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import torch
import torch.nn as nn

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# # https://huggingface.co/datasets/wmt14
# from datasets import load_dataset

# fren_ds = load_dataset("wmt14", 'fr-en')
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

NameError: name 'load_dataset' is not defined

# CANINE: Translation Task

As CANINE isn't made for Seq2Seq tasks, we have to use it another way. We decided to use CANINE as an encoder, to use its features, and stack a Decoder for Translation on it.

In order to do that, we used the EncoderDecoder of HuggingFace, in order to use another pre-trained decoder.
The resulting model being too large, we weren't able to tran it on our computers, on kaggle or on google colab.

Thus, we had to freeze the layers of either the encoder (Canine) or the decoder. We choose to freeze Canine for several reasons:
* The decoder is pre-trained with its tokenizer. Since Canine's objective is to get rid of any tokenizer, we have to fine-tune the decoder to make it forget its embedding.
* CANINE's papers prones its pre-trained features, then it's coherent to use it without fine-tunning.

Note that we have to adapt the embedding dimension of the decoder since CANINE is vocabulary free.

## Using BART - For English to French (on Books)

To familiarize with the architecture, we first try to fine-tune our model on the Books dataset, to translate from English to French.

Results were not pretty good but as good to glimpse a possibility of being good on translation.

### Model configuration

Load model

In [5]:
from transformers import CanineTokenizer, BartTokenizer, CanineConfig, BartConfig, EncoderDecoderConfig, EncoderDecoderModel

config_encoder = CanineConfig().from_pretrained("google/canine-c")
config_decoder = BartConfig().from_pretrained("facebook/bart-base")
config_decoder.vocab_size = config_encoder.eos_token_id+1
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

tokenizer_encoder = CanineTokenizer.from_pretrained("google/canine-c")
tokenizer_decoder = BartTokenizer.from_pretrained("facebook/bart-base")

model = EncoderDecoderModel(config).to(device)
model.config.decoder_start_token_id = model.decoder.config.decoder_start_token_id
model.config.pad_token_id = model.decoder.config.pad_token_id

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Config of the decoder: <class 'transformers.models.bart.modeling_bart.BartForCausalLM'> is overwritten by shared decoder config: BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_cross_attention": true,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,

Freeze encoder

In [6]:
for param in model.encoder.parameters():
    param.requires_grad = False

### Data loading

In [7]:
def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer_encoder(inputs, text_target=targets, max_length=128, truncation=True, padding="max_length")
    return model_inputs

In [8]:
from datasets import load_dataset
from torch.utils.data import DataLoader

source_lang = "en"
target_lang = "fr"

books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

tokenized_books = books.map(preprocess_function, batched=True)
tokenized_books = tokenized_books.remove_columns(["translation", "id"])
tokenized_books.set_format("torch")

small_train_dataset = tokenized_books["train"].shuffle(seed=42)#.select(range(1000))
small_eval_dataset = tokenized_books["test"].shuffle(seed=42)#.select(range(1000))

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

Downloading builder script:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading and preparing dataset opus_books/en-fr (download: 11.45 MiB, generated: 31.47 MiB, post-processed: Unknown size, total: 42.92 MiB) to /root/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf...


Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

Dataset opus_books downloaded and prepared to /root/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

### Load metrics

In [27]:
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(preds, labels):
#     preds, labels = eval_preds
    preds = preds.detach().cpu()
    labels = labels.detach().cpu()
    decoded_preds = tokenizer_encoder.batch_decode(preds, skip_special_tokens=True)
    
#     labels = np.where(labels != -100, labels, tokenizer_decoder.pad_token_id)
#     print(labels)
    decoded_labels = tokenizer_encoder.batch_decode(labels, skip_special_tokens=True)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer_encoder.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    return result

### Training

In [10]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [13]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/12709 [00:00<?, ?it/s]

In [12]:
 trainer.save_model("/kaggle/working/saves/")

NameError: name 'trainer' is not defined

In [49]:
for i, batch in enumerate(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}

    inputs = tokenizer_encoder.batch_decode(batch['input_ids'])
    truth = tokenizer_encoder.batch_decode(batch['labels'])
    outputs = model.generate(batch['input_ids'], max_length=150)
    
#     print(compute_metrics(outputs, batch['labels']))
    
    dec_outputs = tokenizer_encoder.batch_decode(outputs)
    print(f"""
> {inputs[1]}
* {truth[1]}
< {dec_outputs[1]}
    """)
    if i == 5:
        break

# decoded_labels = tokenizer_decoder.batch_decode(labels, skip_special_tokens=True)


> [CLS]"You saw?" he said.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
* [CLS]--Tu as vu? dit-il.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][

In [46]:
bleu_scores = []
for i, batch in enumerate(eval_dataloader):
    if i%200 == 0:
        print(i)
    batch = {k: v.to(device) for k, v in batch.items()}
    output = model.generate(batch['input_ids'], max_length=100)
    
    bleu_score = compute_metrics(output, batch['labels'])['bleu']
    bleu_scores.append(bleu_score)

0


KeyboardInterrupt: 

In [42]:
np.save('/kaggle/working/bleu_score200.npy', np.array(bleu_scores))

## Multilanguage (French/Hindi/Czech) to English (on wmt14)

To exploit the fully potentiel of being vocabulary-free, we have to train on different languages at once. Thus we learn to translate from french, hindi or czech (which all have very different languages properties) to english.

Note that it would have been interested to enable the translation to any language, but it would have demanded to use a specific token on input, and then to fine-tune CANINE.

### Model Configuration

Load model

In [None]:
from transformers import CanineTokenizer, CanineConfig, BartConfig, EncoderDecoderConfig, EncoderDecoderModel


tokenizer_encoder = CanineTokenizer.from_pretrained("google/canine-c")

config_encoder = CanineConfig().from_pretrained("google/canine-c")
config_decoder = BartConfig().from_pretrained("facebook/bart-base")
config_decoder.vocab_size = tokenizer_encoder.vocab_size
config_decoder.max_position_embeddings = 128

config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

model = EncoderDecoderModel(config).to(device)
model.config.decoder_start_token_id = model.decoder.config.decoder_start_token_id
model.config.pad_token_id = model.decoder.config.pad_token_id

Freeze encoder

In [None]:
for param in model.encoder.parameters():
    param.requires_grad = False

### Data loading

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader

target_lang = "en"
 
fren_ds = load_dataset("wmt14", 'fr-en') # 40836715
hien_ds = load_dataset("wmt14", 'hi-en') # 32863
csen_ds = load_dataset("wmt14", 'cs-en') # 953621

Let's make a dataset containing 30000 examples of each languages

In [None]:
sample_fren_ds_train = fren_ds["train"].shuffle().select(range(10000))
sample_hien_ds_train = hien_ds["train"].shuffle().select(range(500))
sample_csen_ds_train = csen_ds["train"].shuffle().select(range(500))

# Convert language key into 'src' to be able to merge datasets
sample_fren_ds_train = sample_fren_ds_train.map(lambda examples: format_lg(examples, "fr"), batched=True)
sample_hien_ds_train = sample_hien_ds_train.map(lambda examples: format_lg(examples, "hi"), batched=True)
sample_csen_ds_train = sample_csen_ds_train.map(lambda examples: format_lg(examples, "cs"), batched=True)

full_ds_train = concatenate_datasets((sample_fren_ds_train, sample_hien_ds_train, sample_csen_ds_train))

In [None]:
sample_fren_ds_valid = fren_ds["validation"].shuffle().select(range(500))
sample_hien_ds_valid = hien_ds["validation"].shuffle().select(range(500))
sample_csen_ds_valid = csen_ds["validation"].shuffle().select(range(500))

# Convert language key into 'src' to be able to merge datasets
sample_fren_ds_valid = sample_fren_ds_valid.map(lambda examples: format_lg(examples, "fr"), batched=True)
sample_hien_ds_valid = sample_hien_ds_valid.map(lambda examples: format_lg(examples, "hi"), batched=True)
sample_csen_ds_valid = sample_csen_ds_valid.map(lambda examples: format_lg(examples, "cs"), batched=True)

full_ds_valid = concatenate_datasets((sample_fren_ds_valid, sample_hien_ds_valid, sample_csen_ds_valid))

Data Loader

In [None]:
from torch.utils.data import DataLoader

def preprocess_function(examples):
    inputs = [example["src"] for example in examples["translation"]]
    targets = [example["en"] for example in examples["translation"]]
    model_inputs = tokenizer_encoder(inputs, text_target=targets, max_length=128, truncation=True, padding="max_length")
    return model_inputs

tokenized_ds_train = full_ds_train.map(preprocess_function, batched=True)
tokenized_ds_train = tokenized_ds_train.remove_columns(["translation"])
tokenized_ds_train.set_format("torch")

tokenized_ds_valid = full_ds_valid.map(preprocess_function, batched=True)
tokenized_ds_valid = tokenized_ds_valid.remove_columns(["translation"])
tokenized_ds_valid.set_format("torch")

train_dataloader = DataLoader(tokenized_ds_train, shuffle=True, batch_size=1)
valid_dataloader = DataLoader(tokenized_ds_valid, shuffle=True, batch_size=1)

### Training

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
model.save_pretrained("/kaggle/working/")

In [None]:
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    inputs = tokenizer_encoder.batch_decode(batch['input_ids'])
    truth = tokenizer_encoder.batch_decode(batch['labels'])
    outputs = model.generate(batch['input_ids'], max_length=100)
    dec_outputs = tokenizer_encoder.batch_decode(outputs)
    break

## Only French to English

As the embedding is too expensive, we have to reduce its length, and then only use "french" unicode characters

### Model Configuration

Load the model

In [None]:
from transformers import CanineTokenizer, CanineConfig, BartConfig, EncoderDecoderConfig, EncoderDecoderModel

tokenizer_encoder = CanineTokenizer.from_pretrained("google/canine-c")

config_encoder = CanineConfig().from_pretrained("google/canine-c")
config_decoder = BartConfig().from_pretrained("facebook/bart-base")
config_decoder.vocab_size = 2*tokenizer_encoder.eos_token_id + 1
config_decoder.max_position_embeddings = 128

config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

model = EncoderDecoderModel(config).to(device)
model.config.decoder_start_token_id = model.decoder.config.decoder_start_token_id
model.config.pad_token_id = model.decoder.config.pad_token_id;

Freeze encoder

In [None]:
for param in model.encoder.parameters():
    param.requires_grad = False

### Data Loading

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader

fren_ds = load_dataset("wmt14", 'de-en') # 40836715

Create Dataset

In [None]:
sample_ds_train = fren_ds["train"].shuffle().select(range(10000))
sample_ds_valid = fren_ds["validation"].shuffle().select(range(500))

Create Dataloader

In [None]:
from torch.utils.data import DataLoader

def preprocess_function(examples):
    inputs = [example["de"] for example in examples["translation"]]
    targets = [example["en"] for example in examples["translation"]]
    model_inputs = tokenizer_encoder(inputs, text_target=targets, max_length=128, truncation=True, padding="max_length")
    return model_inputs

tokenized_ds_train = sample_ds_train.map(preprocess_function, batched=True)
tokenized_ds_train = tokenized_ds_train.remove_columns(["translation"])
tokenized_ds_train.set_format("torch")

tokenized_ds_valid = sample_ds_valid.map(preprocess_function, batched=True)
tokenized_ds_valid = tokenized_ds_valid.remove_columns(["translation"])
tokenized_ds_valid.set_format("torch")

train_dataloader = DataLoader(tokenized_ds_train, shuffle=True, batch_size=1)
valid_dataloader = DataLoader(tokenized_ds_valid, shuffle=True, batch_size=1)

### Training

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
model.save_pretrained("/kaggle/working/")

In [None]:
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    inputs = tokenizer_encoder.batch_decode(batch['input_ids'])
    truth = tokenizer_encoder.batch_decode(batch['labels'])
    outputs = model.generate(batch['input_ids'], max_length=100)
    dec_outputs = tokenizer_encoder.batch_decode(outputs)
    break

## Using T5

In [None]:
model2 = EncoderDecoderModel.from_pretrained('/kaggle/working/save').to(device)

In [None]:
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
#     print(batch)
    inputs = tokenizer_encoder.batch_decode(batch['input_ids'])
    truth = tokenizer_encoder.batch_decode(batch['labels'])
    print(inputs[0:3])
    print('----------')
    print(truth[0:3])
    print('----------')
    outputs = model2.generate(batch['input_ids'], max_length=100)
    print(outputs.shape)
    dec_outputs = tokenizer_encoder.batch_decode(outputs)
    print(dec_outputs[0:3])
#     print('----------')
    break

# decoded_labels = tokenizer_decoder.batch_decode(labels, skip_special_tokens=True)

In [None]:
!mkdir /kaggle/working/save

In [None]:
!mv /kaggle/working/*.json /kaggle/working/save

## Using Custom Decoder

In [None]:
from transformers import CanineTokenizer, CanineConfig, CanineModel

config = CanineConfig()
tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
model = CanineModel.from_pretrained("google/canine-c").to(device)

In [None]:
MAX_LENGTH=500

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, inputs, hidden):
        print(inputs)
        embedded = self.embedding(inputs).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, inputs, hidden, encoder_outputs):
        print(inputs)
        embedded = self.embedding(inputs).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# tokenizer = CanineTokenizer.from_pretrained("google/canine-c")

fr_seqs = ["Je suis un homme"]
en_seqs = ["I am a man"]
input_tensor = tokenizer(fr_seqs, padding=True, return_tensors='pt')
output_tensor = tokenizer(en_seqs, padding=True, return_tensors='pt')

In [None]:
hidden_size = 256
encoder = EncoderRNN(config.eos_token_id+1, hidden_size).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, config.eos_token_id+1, dropout=0.1).to(device)

encoder_hidden = encoder.init_hidden()
for ei in range(input_tensor.input_ids.shape[1]):
    encoder_output, encoder_hidden = encoder(input_tensor.input_ids[ei], encoder_hidden)
    encoder_outputs[ei] = encoder_output[0, 0]

In [None]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.init_hidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

___________

## Model

In [None]:
config = CanineConfig.from_pretrained("google/canine-c")
canine_c = CanineModel.from_pretrained('google/canine-c')
tokenizer = CanineTokenizer.from_pretrained('google/canine-c')

toklen = len(tokenizer)
hid = config.hidden_size

________

## CANINE

In [None]:
from transformers import CanineTokenizer, CanineModel

model = CanineModel.from_pretrained("google/canine-c")
tokenizer = CanineTokenizer.from_pretrained("google/canine-c")

inputs = ["Life is like a box of chocolates.", "You never know what you gonna get.", "I juste want to try something with a long sentence you know."]
encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")

outputs = model(**encoding)  # forward pass
pooled_output = outputs.pooler_output 
sequence_output = outputs.last_hidden_state  # (batch_size, seq_length, hidden_dim)

In [None]:
outputs.last_hidden_state.shape

In [None]:
from datasets import load_dataset

# fren_ds = load_dataset("wmt14", 'fr-en')
books = load_dataset(";:<<<<<", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

_________________