In [1]:
# since this file only test 2 models together, and train_moe_labels.txt contains numbers from
# all 5 models, we gotta force it all into 2 numbers only. (Or else cuda will crash xpp)

with open('train_moe_labels.txt','r') as f, open('examples.txt', 'w') as w:
    for line in f:
        num = int(line)
        if num == 3:
            w.write(str(1) + '\n')
        else:
            w.write(str(0) + '\n')

In [2]:
from transformers import AutoModelForSeq2SeqLM, M2M100ForConditionalGeneration, AutoTokenizer
from datasets import Dataset
import torch

models = [
    AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en").to("cuda"),
    M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100").to("cuda")
]

tokenizers = [
    AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en"),
    AutoTokenizer.from_pretrained("alirezamsh/small100"),
]

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
class EnsembleModel(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.l1 = torch.nn.Linear(193113, 512).to('cuda')
        self.l2 = torch.nn.LeakyReLU(0.1)
        self.l3 = torch.nn.Linear(512, 2).to('cuda')

    def forward(self, concatted_outputs):
        # print(len(concatted_outputs))
        x = self.l1(concatted_outputs)
        x = self.l2(x)
        x = self.l3(x)
        return x

In [35]:
from utils import read_file
from torch.utils.data import Dataset as Ds, Subset

class TrainingDataset(Ds):
    def __init__(self, text_path, lab_path, models, model_tokenizers):
        '''
        dataset_reduce_scale = reduce the sample size of the dataset. 
        E.g dataset_reduce_scale=5 on sample size 100, basically reduce sample size from 100 to 20.
        '''
        self.untranslated_texts = read_file(text_path)
        self.best_model_idx_labels = read_file(lab_path)

        self.model_tokenizers = model_tokenizers
        self.models = models

        start_token_ids = [model.config.decoder_start_token_id for model in self.models]
        self.decoder_input_ids_list = [torch.tensor([[start_token_id]]).to("cuda") for start_token_id in start_token_ids]

    def __len__(self):
        return len(self.untranslated_texts)
    
    def __getitem__(self, idx):
        untranslated_text = self.untranslated_texts[idx]
        concatted_outputs = self.create_model_input(untranslated_text)
    
        best_model_idx = torch.tensor(int(self.best_model_idx_labels[idx]))
        
        return concatted_outputs, best_model_idx
    
    def create_model_input(self, untranslated_text):
        with torch.no_grad():
            tokenized_texts = [tokenizer(untranslated_text, return_tensors="pt").to("cuda") for tokenizer in self.model_tokenizers]
            output_logits = [model(**tokenized_text, decoder_input_ids=decoder_input_ids).logits for model, tokenized_text, decoder_input_ids in zip(models, tokenized_texts, self.decoder_input_ids_list)]
            concatted_outputs = torch.cat(output_logits, dim=-1)
            concatted_outputs = concatted_outputs.squeeze()
        return concatted_outputs
    
    # def get_chosen_sentence(self, model_output, tokenized_texts):
    #     best_idx = torch.argmax(model_output)
    #     model_chosen = self.models[best_idx]
    #     model_tokenizer_chosen = self.model_tokenizers[best_idx]
    #     
    #     tokenized_text_chosen_of_model = tokenized_texts[best_idx]
    #     outputs = model_chosen.generate(**tokenized_text_chosen_of_model)
    #     decoded_outputs = model_tokenizer_chosen.decode(outputs[0], skip_special_tokens=True)
    #     return decoded_outputs

In [36]:
import datetime
from torch.utils.data import DataLoader

def train(model, dataset, batch_size, learning_rate, num_epoch, model_path=None):
    """
    Complete the training procedure below by specifying the loss function
    and optimizers with the specified learning rate and specified number of epoch.

    """
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    start = datetime.datetime.now()
    for epoch in range(num_epoch):
        model.train()
        running_loss = 0.0
        for step, data in enumerate(data_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            untranslated_text = data[0].to('cuda')
            best_model_idx = data[1].to('cuda')

            # zero the parameter gradients
            model.zero_grad()

            # do forward propagation
            probs = model(untranslated_text)

            # calculate the loss
            loss = criterion(probs, best_model_idx)


            # do backward propagation
            loss.backward()

            # do the parameter optimization
            optimizer.step()

            # calculate running loss value for non padding
            running_loss += loss.item()

            # print loss value every 100 iterations and reset running loss
            if step % 100 == 99:
                print('[%d, %5d] loss: %.10f' %
                    (epoch + 1, step + 1, running_loss / 100))
                running_loss = 0.0

    end = datetime.datetime.now()
    
    # define the checkpoint and save it to the model path
    # tip: the checkpoint can contain more than just the model
    checkpoint = {
        'model_state_dict': model.state_dict(),
    }
    torch.save(checkpoint, model_path)

    print('Model saved in ', model_path)
    print('Training finished in {} minutes.'.format((end - start).seconds / 60.0))

In [37]:
import numpy as np

# Init training data
subset_size = 1000
dataset = TrainingDataset("train.zh-en.zh", "examples.txt", models, tokenizers)
indices = list(range(subset_size))  # Define a list of indices
subset = Subset(dataset, indices)

In [38]:
train(EnsembleModel().to('cuda'), subset, 2, 0.00000005, 3, 'model.pt')

[1,   100] loss: 0.5386774942
[1,   200] loss: 0.5442860007
[1,   300] loss: 0.4632499961
[1,   400] loss: 0.4712505801
[1,   500] loss: 0.3522375752
[2,   100] loss: 0.3660105564
[2,   200] loss: 0.4962346432
[2,   300] loss: 0.4549467961
[2,   400] loss: 0.5734355169
[2,   500] loss: 0.4184263559
[3,   100] loss: 0.4012219616
[3,   200] loss: 0.4385924774
[3,   300] loss: 0.4125304909
[3,   400] loss: 0.4419923299
[3,   500] loss: 0.4435784630
Model saved in  model.pt
Training finished in 2.0833333333333335 minutes.


In [39]:
def predict_sentence_from_model(dataset, model, untranslated_text):
    model_input = dataset.create_model_input(untranslated_text)
    best_idx = torch.argmax(model(model_input))
    model_chosen = dataset.models[best_idx]
    model_tokenizer_chosen = dataset.model_tokenizers[best_idx]
    
    inputs = model_tokenizer_chosen(untranslated_text, return_tensors="pt").to("cuda")
    outputs = model_chosen.generate(**inputs)
    decoded_outputs = model_tokenizer_chosen.decode(outputs[0], skip_special_tokens=True)
    return decoded_outputs

# Load model

In [40]:
checkpoint = torch.load('model.pt')
model_state_dict = checkpoint['model_state_dict']

trained_model = EnsembleModel().to('cuda')
trained_model.load_state_dict(model_state_dict)

  checkpoint = torch.load('model.pt')


<All keys matched successfully>

In [None]:
with open('tatoeba.zh','r') as f, open('pred.txt', 'w') as w:
    for i, line in enumerate(f):
        pred = predict_sentence_from_model(dataset, EnsembleModel(), line)
        w.write(pred + '\n')
        print(i)

FileNotFoundError: [Errno 2] No such file or directory: 'tateoba.zh'

Experimental stuff just ignore


In [None]:
def translate_with_model(model, tokenizer, text, num_beams=5):
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, num_beams=num_beams, early_stopping=True)
    decoded_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded_outputs)
    return outputs

statement_to_translate = "这个苹果怎么样"
outputs = translate_with_model(models[0], tokenizers[0], statement_to_translate)
print(outputs)

In [None]:
example = "这些成果的主要研究者都是学生，研究覆盖了环境、机械、能源、医疗、生命科学、人文教育等各大领域，同学们从一个好奇的点子开始，创造出了许多具有应用价值的高端发明，其中一些项目已在国内国际获奖。"

t1, t2 = [translate_with_model(model, tokenizer, example) for model, tokenizer in zip(models, tokenizers)]

In [None]:
start_token_id1 = models[0].config.decoder_start_token_id
decoder_input_ids1 = torch.tensor([[start_token_id1]], device="cuda")

inputs1 = tokenizers[0](example, return_tensors="pt").to('cuda')
logits1 = models[0](**inputs1, decoder_input_ids=decoder_input_ids1).logits


start_token_id2 = models[0].config.decoder_start_token_id
decoder_input_ids2 = torch.tensor([[start_token_id2]], device="cuda")

inputs2 = tokenizers[0](example, return_tensors="pt").to('cuda')
logits2 = models[1](**inputs2, decoder_input_ids=decoder_input_ids2).logits

print(logits1)
print(logits2)

In [None]:
print(logits1.size(), logits2.size())

input_size = logits1.shape[-1] + logits2.shape[-1]
hidden_size = 128

l1 = torch.nn.Linear(input_size, hidden_size).to("cuda")
l2 = torch.nn.Linear(hidden_size, 2).to("cuda")

catted_logits = torch.cat([logits1, logits2], dim=-1)
x = l1(catted_logits)
x = l2(x)
torch.nn.Softmax(dim=2)(x)

In [None]:
predict_sentence_from_model(dataset, EnsembleModel(), example)