In [1]:
# This notebook only routes between two models, but train_moe_labels.txt holds
# best-model indices over all five candidates. Keep only the samples whose best
# model is one of our two and remap the label into {0, 1}; any other class index
# would be out of range for the 2-way classifier head (and crash CUDA).

# Original label -> label used in this notebook (position in the `models` list).
label_remap = {
    0: 0,  # mariant
    3: 1,  # small_100
}

# The source text is Chinese, so pin the encoding instead of relying on the
# platform default.
with open('train.zh-en.zh', 'r', encoding='utf-8') as train_moe_text_file, \
    open('train_moe_labels.txt', 'r', encoding='utf-8') as train_moe_labels_file, \
    open('filtered_train_moe_text.txt', 'w', encoding='utf-8') as filtered_train_moe_text_file, \
    open('filtered_train_moe_labels.txt', 'w', encoding='utf-8') as filtered_train_moe_labels_file:

    # NOTE(review): zip() stops at the shorter file, so a length mismatch between
    # text and labels is silently truncated.
    for text_line, best_idx in zip(train_moe_text_file, train_moe_labels_file):
        remapped_idx = label_remap.get(int(best_idx))
        if remapped_idx is None:
            # Best model is one of the three we do not use here.
            continue

        filtered_train_moe_text_file.write(text_line)
        filtered_train_moe_labels_file.write(str(remapped_idx) + '\n')

In [1]:
from transformers import AutoModelForSeq2SeqLM, M2M100ForConditionalGeneration, AutoTokenizer
from datasets import Dataset
import torch

# Candidate translators, moved to the GPU. The position in this list is the
# class index the router is trained to predict.
models = [
    model_cls.from_pretrained(checkpoint_name).to("cuda")
    for model_cls, checkpoint_name in (
        (AutoModelForSeq2SeqLM, "Helsinki-NLP/opus-mt-zh-en"),
        (M2M100ForConditionalGeneration, "alirezamsh/small100"),
    )
]

# One tokenizer per translator, in the same order as `models`.
tokenizers = [
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in ("Helsinki-NLP/opus-mt-zh-en", "alirezamsh/small100")
]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class EnsembleModel(torch.nn.Module):
    """MLP router that scores which candidate translation model to use.

    The network is Linear -> LeakyReLU -> Dropout, twice, followed by a final
    Linear layer that emits one raw score (logit) per candidate model.

    Args:
        input_size: width of the feature vector fed to the first layer. The
            default is the value this notebook was trained with; it is
            presumably the sum of the two models' vocabulary sizes, since the
            features are their concatenated logits -- TODO confirm.
        num_classes: number of candidate models to choose between.
        device: device the linear layers are created on.

    The layer attribute names (l1 ... l7) are part of the saved state_dict
    keys, so they must not be renamed without migrating existing checkpoints.
    """

    def __init__(self, input_size=193113, num_classes=2, device='cuda'):
        super().__init__()

        self.l1 = torch.nn.Linear(input_size, 1024).to(device)
        self.l2 = torch.nn.LeakyReLU(0.1)
        self.l3 = torch.nn.Dropout(0.2)
        self.l4 = torch.nn.Linear(1024, 128).to(device)
        self.l5 = torch.nn.LeakyReLU(0.1)
        self.l6 = torch.nn.Dropout(0.2)
        self.l7 = torch.nn.Linear(128, num_classes).to(device)

    def forward(self, concatted_outputs):
        """Return unnormalised class scores for the given feature tensor.

        Args:
            concatted_outputs: tensor whose last dimension equals `input_size`.

        Returns:
            Tensor with the same leading dimensions and `num_classes` as the
            last dimension. No softmax is applied.
        """
        x = self.l1(concatted_outputs)
        x = self.l2(x)
        x = self.l3(x)
        x = self.l4(x)
        x = self.l5(x)
        x = self.l6(x)
        x = self.l7(x)
        return x

In [4]:
from utils import read_file
from torch.utils.data import Dataset as Ds, Subset

class TrainingDataset(Ds):
    """Dataset of (concatenated first-step logits, best-model label) pairs.

    Features are computed lazily: every `__getitem__` call runs each candidate
    translation model for a single decoder step on the source sentence.
    """

    def __init__(self, text_path, lab_path, models, model_tokenizers):
        '''
        Args:
            text_path: path passed to `read_file` to load the untranslated texts.
            lab_path: path passed to `read_file` to load the labels; each entry
                must be parseable by `int()`.
            models: candidate translation models. Each must expose
                `config.decoder_start_token_id` and return an object with a
                `.logits` attribute when called.
            model_tokenizers: tokenizers aligned one-to-one with `models`.
        '''
        self.untranslated_texts = read_file(text_path)
        self.best_model_idx_labels = read_file(lab_path)

        self.model_tokenizers = model_tokenizers
        self.models = models

        # One (1, 1) tensor per model holding only its decoder start token, so
        # each model produces logits for the first generated position.
        start_token_ids = [model.config.decoder_start_token_id for model in self.models]
        self.decoder_input_ids_list = [torch.tensor([[start_token_id]]).to("cuda") for start_token_id in start_token_ids]

    def __len__(self):
        """Number of source sentences."""
        return len(self.untranslated_texts)

    def __getitem__(self, idx):
        """Return `(features, label)` for sample `idx`; label is a 0-dim integer tensor."""
        untranslated_text = self.untranslated_texts[idx]
        concatted_outputs = self.create_model_input(untranslated_text)

        best_model_idx = torch.tensor(int(self.best_model_idx_labels[idx]))

        return concatted_outputs, best_model_idx

    def create_model_input(self, untranslated_text):
        """Build the router's input features for one sentence.

        Tokenizes the text with every tokenizer, runs the matching model with
        only the decoder start token, concatenates the resulting logits along
        the last dimension and squeezes away size-1 dimensions. Runs under
        `torch.no_grad()`.
        """
        with torch.no_grad():
            tokenized_texts = [
                tokenizer(untranslated_text, return_tensors="pt").to("cuda")
                for tokenizer in self.model_tokenizers
            ]
            # Use the models held by this instance, not a notebook-level global,
            # so the dataset works with whatever models it was constructed with.
            output_logits = [
                model(**tokenized_text, decoder_input_ids=decoder_input_ids).logits
                for model, tokenized_text, decoder_input_ids
                in zip(self.models, tokenized_texts, self.decoder_input_ids_list)
            ]
            concatted_outputs = torch.cat(output_logits, dim=-1)
            concatted_outputs = concatted_outputs.squeeze()
        return concatted_outputs


In [5]:
import datetime
from torch.utils.data import DataLoader

def train(model, dataset, batch_size, learning_rate, num_epoch, model_path=None):
    """Train `model` as a classifier with Adam and cross-entropy loss.

    Args:
        model: torch module mapping a batch of features to per-class scores.
        dataset: dataset yielding `(features, label)` pairs.
        batch_size: mini-batch size for the shuffled DataLoader.
        learning_rate: Adam learning rate.
        num_epoch: number of passes over `dataset`.
        model_path: where to save the checkpoint (a dict with the key
            'model_state_dict'). If None, nothing is saved.

    Prints the mean loss every 100 steps and the total training time.
    """
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Send batches to wherever the model lives rather than hard-coding 'cuda'.
    device = next(model.parameters()).device

    start = datetime.datetime.now()
    for epoch in range(num_epoch):
        model.train()
        running_loss = 0.0
        for step, data in enumerate(data_loader, 0):
            # data is a list of [features, labels]
            features = data[0].to(device)
            best_model_idx = data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass: raw class scores (CrossEntropyLoss applies log-softmax itself)
            logits = model(features)
            loss = criterion(logits, best_model_idx)

            # backward pass and parameter update
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # print the mean loss every 100 iterations and reset the accumulator
            if step % 100 == 99:
                print('[%d, %5d] loss: %.10f' %
                    (epoch + 1, step + 1, running_loss / 100))
                running_loss = 0.0

    end = datetime.datetime.now()

    # Saving is optional: the default model_path=None used to crash torch.save.
    if model_path is not None:
        checkpoint = {
            'model_state_dict': model.state_dict(),
        }
        torch.save(checkpoint, model_path)
        print('Model saved in ', model_path)

    # total_seconds() covers the whole duration; .seconds drops whole days.
    print('Training finished in {} minutes.'.format((end - start).total_seconds() / 60.0))

In [6]:
import numpy as np

# Init training data
subset_size = 1000
dataset = TrainingDataset("filtered_train_moe_text.txt", "filtered_train_moe_labels.txt", models, tokenizers)
# Clamp to the dataset length so a filtered file with fewer than `subset_size`
# samples does not produce out-of-range indices at load time.
indices = list(range(min(subset_size, len(dataset))))
subset = Subset(dataset, indices)

In [9]:
# Train a fresh router on the subset and save the checkpoint to model.pt.
train(
    model=EnsembleModel().to('cuda'),
    dataset=subset,
    batch_size=2,
    learning_rate=0.001,
    num_epoch=3,
    model_path='model.pt',
)

[1,   100] loss: 882.2700651270
[1,   200] loss: 211.9515381194
[1,   300] loss: 176.9301564360
[1,   400] loss: 89.5297221100
[1,   500] loss: 48.2920158433
[2,   100] loss: 62.6752626082
[2,   200] loss: 24.5777661010
[2,   300] loss: 12.8263265242
[2,   400] loss: 61.0250398494
[2,   500] loss: 34.1846045462
[3,   100] loss: 20.6639028169
[3,   200] loss: 35.1202390841
[3,   300] loss: 31.6503650487
[3,   400] loss: 15.4539192250
[3,   500] loss: 21.2763187668
Model saved in  model.pt
Training finished in 22.333333333333332 minutes.


In [10]:
def predict_sentence_from_model(dataset, model, untranslated_text):
    """Translate one sentence with the candidate model the router picks.

    Args:
        dataset: object providing `create_model_input(text)`, `models` and
            `model_tokenizers` (aligned lists).
        model: router returning one score per candidate model.
        untranslated_text: source sentence to translate.

    Returns:
        The decoded translation string from the selected model.
    """
    # Route in eval mode and without autograd: with dropout active the choice
    # would be random from call to call. The caller's mode is restored after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            model_input = dataset.create_model_input(untranslated_text)
            # Plain int so it can index Python lists unambiguously.
            best_idx = int(torch.argmax(model(model_input)))
    finally:
        model.train(was_training)

    model_chosen = dataset.models[best_idx]
    model_tokenizer_chosen = dataset.model_tokenizers[best_idx]

    inputs = model_tokenizer_chosen(untranslated_text, return_tensors="pt").to("cuda")
    outputs = model_chosen.generate(**inputs)
    decoded_outputs = model_tokenizer_chosen.decode(outputs[0], skip_special_tokens=True)
    return decoded_outputs

# Load model

In [11]:
# The checkpoint holds only a state_dict, so load it with weights_only=True:
# this avoids unpickling arbitrary objects.
checkpoint = torch.load('model.pt', weights_only=True)
model_state_dict = checkpoint['model_state_dict']

# Put the router in eval mode right away so dropout is off for inference.
trained_model = EnsembleModel().to('cuda').eval()
trained_model.load_state_dict(model_state_dict)

  checkpoint = torch.load('model.pt')


<All keys matched successfully>

In [None]:
# Translate the test set with the TRAINED router. Building a fresh
# EnsembleModel() per line would route with random weights and re-allocate the
# whole network on every iteration.
trained_model.eval()  # make sure dropout is disabled while routing

with open('wmttest2022.zh', 'r', encoding='utf-8') as source_file, \
    open('pred.txt', 'w', encoding='utf-8') as pred_file:
    for i, source_line in enumerate(source_file):
        pred = predict_sentence_from_model(dataset, trained_model, source_line)
        pred_file.write(pred + '\n')
        print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

Experimental code below — not part of the main pipeline and safe to ignore.


In [None]:
def translate_with_model(model, tokenizer, text, num_beams=5):
    """Translate `text` with beam search, print the result, return the raw ids.

    The decoded string is only printed; the return value is whatever
    `model.generate` produced.
    """
    encoded = tokenizer(text, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, num_beams=num_beams, early_stopping=True)
    print(tokenizer.decode(generated[0], skip_special_tokens=True))
    return generated

# Smoke test: translate a short sentence with the first model and show the raw ids.
statement_to_translate = "这个苹果怎么样"
outputs = translate_with_model(
    model=models[0],
    tokenizer=tokenizers[0],
    text=statement_to_translate,
)
print(outputs)

tensor([[65000,   904,   181,    56, 39307,    23,     0]], device='cuda:0')


In [27]:
example = "这些成果的主要研究者都是学生，研究覆盖了环境、机械、能源、医疗、生命科学、人文教育等各大领域，同学们从一个好奇的点子开始，创造出了许多具有应用价值的高端发明，其中一些项目已在国内国际获奖。"

# Translate the same sentence with every (model, tokenizer) pair; the unpacking
# expects exactly two pairs.
t1, t2 = (
    translate_with_model(candidate_model, candidate_tokenizer, example)
    for candidate_model, candidate_tokenizer in zip(models, tokenizers)
)

In [None]:
# First-decoder-step logits from model 0, using its own start token and tokenizer.
start_token_id1 = models[0].config.decoder_start_token_id
decoder_input_ids1 = torch.tensor([[start_token_id1]], device="cuda")

inputs1 = tokenizers[0](example, return_tensors="pt").to('cuda')
logits1 = models[0](**inputs1, decoder_input_ids=decoder_input_ids1).logits


# Same for model 1. It must use ITS OWN start token and tokenizer: feeding it
# model 0's token ids would index the wrong vocabulary.
start_token_id2 = models[1].config.decoder_start_token_id
decoder_input_ids2 = torch.tensor([[start_token_id2]], device="cuda")

inputs2 = tokenizers[1](example, return_tensors="pt").to('cuda')
logits2 = models[1](**inputs2, decoder_input_ids=decoder_input_ids2).logits

print(logits1)
print(logits2)

In [None]:
# Prototype of the router: two untrained linear layers over the concatenated logits.
print(logits1.size(), logits2.size())

hidden_size = 128
input_size = logits1.shape[-1] + logits2.shape[-1]

l1 = torch.nn.Linear(in_features=input_size, out_features=hidden_size).to("cuda")
l2 = torch.nn.Linear(in_features=hidden_size, out_features=2).to("cuda")

catted_logits = torch.cat((logits1, logits2), dim=-1)
x = l2(l1(catted_logits))

# Normalise the two scores along the last axis (dim 2); shown as the cell output.
torch.nn.Softmax(dim=2)(x)

In [None]:
# Route with the trained router loaded from model.pt; a fresh EnsembleModel()
# has random weights, so its choice of translator would be meaningless.
predict_sentence_from_model(dataset, trained_model, example)