### Preprocess
I made changes to the read_data and convert_to_index functions from the demo, so that the train/test datasets are represented as tuples of (x,y) (so that they can be batch-loaded using DataLoaader), with paddings and start/end tags.

In [34]:
# import sys
# import os
import numpy as np

In [3]:
def read_chinese_data(inputfilename):
    '''
    Reads the input file, returns a list of [(x,y)...], 
    where x = a Chinese sentence string, y = a binary list [1,0...],
    indicating whether a character is the first of a Chinese word.
    Eg, ("這次遊行的特色", [1, 1, 1, 0, 1, 1, 0,]) for 這/次/遊行/的/特色
    '''
    with open(inputfilename, "r") as inputfile:
        xy_list, collection_words, collection_labels = [], [], []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            if columns == []: # When reading a blank line
                xy_list.append((''.join(collection_words), collection_labels)) # append the (x,y)
                collection_words, collection_labels = [], [] # Reset the x,y lists
                continue
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1)) # 1 for first char, 0 for the rest
    return xy_list

def get_chars_and_ids(sentences, extra_chars=['<PAD>','<START>','<END>']):
    '''
    Arg:
        sentences: a list of sentence-strings
    Returns:
        list of the character set, plus extras (default: pad, start & end tags), 
        dict of {character:char_id}, tag-and-id by default <PAD>:0, <START>:1, <END>:2
    '''
    char_set = set((char for sen in sentences for char in sen))
    char_list = extra_chars + list(char_set)
    ids_dict = {char:i for i,char in enumerate(char_list)}
    return char_list, ids_dict

def sentence_to_ids(sentence, ids_dict, add_tags=True, padding_len=512): 
    '''Turns a sentence-string into a [ids] array, adds start/end tag by default'''
    
    ids = np.array([ids_dict[char] for char in sentence]) # string to ids
    
    if add_tags and ('<START>' in ids_dict) and ('<END>' in ids_dict):
        start_id, end_id = ids_dict['<START>'], ids_dict['<END>']
        ids = np.pad(ids, (1, 1), 'constant', constant_values=(start_id, end_id)) # pad with start/end tags
    
    pad_id = ids_dict['<PAD>'] if '<PAD>' in ids_dict else len(ids_dict)+1 # pad_id or vocabsize+1
    paddings = np.repeat(pad_id, padding_len - len(ids)+1) # Make sure even the longest sen has one padding
    ids = np.concatenate((ids, paddings))
    return ids

def convert_and_pad(raw_xy_data, ids_dict, y_pad_id=-1):
    '''Turns a list of (sentence, labels) to (padded_sentence_ids, padded_labels)'''
    
    max_len = max((len(x) for x,y in raw_xy_data))+2 # num_chars + 2 tags
    id_and_pad = lambda x : sentence_to_ids(x, ids_dict=ids_dict, padding_len=max_len)
    add_tag_and_pad_y = lambda y : np.concatenate( 
                                        (np.pad(y, 1, 'constant', constant_values=1), # label <start>/<end> as 1
                                         np.repeat(y_pad_id, max_len-len(y)-1) # pads: max_len - len(y)- 2 tags +1
                                        ))
    
    ided_and_padded = ((id_and_pad(x), np.array(len(x)+2), add_tag_and_pad_y(y)) for x,y in raw_xy_data)
    return list(ided_and_padded) # (tagged_and_padded_x, len_x_plus_two_tags, padded_y_plus_two_more_labels)

In [4]:
train_xy_raw = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')
test_xy_raw = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')

In [5]:
import pandas as pd
pd.DataFrame(train_xy_raw, columns=['x','y'])[:5]

Unnamed: 0,x,y
0,看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究...,"[1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ..."
1,其便當都是買來的，就算加熱也是由媽媽負責（後來揭曉其實是避免帶來厄運），父親則在電視台上班。,"[1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, ..."
2,這次遊行最大的特色，在於越來越多年輕人上街遊行，而且當中不乏行動激烈的躁少年。,"[1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, ..."
3,懷孕期為421至457日。,"[1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]"
4,婷婷向昏迷中的婆婆訴說，為什麼生活會與她想像的不一樣。,"[1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, ..."


In [6]:
# Sandwich x/y in start/end tags and pad to 184 long

# set of all chars and their ids
chars_list, ids_dict = get_chars_and_ids((x for x,y in train_xy_raw + test_xy_raw)) 

# Convert (sentence,labels) to (padded ids, len_padded_ids, padded labels)
train_xy = convert_and_pad(train_xy_raw, ids_dict)
test_xy = convert_and_pad(test_xy_raw, ids_dict)

pd.DataFrame(train_xy, columns=['x','x_len','y'])[:5]

Unnamed: 0,x,x_len,y
0,"[1, 1289, 1477, 1724, 3511, 128, 2476, 1233, 8...",60,"[1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ..."
1,"[1, 1640, 2879, 919, 3021, 1233, 329, 625, 249...",48,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, ..."
2,"[1, 1349, 2371, 3034, 1605, 2805, 835, 2495, 8...",41,"[1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, ..."
3,"[1, 584, 643, 1515, 553, 1703, 567, 66, 37, 17...",15,"[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, ..."
4,"[1, 2292, 2292, 920, 2793, 75, 1953, 2495, 356...",29,"[1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, ..."


In [7]:
import torch
import torch.nn as nn

class Segmenter(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, 150, batch_first=True) # in_size, hidden_size, layers=1
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(150, 2)
        self.softmax = nn.LogSoftmax(2)
        
    def forward(self, x, x_len):
        embs = self.emb(x) # B,185,200
        rnn_output, (_,_) = self.lstm(embs) # B, 185, 150
        
        # 150=>linear 2=>softmax
        output = self.sig1(rnn_output) # B, 185, 150
        output = self.lin(output) # B, 185, 150
        output = self.softmax(output) # B, 185, 2
        output = output[:, :max(x_len), :] # outshape B, max_x_len_of_batch(<=184), 2
        return output

In [10]:
from torch.utils.data import DataLoader
import torch.optim as optim

In [7]:
def train_model(train_xy, epochs, device, model, model_fn, batch_size=50, lr=0.005):
    
    m = model.to(device)
    m.train()
    
    batching = DataLoader(train_xy, batch_size=batch_size, shuffle=True)

    loss_fn = nn.NLLLoss(ignore_index=-1) # ignore y padding
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    
    for e in range(epochs):
        total_loss = 0
        for i, (x, x_len, y) in enumerate(batching):
            optimizer.zero_grad()
            
            x, y = x.to(device), y.to(device)
            
            out = m(x, x_len) # B, max_x_len, 2 => permute to B,2,max_x_len
            expect = y[:, :max(x_len)] # B, max_x_len
            
            loss = loss_fn(out.permute(0,2,1), expect) 
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            
            print(f"Epoch {e+1} avg loss {total_loss/(i+1)}", end='\r')
        print()
        torch.save(m, model_fn)

    return m

In [21]:
vocab_size = len(ids_dict)
model_A_init = Segmenter(vocab_size=vocab_size, emb_size=200)
gpu = 'cuda:3'

model_A = train_model(train_xy, epochs=30, device=gpu, model=model_A_init, model_fn='model_A')

Epoch 1 avg loss 0.38333669602870946
Epoch 2 avg loss 0.20807320140302182
Epoch 3 avg loss 0.15759735815227033
Epoch 4 avg loss 0.12261073337867856
Epoch 5 avg loss 0.09514971654862166
Epoch 6 avg loss 0.07417137511074542
Epoch 7 avg loss 0.057458917796611795
Epoch 8 avg loss 0.046411604667082426
Epoch 9 avg loss 0.036904990044422456
Epoch 10 avg loss 0.029521618806757032
Epoch 11 avg loss 0.026479475363157693
Epoch 12 avg loss 0.027870772918686275
Epoch 13 avg loss 0.023977186786942183
Epoch 14 avg loss 0.019785709120333193
Epoch 15 avg loss 0.021285586035810412
Epoch 16 avg loss 0.016378332156455143
Epoch 17 avg loss 0.014341231202706695
Epoch 18 avg loss 0.011373420292511583
Epoch 19 avg loss 0.010060079692630097
Epoch 20 avg loss 0.0074300358886830505
Epoch 21 avg loss 0.0047712409170344476
Epoch 22 avg loss 0.0028920972792548126
Epoch 23 avg loss 0.0019565121176128743
Epoch 24 avg loss 0.0016522447862371338
Epoch 25 avg loss 0.0012530678715847899
Epoch 26 avg loss 0.00108067854889

In [8]:
# Model class must be defined somewhere (no need to give model(*args))
model_A = torch.load('model_A').to('cpu')
model_A.eval()

Segmenter(
  (emb): Embedding(3650, 200, padding_idx=0)
  (lstm): LSTM(200, 150, batch_first=True)
  (sig1): Sigmoid()
  (lin): Linear(in_features=150, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=2)
)

In [11]:
with torch.no_grad():
    test_batching = DataLoader(test_xy, batch_size=10)
    for i,(x,x_len,y) in enumerate(test_batching):
        rawpredictions = model_A(x, x_len)
        break

In [12]:
[ [chars_list[i] for i in sen_ids] for sen_ids in x.detach().tolist()]

[['<START>',
  '然',
  '而',
  '，',
  '這',
  '樣',
  '的',
  '處',
  '理',
  '也',
  '衍',
  '生',
  '了',
  '一',
  '些',
  '問',
  '題',
  '。',
  '<END>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',

In [13]:
rawpredictions.argmax(2)

tensor([[1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
         0, 1, 0, 1, 0, 1],
        [1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
         0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 1, 0],
        [1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
         0, 1, 0, 1, 0, 1],
        [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
         0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 1, 0

### Part 1 - Sentence generation (15 points).
Convert the model in Demo 2.1 into a character-based sentence generator. (Strip out the word segmentation objective.)  The model should, given a start symbol, produce a variety of sentences that terminate with a stop symbol (you will have to add these to the data).  The sentences that it generates should be of reasonable average length compared to the sentences in the training corpus (this needn't be precise). 

Report and discuss the changes you made to the notebook using Markdown inside the notebook.

In [130]:
# https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/
class SentenceGenerator(nn.Module):
    def __init__(self, vocab_size, emb_size=200, lstm_size=150):
        super(Sentence_generator, self).__init__()
        self.lstm_size = lstm_size
        self.emb_size = emb_size
        self.num_layers = 1
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(self.vocab_size, self.emb_size, padding_idx=0) #V=>200
        
        self.lstm = nn.LSTM( #200=>150
            input_size=self.emb_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
#             dropout=0.2,
        )
        
        self.fc = nn.Linear(self.lstm_size, self.vocab_size) #150=>V

    def forward(self, x, hidden_state):
        embed = self.embedding(x) #B,184,200
        output, hidden_state = self.lstm(embed)
        logits = self.fc(output)
        return logits, hidden_state

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.num_layers, batch_size, self.lstm_size).zero_(),
                weight.new(self.num_layers, batch_size, self.lstm_size).zero_())


In [131]:
def train_model_B(train_xy, model, epochs, device, model_fn, batch_size=50):
    model = model.to(device)
    model.train()

    batching = DataLoader(train_xy, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for e in range(epochs):
        total_loss = 0
        
        hidden = model.init_hidden(batch_size)
        
        for i, (x,x_len,_) in enumerate(batching):
            optimizer.zero_grad()
            
            hidden = tuple([s.data for s in hidden])
            x = x.to(device)
            
            # output preditions and update hidden state
            predictions, hidden = model(x[:,:max(x_len)], hidden) # B,max_x_len => B,max_x_len,V
            expect = x[:, 1:max(x_len)+1] # next words
            
            loss = criterion(predictions.transpose(1, 2), expect)
            total_loss += loss.item()

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            print(f"epoch {e+1} avg loss: {total_loss/(i+1)}", end='\r')

        print()
        torch.save(model, model_fn)
    return model

In [132]:
vocab_size = len(ids_dict)
model_B_init = Sentence_generator(vocab_size=vocab_size)
gpu = 'cuda:0'

model_B = train_model_B(train_xy, model=model_B_init, epochs=30, device=gpu, model_fn='model_B')

epoch 1 avg loss: 6.8489125132560735
epoch 2 avg loss: 6.1349561989307435
epoch 3 avg loss: 5.8704470455646515
epoch 4 avg loss: 5.6516322493553165
epoch 5 avg loss: 5.4676197111606655
epoch 6 avg loss: 5.3089046180248265
epoch 7 avg loss: 5.1739695847034455
epoch 8 avg loss: 5.0540517389774325
epoch 9 avg loss: 4.9488528907299045
epoch 10 avg loss: 4.8528389096260075
epoch 11 avg loss: 4.7643896341323855
epoch 12 avg loss: 4.6818116128444675
epoch 13 avg loss: 4.6036221206188245
epoch 14 avg loss: 4.5322993218898775
epoch 15 avg loss: 4.4612283766269684
epoch 16 avg loss: 4.3950440645217895
epoch 17 avg loss: 4.3312052309513095
epoch 18 avg loss: 4.2702874422073365
epoch 19 avg loss: 4.2089480727911955
epoch 20 avg loss: 4.1515483319759365
epoch 21 avg loss: 4.0961863338947385
epoch 22 avg loss: 4.0424893707036975
epoch 23 avg loss: 3.9883403956890104
epoch 24 avg loss: 3.9372513562440874
epoch 25 avg loss: 3.8862759709358214
epoch 26 avg loss: 3.8368305832147637
epoch 27 avg loss: 3.

In [133]:
model_B = torch.load('model_B').to('cpu')
model_B.eval()

Sentence_generator(
  (embedding): Embedding(3650, 200, padding_idx=0)
  (lstm): LSTM(200, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=3650, bias=True)
)

In [63]:
def generate_pred_sentence(ids_dict, chars_list, model, text, consider_prev=0):
    model.eval()

    chars = text
    hidden = model.init_hidden(1) #batchsize=1

    while True:
        # (if consider_prev=0 or greater than the nr of chars, the whole sentence so far is considered)
        current_ids = [ids_dict[c] for c in chars[-consider_prev:]] 
        
        x = torch.tensor([current_ids]).long() # Consider <start> ~ last predicted char
        with torch.no_grad():
            y_pred, hidden = model(x, hidden)

        last_char_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_char_logits, dim=0).detach().numpy()
        sample_char_index = np.random.choice(len(last_char_logits), p=p)
        next_char = chars_list[sample_char_index]
#         if next_char == '<PAD>':
#             continue
        chars.append(next_char)
        if next_char=='<END>':
            break

    return chars

In [134]:
print( generate_pred_sentence(ids_dict, chars_list, model=model_B, text=['<START>','這']) )

['<START>', '這', '種', '和', '社', '芳', '了', '巨', '大', '滿', '，', '保', '核', '雲', '酸', '劑', '礦', '育', '被', '評', '為', '稱', '為', '歌', '地', '，', '也', '進', '不', '給', '目', '前', '，', '就', '西', '班', '牙', '殖', '民', '國', '富', '神', '派', '獎', '。', '<END>']


In [65]:
def generate_n_sentences(consider_prev, n_sentences = 20, print_sentences=False):

    current_len = 0
    for i in range(n_sentences):
        words = generate_pred_sentence(ids_dict, chars_list, model=model_B, text=['<START>'],
                                      consider_prev=consider_prev)
        
        if print_sentences:
            print(''.join(words), len(words))
        current_len+=len(words)
    print('Avg sentence length:', current_len/n_sentences)

In [135]:
# Trying considering different previous nr of chars, and see the avg length of generated sentences
for i in (0,1,5,10,20,30,60,100,150,184):
    print(f'consider the previous {i} characters:', end='\t')
    generate_n_sentences(consider_prev=i)


consider the previous 0 characters:	Avg sentence length: 27.0
consider the previous 1 characters:	Avg sentence length: 1341.65
consider the previous 5 characters:	Avg sentence length: 1153.0
consider the previous 10 characters:	Avg sentence length: 418.0
consider the previous 20 characters:	Avg sentence length: 70.75
consider the previous 30 characters:	Avg sentence length: 44.65
consider the previous 60 characters:	Avg sentence length: 49.1
consider the previous 100 characters:	Avg sentence length: 42.15
consider the previous 150 characters:	Avg sentence length: 38.0
consider the previous 184 characters:	Avg sentence length: 44.55


In [142]:
generate_n_sentences(consider_prev=0, print_sentences=True)

<START>分整謂十現代榮譽事誕生在唐王希遺城的席會。<END> 23
<START>在2005年8月2郡，安清朝政府為主人、美軍人運與周圍1500萬位。<END> 36
<START>大戰期的「洲藝指出1日光棍節目客紀、賈瓷、密等等同反設有些長端猶力「綠譜長漂者」稱下，美洲女歌藝子溫解死統一，可以澳門前主要性盟屬於1狀但實帶東流、澳車變化和公言（英國玉的尖）對帕爾勳為使用病支演出李八役，於1942年他藉他火星的左委託上他，所以寶幕高45秦國元，為華東方有成半丹。<END> 143
<START>8克電視列高由站盛頓全國指揮計工作的過資。<END> 23
<START>這種蜥期時間的載劉作為人可以取得白色襲觀。<END> 23
<START>新波蘭堡的自經開展合書記運動會合約，原本上將航水流變檢隱60被承人於2007年4月5日。<END> 46
<START>1993年10月20日，旨車站第一任舞人，拔於1950年首全變保護單位。<END> 38
<START>為了許多芬受的罵先後，被奇難小思化作了建立的劃動，分別就養由「泥塑安」在個人類軍傳一t。<END> 46
<START>神廳還言電塔未轉移早位。<END> 14
<START>1980年2月，故宮立國全國旗陳新國家立後歐·相投資子為青7，港登西班牙奪出家那舉行肖構我們工程的運作為，要求出來自白統射豐這面的效果，他們沒有受到照越天李回獨立黛研作。<END> 87
<START>後來另一個人爲更明他的核心，但他們只會接著他的女兒和儀式劇。<END> 32
<START>他的胡予划化堂，對當三本時與日本時期的收拾將軍足球觀藝完危。<END> 32
<START>在普問體便收太，厭靈由不可以提鑄高愛或因大都遭到解。<END> 28
<START>根據20077年人口87萬，GonipernagustertsbitellfAWore，核終無是假說「酸」以上規斯在上的翻唱，但我們已很一定貢增加菌益的成功。<END> 81
<START>例如，與瑞典再基出系統與當時他們知識需要原來自由陸系列球的率宿效體，中國歷史心市是抗議，長至10到舊村。<END> 54
<START>他往後早在知裡變成透高的所肯，使關長奧斯·馬爾將軍發達9%的飛行的面業會的指石。<END> 42
<START>該市政權在墨西哥的數呼劇家合併，各角炮頂

In [55]:
avg_sent_len = np.array([l for x,l,y in train_xy]).mean()
print('Average sentence length in train set:', avg_sent_len)

Average sentence length in train set: 41.10357768326244


### Part 2 - Dual objectives (10 points)
Copy the notebook from part 1 and augment the copy by adding back the word segmentation objective, as a second objective with its own loss.  (You could also in theory do Part 1 and Part 2 in reverse, by adding sentence generation with dual objectives first and then stripping out the word segmentation objective; this is equivalent.)  Note that multiple losses can be combined by simple, possibly weighted addition -- backpropagation works entirely correctly on the combined loss.

Report and discuss the changes you made to the notebook using Markdown inside the notebook.

In [127]:
# model(vocabsize)(x, h_state) => out1:[B,maxlen,2] ; out2:([B,maxlen-1,vocabsize], h_state)
class DualModel(nn.Module):
    def __init__(self, vocab_size, emb_size=200, lstm_size=150):
        super(DualModel, self).__init__()
        # Model Params
        self.lstm_size = lstm_size
        self.emb_size = emb_size
        self.num_layers = 1
        self.vocab_size = vocab_size
        
        # Shared functions: embed, LSTM
        self.embedding = nn.Embedding(self.vocab_size, self.emb_size, padding_idx=0) #V=>200
        self.lstm = nn.LSTM( #200=>150
            input_size=self.emb_size, hidden_size=self.lstm_size, num_layers=self.num_layers,
            batch_first=True,  )
        
        # For output1: sigmoid, linear( h_size->2 ), softmax
        self.sig1 = nn.Sigmoid()
        self.lin_to_binary = nn.Linear(self.lstm_size, 2)
        self.softmax = nn.LogSoftmax(2)
        
        # For output2: linear( h_size->voc_size )
        self.lin_to_v = nn.Linear(self.lstm_size, self.vocab_size) #150=>V
    
    def forward(self, x, hidden_state):
        # original x: B,185
        
        # shared: embedding & lstm
        embed = self.embedding(x) # B,185,200
        rnn_out, hidden_state = self.lstm(embed) # B,185,150 & h_state
        
        # output1
        output1 = self.sig1(rnn_out) # B, 185, 150
        output1 = self.lin_to_binary(output1) # B, 185, 2
        output1 = self.softmax(output1) # B, 185, 2  
            # To be sliced outside model to => output[:, :max(x_len), :], shape = B, maxlen, 2
        
        # output2
        logits = self.lin_to_v(rnn_out) #B,185,V
        output2 = (logits, hidden_state)
        
        return output1, output2
        
    #====================================================
        
    # Func to initilize a random hidden state  
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.num_layers, batch_size, self.lstm_size).zero_(),
                weight.new(self.num_layers, batch_size, self.lstm_size).zero_())
    

In [128]:
def train_dualmodel(train_xy, model, epochs, device, model_fn, batch_size=50):
    model = model.to(device)
    model.train()

    batching = DataLoader(train_xy, batch_size=batch_size, shuffle=True)
    
    cross_entropy_fn = nn.CrossEntropyLoss(ignore_index=0) # ignore sentence padding
    nll_fn = nn.NLLLoss(ignore_index=-1) # ignore y padding
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for e in range(epochs):
        total_loss = 0
        
        hidden = model.init_hidden(batch_size) # the initial random hidden state
        
        for i, (x, x_len, y) in enumerate(batching):
            optimizer.zero_grad()
            
            hidden = tuple([s.data for s in hidden])
            x, y = x.to(device), y.to(device)
            
            out1, out2 = model(x[:,:max(x_len)], hidden)
            
            # output1 loss
            out1 = out1[:, :max(x_len), :] # B, maxlen, 2
            expect1 = y[:, :max(x_len)] # B, max_x_len
            loss1 = nll_fn(out1.transpose(1, 2), expect1)
            
            # output2 loss
            pred, hidden = out2
            expect2 = x[:, 1:max(x_len)+1] # next words
            loss2 = cross_entropy_fn(pred.transpose(1, 2), expect2)
            

            loss = loss1+loss2
            total_loss += loss.item()

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            print(f"epoch {e+1} avg loss: {total_loss/(i+1)}", end='\r')

        print()
        torch.save(model, model_fn)
    return model


In [129]:
vocab_size = len(ids_dict)
dualmodel_init = DualModel(vocab_size=vocab_size)
gpu = 'cuda:3'

dualmodel = train_dualmodel(train_xy, model=dualmodel_init, epochs=30, device=gpu, model_fn='dualmodel')

epoch 1 avg loss: 7.5126067101955415
epoch 2 avg loss: 6.6897525012493135
epoch 3 avg loss: 6.3669405341148374
epoch 4 avg loss: 6.1060999870300375
epoch 5 avg loss: 5.8664906799793245
epoch 6 avg loss: 5.6555634975433355
epoch 7 avg loss: 5.4856294631958015
epoch 8 avg loss: 5.3436616778373725
epoch 9 avg loss: 5.2204542994499215
epoch 10 avg loss: 5.1105767369270325
epoch 11 avg loss: 5.0105303943157245
epoch 12 avg loss: 4.9183861732482915
epoch 13 avg loss: 4.8325890243053445
epoch 14 avg loss: 4.7522506237030035
epoch 15 avg loss: 4.6761632800102245
epoch 16 avg loss: 4.6035708576440815
epoch 17 avg loss: 4.5343680679798135
epoch 18 avg loss: 4.4683866947889335
epoch 19 avg loss: 4.4051617622375495
epoch 20 avg loss: 4.3449637979269035
epoch 21 avg loss: 4.2878345817327555
epoch 22 avg loss: 4.2329756230115895
epoch 23 avg loss: 4.1799312323331845
epoch 24 avg loss: 4.1291833281517025
epoch 25 avg loss: 4.0809933781623844
epoch 26 avg loss: 4.0340838760137564
epoch 27 avg loss: 3.

### Part 3 - Analysis (5 points)
You now have three models.  The original word segmentation model, a sentence generation model, and a dual sentence-generation/word segmentation model. 

Compare the performance on the test data of the original word segmentation model between the original objective and the dual objective model.  In how many iterations do the models converge?  What are their final F1 and accuracy scores once they've converged? Are they any different?  If so, why?

Make the same comparison between the sentence generation model and the dual-objective model, except the performance measure is the per-word perplexity on the text corpus.

Report your findings in one of the notebooks.

In [118]:
dualmodel = torch.load('dualmodel').to('cpu')
dualmodel.eval()

DualModel(
  (embedding): Embedding(3650, 200, padding_idx=0)
  (lstm): LSTM(200, 150, num_layers=3, batch_first=True, dropout=0.2)
  (sig1): Sigmoid()
  (lin_to_binary): Linear(in_features=150, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=2)
  (lin_to_v): Linear(in_features=150, out_features=3650, bias=True)
)

In [None]:
# Evaluate Segmenter vs DualModel (output1 only) & compare accuracy/f1

def train_and_eval(modelA, modelB, epochs, trainXY, testXY, device, batch_size=50, lr=0.001):
    
    # Train/test batches
    train_batches = DataLoader(train_xy, batch_size=batch_size, shuffle=True)
    test_batches = DataLoader(test_xy, batch_size=batch_size)
    
    # Models
    modelA.to(device)
    modelB.to(device)
    
    # Loss funcs & optimizers
    cross_entropy_fn = nn.CrossEntropyLoss(ignore_index=0) # ignore sentence padding
    nll_fn = nn.NLLLoss(ignore_index=-1) # ignore y padding
    optimizerA = optim.Adam(modelA.parameters(), lr=lr)
    optimizerB = optim.Adam(modelB.parameters(), lr=lr)
    
    
    # Each epoch: train models -> eval models, print accuracy and f1
    for e in range(epochs):
        hidden = modelB.init_hidden(batch_size) # the initial random hidden state
        
        # TRAIN
        modelA.train()
        modelB.train()
        for i, (x, x_len, y) in enumerate(train_batches):
            x, y = x.to(device), y.to(device)
            
            # ====Train A==========================================
            optimizerA.zero_grad()
            
            out = modelA(x, x_len) 
            expect = y[:, :max(x_len)] 
            
            lossA = nll_fn(out.permute(0,2,1), expect) 
            lossA.backward()
            optimizerA.step()
            
            
            # ====Train B==========================================
            optimizerB.zero_grad()
            
            hidden = tuple([s.data for s in hidden])
            out1, out2 = modelB(x[:,:max(x_len)], hidden)
            
            # output1 loss
            out1 = out1[:, :max(x_len), :] 
            expect1 = y[:, :max(x_len)] 
            loss1 = nll_fn(out1.transpose(1, 2), expect1)
            # output2 loss
            pred, hidden = out2
            expect2 = x[:, 1:max(x_len)+1] # next words
            loss2 = cross_entropy_fn(pred.transpose(1, 2), expect2)

            lossB = loss1+loss2
            lossB.backward()
            nn.utils.clip_grad_norm_(modelB.parameters(), 1)
            optimizerB.step()
            
        
        # TEST
        modelA.eval()
        modelB.eval()
        with torch.no_grad():
            for i, (x, x_len, y) in enumerate(test_batches):
                x, y = x.to(device), y.to(device) # input-sentence, gold-labels
                
                # ModelA predictions => compare with y
                raw_predsA = modelA(x, x_len)
                binary_predsA = raw_predsA.argmax(2)
                
                # ModelB predictions => compare with y
                hidden = modelB.init_hidden(batch_size)
                raw_predsB, _ = modelB(x[:,:max(x_len)], hidden) # only need the first output
                binary_predsB = raw_predsB.argmax(2)
    