In [None]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
!mkdir -p drive
!google-drive-ocamlfuse drive

In [2]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import Counter
import numpy as np
import random
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader

In [3]:
import random

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
train_path = "data/senti.train.tsv"
dev_path = "data/senti.dev.tsv"
test_path = "data/senti.test.tsv"

In [6]:
from nltk.stem import WordNetLemmatizer
#from nltk.corpus import wordnet
def getCleanData(x):

    # Reduce each word into common base
    lem = WordNetLemmatizer()
    x = [lem.lemmatize(word) for word in x] 
    x = [lem.lemmatize(word,'v') for word in x]
    x = [lem.lemmatize(word,'r') for word in x]
    return x

In [7]:
def read_corpus(path):
    with open(path, 'r', encoding='utf-8') as f:
        sents = []
        labels = []
        for line in f.readlines():
            sent = line.split('\t')[0].lower()
            sent = getCleanData(sent.split(' '))
            label = line.split('\t')[1].strip('\n')
            sents.append(sent)
            labels.append(label)
    return sents, labels

def build_vocab(sents):
    dic = {}
    word_counter = Counter()
    dic['PAD'] = 0
    dic['UNK'] = 1
    for sent in sents:
        for word in sent:
            word_counter[word] += 1
    itos = [w for w, c in word_counter.items()]
    for w in itos:
        dic[w] = len(dic)
    return dic

def vectorize(sents):
    vecs = [[wtoi.get(word, wtoi.get("UNK")) for word in sent] for sent in sents]
    return vecs

In [8]:
train_data,train_label = read_corpus(train_path)
dev_data,dev_label = read_corpus(dev_path)
test_data,test_label = read_corpus(test_path)

In [9]:
wtoi= build_vocab(train_data)
itow = dict((v,k) for k, v in wtoi.items())

In [10]:
train_set = vectorize(train_data)
dev_set = vectorize(dev_data)
test_set = vectorize(test_data)

In [11]:
print('train set size: {} \ndev set size: {} \ntest set size: {}'.format(len(train_data),len(dev_data),len(test_data)))
print('vocab size: ', len(wtoi))

train set size: 67349 
dev set size: 872 
test set size: 1821
vocab size:  12179


In [12]:
class LoadData(torch.utils.data.Dataset):
    def __init__(self, data, labels):
       # super(LoadData.self).__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        X = self.data[idx]
        Y = int(self.labels[idx])
        return X, Y

In [13]:
def collate_fn(batch):
    
    batch = list(zip(*batch))
    
   # lengths = torch.LongTensor([len(t) for t in batch[0]]).to(device)
    inputs = [torch.LongTensor(t).to(device) for t in batch[0]]
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True) 
    labels = torch.LongTensor(batch[1]).to(device)
    mask = (inputs != 0).to(device)
    
    return inputs, labels, mask

In [14]:
trains = LoadData(train_set, train_label)
devs = LoadData(dev_set, dev_label)
tests = LoadData(test_set, test_label)

In [15]:
trains[9]

([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 64], 1)

In [18]:
train_loader = torch.utils.data.DataLoader(
                    dataset=trains,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=collate_fn)
dev_loader = torch.utils.data.DataLoader(
                    dataset=devs,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(
                    dataset=tests,
                    batch_size=batch_size,
                    shuffle=False,
                    collate_fn=collate_fn)

In [19]:
next(iter(train_loader))

(tensor([[ 449,  244,   75,  ...,    0,    0,    0],
         [  55,  181, 1488,  ...,    0,    0,    0],
         [ 938,    0,    0,  ...,    0,    0,    0],
         ...,
         [  55, 6189,   20,  ...,    0,    0,    0],
         [ 919, 1554,   12,  ...,    0,    0,    0],
         [2812,   75,   55,  ...,  119,  206,   69]], device='cuda:0'),
 tensor([0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
         1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
         1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1], device='cuda:0'),
 tensor([[ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True, False, False,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ...,  True,  True,  True]], device='cuda:0'))

In [20]:
x,y,mask = next(iter(train_loader))
print('input shape: {},\nlabel shpae{},\nmask shape:{}'.format(x.shape, y.shape, mask.shape))

input shape: torch.Size([64, 31]),
label shpaetorch.Size([64]),
mask shape:torch.Size([64, 31])


In [17]:
vocab_size = len(wtoi)
batch_size = 64
emb_size = 200
pad_idx = wtoi['PAD']
output_size = 1
print(vocab_size,pad_idx)

12179 0


### Word Averaging Model

In [25]:
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, emb_size, output_size, pad_idx, dropout=0.5):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
        self.linear = nn.Linear(emb_size, output_size)
        self.dropout = nn.Dropout(dropout)
        
        self.embed.weight.data.uniform_(-0.1, 0.1) 
        self.linear.weight.data.uniform_(-0.1, 0.1) 
    
    def forward(self, inputs, mask): #(bsz*seq_len)
        # (batch_size, seq_len, emb)
        x_emb = self.dropout(self.embed(inputs))
        mask = mask.float().unsqueeze(-1) # (batch_size, seq_len, 1), 1 represents word, 0 represents padding
        embedded = x_emb * mask # (batch_size, seq_len, embed_size)
       
        # 求平均
        sent_emb = embedded.sum(1) / (mask.sum(1) + 1e-9) 
        # dropout
        return self.linear(self.dropout(sent_emb)).squeeze(-1)

In [26]:
model = WordAVGModel(vocab_size, emb_size, output_size, pad_idx,dropout=0.5)
model=model.to(device)

In [27]:
output = model(x,mask)
print(output, output.shape)

tensor([-0.0286, -0.0434, -0.0678, -0.1135,  0.0018, -0.0761, -0.0151, -0.0532,
         0.0480, -0.0612,  0.0101, -0.0332, -0.0384, -0.0170, -0.1445,  0.0229,
        -0.0710, -0.1052, -0.0340, -0.0372,  0.0260, -0.0494, -0.0689,  0.0790,
        -0.0305, -0.0862,  0.0145, -0.0765,  0.0054, -0.0302, -0.0409,  0.0166,
        -0.0386, -0.0019, -0.0179, -0.0327,  0.0323, -0.0241, -0.0460,  0.0379,
        -0.0461, -0.0482, -0.0360, -0.0079, -0.0216, -0.0319, -0.0174, -0.0408,
        -0.0032, -0.0864, -0.0164,  0.0213, -0.0219, -0.0177,  0.1211, -0.0192,
        -0.2161, -0.0124, -0.0280, -0.0069, -0.0269, -0.0612, -0.0351,  0.0038],
       device='cuda:0', grad_fn=<SqueezeBackward1>) torch.Size([64])


In [28]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
count_parameters(model)

2436001

In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [26]:
def binary_acc(y_cap, y):
    preds = torch.round(torch.sigmoid(y_cap))
    correct = (preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [31]:
def train(model, data, optimizer, criterion):
    epoch_loss, epoch_acc = 0., 0.
    model.train()

    for _, (inputs, labels, mask) in enumerate(data):
        outputs = model(inputs, mask)  # (batch_size)
        loss = criterion(outputs, labels.float()) 
        acc = binary_acc(outputs, labels)
        
        # sgd
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
#         print("batch loss: {}".format(loss.item()))
        
        #epoch_loss += loss.item() * len(labels)
        #epoch_acc += acc.item() * len(labels)
        #total_len += len(labels)
        epoch_loss += loss.item() 
        epoch_acc += acc.item() 
        epoch_len = len(data)
        
    return epoch_loss / epoch_len, epoch_acc / epoch_len

In [32]:
def evaluate(model, data, criterion):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for  _, (inputs, labels, mask) in enumerate(data):
        outputs = model(inputs, mask) 
    
        with torch.no_grad():
            preds = model(inputs, mask)
        loss = criterion(outputs, labels.float()) 
        acc = binary_acc(outputs, labels)
        
       # epoch_loss += loss.item() * len(labels)
        #epoch_acc += acc.item() * len(labels)
        #total_len += len(labels)
        
        epoch_loss += loss.item() 
        epoch_acc += acc.item() 
        epoch_len = len(data)
        
    model.train()
   
    return epoch_loss / epoch_len, epoch_acc / epoch_len

In [35]:
N_EPOCHS = 5
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dev_loader,criterion)
    
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "wordavg-model.pth")
        
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", valid_loss, "Valid Acc", valid_acc)

Epoch 0 Train Loss 0.28133587612781996 Train Acc 0.9006558641975309
Epoch 0 Valid Loss 0.42027771898678373 Valid Acc 0.8125
Epoch 1 Train Loss 0.2570802859487923 Train Acc 0.9107305374693441
Epoch 1 Valid Loss 0.41676579628671917 Valid Acc 0.8183035722800663
Epoch 2 Train Loss 0.24010873313529765 Train Acc 0.9179717530438929
Epoch 2 Valid Loss 0.4218461215496063 Valid Acc 0.8156249991485051
Epoch 3 Train Loss 0.2271915827112773 Train Acc 0.9215330066051465
Epoch 3 Valid Loss 0.42400288794721874 Valid Acc 0.8037946437086377
Epoch 4 Train Loss 0.21749075836831913 Train Acc 0.9250349059403792
Epoch 4 Valid Loss 0.43271871762616293 Valid Acc 0.8075892882687705


### Test

In [36]:
model.load_state_dict(torch.load('wordavg-model.pth'))
test_loss, test_acc = evaluate(model,test_loader, criterion)
print(test_loss, test_acc)

0.41115034756989316 0.8122584717027073


In [37]:
def print_mistake(model, data):
    model.eval()
    total_len = 0.
    for  _, (inputs, labels, mask) in enumerate(data):
        outputs = model(inputs, mask) 
    
        with torch.no_grad():
            preds = model(inputs, mask)
            preds = torch.round(torch.sigmoid(preds))
        
    wrong = (preds != labels)
    mistakes = inputs[wrong]
    correct = labels[wrong]
    for err, l in zip(mistakes,correct):
        sent = [itow[w.item()] for w in err]
        print(sent, l.item())

In [38]:
print_mistake(model, test_loader)

['i', 'saw', 'knockaround', 'guy', 'yesterday', ',', 'and', 'already', 'the', 'detail', 'have', 'fade', 'like', 'photograph', 'from', 'the', 'UNK', 'war', '...', 'it', "'s", 'so', 'unmemorable', 'that', 'it', 'turn', 'my', 'UNK', 'note', 'to', 'UNK', 'UNK', '.', 'PAD', 'PAD', 'PAD'] 0
['windtalker', 'blow', 'this', 'way', 'and', 'that', ',', 'but', 'there', "'s", 'no', 'mistake', 'the', 'filmmaker', 'in', 'the', 'tall', 'UNK', ',', 'true', 'to', 'himself', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'] 1
['the', 'UNK', 'bomb', 'of', 'reggio', "'s", 'image', 'and', 'glass', "'", 'evocative', 'music', '...', 'ultimately', 'leaf', 'viewer', 'with', 'the', 'task', 'of', 'divine', 'mean', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'] 0
['despite', 'a', 'powerful', 'portrayal', 'by', 'binoche', ',', 'it', "'s", 'a', 'period', 'romance', 'that', 'suffer', 'from', 'an', 'overly', 'deliberate', 'p

### Calculate word L2 Norm

In [39]:
model.eval()
embed_l2_norm = model.embed.weight.norm(p=2,dim=1)
word_embed_list = embed_l2_norm.data.cpu().tolist()
w_l= list(itow.values())
word_norm = zip(word_embed_list, w_l)
word_norm = sorted(word_norm)
print(word_norm[:15], '\n',word_norm[-15:])

[(0.7797574400901794, 'UNK'), (0.784145712852478, 'a'), (0.8030150532722473, 'all-french'), (0.8075856566429138, 'be'), (0.820330798625946, 'bailly'), (0.8212891817092896, 'the'), (0.8221193552017212, 'pandemonium'), (0.828968346118927, 'PAD'), (0.8394304513931274, 'schweig'), (0.8394737243652344, 'two-bit'), (0.8411467671394348, 'aan'), (0.8418119549751282, 'riviera'), (0.8430336117744446, 'fangoria'), (0.8450604677200317, 'k'), (0.8458319306373596, 'kidlets')] 
 [(3.9190773963928223, 'stupid'), (3.9440712928771973, 'poorly'), (3.991687297821045, 'refresh'), (4.050594329833984, 'failure'), (4.123385906219482, 'terrific'), (4.1326727867126465, 'hilarious'), (4.219963073730469, 'mess'), (4.239630699157715, 'waste'), (4.310248851776123, 'devoid'), (4.385274410247803, 'powerful'), (4.429516315460205, 'embrace'), (4.531133651733398, 'remarkable'), (4.71098518371582, 'flat'), (5.028470993041992, 'lack'), (5.328603744506836, 'worst')]


### Attention Weighted word averaging

In [22]:
class AttentionAVGModel(nn.Module):
    def __init__(self, vocab_size, emb_size, output_size, pad_idx, dropout=0.5):
        super(AttentionAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
        self.linear = nn.Linear(emb_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.u_param= nn.Parameter(torch.randn(emb_size))
        
        self.embed.weight.data.uniform_(-0.1, 0.1) 
        self.linear.weight.data.uniform_(-0.1, 0.1) 
    
    def forward(self, inputs, mask): #(bsz*seq_len)
        # (batch_size, seq_len, emb)
        embed = self.dropout(self.embed(inputs))
        mask = mask.to(float) 
        w_att, att_proj = self.attention(self.u_param, embed, mask)    
        out = self.linear(self.dropout(att_proj)).squeeze(-1) #(batch_size)
        
        return out, w_att#(batch_zize), (batch_size, seq_len)
    
    def attention(self, u_t, emb_t, mask=None):        
        u_t=u_t.unsqueeze(0).unsqueeze(0) #(1,1,emb)
        cos_sim = F.cosine_similarity(u_t, emb_t, dim=-1) #（batch_size, seq_len）
        if mask is not None:
            cos_sim.masked_fill_(mask == 0, -float('inf'))
        #print(cos_sim)
        alpha_t = F.softmax(cos_sim, dim=1) #(batch_size, seq_len）
        h_att = torch.bmm(
                   alpha_t.unsqueeze(1),               # (batch_size, 1, seq_len)
                   emb_t                                   #((batch_size, seq_len, emb))
                  ).squeeze(1)      #((batch_size, 1, emb)) ->(batch_size, emb))

        return alpha_t, h_att

In [23]:
model = AttentionAVGModel(vocab_size, emb_size, output_size, pad_idx,dropout=0.2)
model=model.to(device)

In [24]:
output, att_weight = model(x,mask)
print(output, output.shape)
print(att_weight, att_weight.shape)

tensor([-0.0259, -0.0274, -0.0400, -0.0674, -0.0277, -0.0259, -0.0117, -0.0420,
        -0.0205, -0.0734, -0.0941, -0.0294, -0.0164, -0.0233, -0.0210, -0.0046,
        -0.0448, -0.0006, -0.0240, -0.0576, -0.0348, -0.0453, -0.0803, -0.0322,
        -0.0879, -0.0279, -0.0059, -0.0396, -0.0401, -0.0352, -0.0221, -0.0449,
        -0.0387, -0.0424, -0.0183, -0.0491, -0.0104, -0.1005, -0.0262, -0.0264,
         0.0120, -0.0002, -0.0332, -0.0690, -0.0492, -0.0260, -0.0628, -0.0347,
         0.0068, -0.0479, -0.0155, -0.0347, -0.0414, -0.0331, -0.0576, -0.0164,
        -0.0754, -0.0382, -0.0134, -0.0032, -0.0244, -0.0582, -0.0235, -0.0511],
       device='cuda:0', grad_fn=<SqueezeBackward1>) torch.Size([64])
tensor([[0.0652, 0.0731, 0.0740,  ..., 0.0000, 0.0000, 0.0000],
        [0.0996, 0.0940, 0.0971,  ..., 0.0000, 0.0000, 0.0000],
        [0.3795, 0.3066, 0.3139,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0888, 0.0850, 0.0887,  ..., 0.0000, 0.0000, 0.0000],
        [0.1447, 0.14

In [27]:
def att_train(model, data, optimizer, criterion):
    epoch_loss, epoch_acc = 0., 0.
    model.train()

    for _, (inputs, labels, mask) in enumerate(data):
        outputs, _ = model(inputs, mask)  # (batch_size)
        loss = criterion(outputs, labels.float()) 
        acc = binary_acc(outputs, labels)
        
        # sgd
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
#         print("batch loss: {}".format(loss.item()))
        
        #epoch_loss += loss.item() * len(labels)
        #epoch_acc += acc.item() * len(labels)
        #total_len += len(labels)
        epoch_loss += loss.item() 
        epoch_acc += acc.item() 
        epoch_len = len(data)
        
    return epoch_loss / epoch_len, epoch_acc / epoch_len

In [28]:
def att_evaluate(model, data, criterion):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for  _, (inputs, labels, mask) in enumerate(data):
        outputs, _ = model(inputs, mask) 
    
        with torch.no_grad():
            preds = model(inputs, mask)
        loss = criterion(outputs, labels.float()) 
        acc = binary_acc(outputs, labels)
        
       # epoch_loss += loss.item() * len(labels)
        #epoch_acc += acc.item() * len(labels)
        #total_len += len(labels)
        
        epoch_loss += loss.item() 
        epoch_acc += acc.item() 
        epoch_len = len(data)
        
    model.train()
   
    return epoch_loss / epoch_len, epoch_acc / epoch_len

In [79]:
N_EPOCHS = 7
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = att_train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = att_evaluate(model, dev_loader, criterion)
    
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "atten_avg-model.pth")
        
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", valid_loss, "Valid Acc", valid_acc)

Epoch 0 Train Loss 0.4120664218721906 Train Acc 0.8174249875692674
Epoch 0 Valid Loss 0.4083181193896702 Valid Acc 0.8167410705770765
Epoch 1 Train Loss 0.23678610113724458 Train Acc 0.9122709209649538
Epoch 1 Valid Loss 0.4108219657625471 Valid Acc 0.8111607134342194
Epoch 2 Train Loss 0.19610952709758156 Train Acc 0.9272606894161627
Epoch 2 Valid Loss 0.42518852225371767 Valid Acc 0.8136160714285714
Epoch 3 Train Loss 0.1738076253660582 Train Acc 0.9347993827160493
Epoch 3 Valid Loss 0.4489919479404177 Valid Acc 0.8020089311259133
Epoch 4 Train Loss 0.16016280703098007 Train Acc 0.9391435467959129
Epoch 4 Valid Loss 0.47708586496966227 Valid Acc 0.7991071428571429
Epoch 5 Train Loss 0.1506484037358933 Train Acc 0.9426037568425634
Epoch 5 Valid Loss 0.510719011936869 Valid Acc 0.7973214302744184
Epoch 6 Train Loss 0.1441687389196437 Train Acc 0.9439844491927826
Epoch 6 Valid Loss 0.5477445828063148 Valid Acc 0.7886160739830562


### Test

In [29]:
model.load_state_dict(torch.load('atten_avg-model.pth'))
test_loss, test_acc = att_evaluate(model,test_loader, criterion)
print(test_loss, test_acc)

0.40322708569723986 0.8268058854958107


In [30]:
def att_mistake(model, data):
    model.eval()
    total_len = 0.
    for  _, (inputs, labels, mask) in enumerate(data):
        with torch.no_grad():
            preds,_ = model(inputs, mask)
            preds = torch.round(torch.sigmoid(preds))
        
    wrong = (preds != labels)
    mistakes = inputs[wrong]
    correct = labels[wrong]
    for err, l in zip(mistakes,correct):
        sent = [itow[w.item()] for w in err]
        print(sent, l.item())
att_mistake(model,test_loader)

['windtalker', 'blow', 'this', 'way', 'and', 'that', ',', 'but', 'there', "'s", 'no', 'mistake', 'the', 'filmmaker', 'in', 'the', 'tall', 'UNK', ',', 'true', 'to', 'himself', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'] 1
['the', 'UNK', 'bomb', 'of', 'reggio', "'s", 'image', 'and', 'glass', "'", 'evocative', 'music', '...', 'ultimately', 'leaf', 'viewer', 'with', 'the', 'task', 'of', 'divine', 'mean', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'] 0
['i', 'keep', 'think', 'over', 'and', 'over', 'again', ',', "'", 'i', 'should', 'be', 'enjoy', 'this', '.', "'", 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'] 0
['a', 'real', 'clunker', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 

### Calculate cosine similarity

In [31]:
u= model.u_param.unsqueeze(0) #(1,emb)
embeddings = model.embed.weight #(vocab,emb)
sim = F.cosine_similarity(u, embeddings, dim=-1) 

In [82]:
print(sim.shape)

torch.Size([12179])


In [32]:
w_l= list(itow.values())
sim_pair = zip(sim.data.cpu().tolist(), w_l)
sim_pair = sorted(sim_pair)
print(sim_pair[:15], '\n', sim_pair[-15:])

[(-0.9813509583473206, 'a'), (-0.9758360385894775, 'the'), (-0.9748589396476746, ','), (-0.9656421542167664, 'and'), (-0.9563846588134766, 'that'), (-0.9348066449165344, 'it'), (-0.9140702486038208, 'to'), (-0.9091906547546387, "'s"), (-0.9046059846878052, '.'), (-0.8854402899742126, 'be'), (-0.875963568687439, 'of'), (-0.8756086826324463, 'in'), (-0.8679625391960144, 'movie'), (-0.8383538722991943, 'an'), (-0.8167715668678284, 'very')] 
 [(0.5006635785102844, 'le'), (0.5077340006828308, 'insult'), (0.5079037547111511, 'nor'), (0.5084050297737122, 'too'), (0.5107676386833191, 'tiresome'), (0.5214846730232239, 'lack'), (0.5248687863349915, 'tv'), (0.526983916759491, 'fail'), (0.5281763672828674, 'miss'), (0.5303130149841309, 'worst'), (0.5467455387115479, 'waste'), (0.5655426979064941, 'never'), (0.5771090388298035, 'bad'), (0.6221871972084045, "n't"), (0.6330024600028992, 'not')]


### Calculate attention weight

In [46]:
word_id_list = []
att_weight_list = []
with torch.no_grad():
     for inputs, labels, mask in train_loader:
        _, att_weights = model(inputs, mask)
        mask = mask.view(-1)
        input_list = inputs.view(-1)[mask]  
        att_weights = att_weights.view(-1)[mask]

        word_id_list.extend(input_list.data.cpu().tolist())
        att_weight_list.extend(att_weights.data.cpu().tolist())

In [50]:
assert len(word_id_list)==len(att_weight_list)

In [54]:
from collections import defaultdict
word_list = [itow[idx] for idx in word_id_list]
w_dict=defaultdict(list)
for word, weight in zip(word_list, att_weight_list):
    w_dict[word].append(weight)
w_dict

defaultdict(list,
            {'sappy': [0.6419708728790283,
              0.15014474093914032,
              0.4101825952529907,
              0.07818847894668579,
              0.08650040626525879,
              0.42362022399902344,
              0.3004382252693176,
              0.07249385863542557,
              0.15854473412036896,
              0.20349685847759247,
              0.10001284629106522,
              0.21791766583919525,
              0.06984962522983551,
              0.13020280003547668,
              0.2521701455116272,
              0.23361188173294067,
              0.44608402252197266,
              0.36805564165115356,
              0.10769175738096237,
              0.3167005479335785,
              0.401062548160553,
              0.14259429275989532,
              0.22789742052555084,
              0.11705324798822403,
              0.10280666500329971,
              0.07151404768228531,
              0.1704888492822647,
              0.38931116461753845,
 

In [62]:
import numpy as np
att_result = [(k, np.mean(wl), np.std(wl)) for k,wl in w_dict.items() if len(wl)>100]
att_result

[('dialogue', 0.12956856617238371, 0.10319338236123456),
 ('ca', 0.10676763127176787, 0.057200779199006214),
 ("n't", 0.1879284395987417, 0.1074869909554035),
 ('go', 0.09948800563398334, 0.07476684619407764),
 ('wrong', 0.26453911375540956, 0.17214568355126109),
 ('.', 0.051007037039778276, 0.04055953961262207),
 ('quite', 0.09962874552859782, 0.08823569945610198),
 ('impressive', 0.23888371056318283, 0.1871770270021039),
 ('with', 0.06058533360280717, 0.04854924168368229),
 ('his', 0.06640444236697368, 0.05764658731492266),
 ('and', 0.058662071178319404, 0.05175137895516084),
 ('pace', 0.12077912366155974, 0.09250204329242832),
 ('yet', 0.09227153274816335, 0.07679944525170444),
 ('completely', 0.15240174329845735, 0.12895366630383417),
 ('familiar', 0.09724785153896122, 0.0914253028751145),
 ('a', 0.054882132719242034, 0.04654573573323044),
 ('plot', 0.14413372339539185, 0.11512469271406095),
 ('sentimental', 0.15301215310818425, 0.15204000783941615),
 ('mess', 0.31523782664993005, 

In [65]:
sorted(att_result, key=lambda x: x[-1], reverse=True)[:30]

[('reward', 0.26386251680065537, 0.2428034969773464),
 ('refresh', 0.2732424546033144, 0.24234247734322278),
 ('disappoint', 0.3115876207636161, 0.23306639149113212),
 ('stupid', 0.3162631086298149, 0.231490880313401),
 ('depress', 0.325026648119092, 0.22633846598970186),
 ('tedious', 0.27617175288143614, 0.2242635840652626),
 ('awful', 0.29876659300951464, 0.22240156030581476),
 ('bland', 0.2604844791578575, 0.21800870605957287),
 ('excite', 0.2147583836597986, 0.2174006551074177),
 ('bore', 0.27282887119114596, 0.21626139065549027),
 ('terrific', 0.2763214860206995, 0.21253311495977203),
 ('flat', 0.26143402515633685, 0.2122346424620028),
 ('waste', 0.27724233019049815, 0.21221585787193675),
 ('mess', 0.31523782664993005, 0.21127170144188584),
 ('succeed', 0.18065797963544078, 0.21015328967278946),
 ('unfunny', 0.2917197069286236, 0.20978184264545935),
 ('masterpiece', 0.24519885360326582, 0.208299145876918),
 ('watchable', 0.24391756037084183, 0.20824170029472172),
 ('worse', 0.2320