In [4]:
import io
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics

In [5]:
def load_vectors(fname):
    # taken from: https://fasttext.cc/docs/en/english-vectors.html
    fin = io.open(
        fname,
        'r', encoding='utf-8', newline='\n', errors='ignore'
    )
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [6]:
df = pd.read_csv("../input/imdb_folds.csv")

In [7]:
df

Unnamed: 0,review,sentiment,kfold
0,This movie makes you think. It shows how a wom...,1,0
1,This is one of the movies that get better ever...,1,0
2,Bonny Hunt scores a coup with her directorial ...,1,0
3,"Along with the ""Maratonci trce pocasni krug"" f...",1,0
4,"""That 'Malcom' show on FOX is really making a ...",0,0
...,...,...,...
49995,The Human Tornado is a campy 70's Blaxploitati...,1,4
49996,"First of all , you should watch this only if y...",1,4
49997,This can be one of the most enjoyable movies e...,0,4
49998,Why do the powers that be continue to cast Jen...,0,4


In [11]:
df['review'].iloc[7]

"Wow! I loved this movie and LOVE Judy Marte!! This girl isn't just an awesome pretty face, she's funny and really really talented!! She made me laugh many times just by being very naturally rough with Victor who was desperately hitting on her! We'll be seeing her a lot in the next coming years... and probably also from director Peter Sollett and co-star Victor Rasuk!<br /><br />Raising Victor Vargas is one of the best film I saw in a long time! Very refreshing! It's true, nice, funny, well filmed, it got it all : good story, good actors, good film direction!<br /><br />If you like simple, slow paced, real life, urban movies, like maybe Jersey Girl from Kevin Smith, you'll love Victor Vargas! It's better!"

In [13]:
fold = 0
# fetch training dataframe
train_df = df[df.kfold != fold].reset_index(drop=True)

# fetch validation dataframe
valid_df = df[df.kfold == fold].reset_index(drop=True)

In [15]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [17]:
tokenizer.fit_on_texts(df.review.values.tolist())

In [20]:
df.review.values.tolist()[7]

"Wow! I loved this movie and LOVE Judy Marte!! This girl isn't just an awesome pretty face, she's funny and really really talented!! She made me laugh many times just by being very naturally rough with Victor who was desperately hitting on her! We'll be seeing her a lot in the next coming years... and probably also from director Peter Sollett and co-star Victor Rasuk!<br /><br />Raising Victor Vargas is one of the best film I saw in a long time! Very refreshing! It's true, nice, funny, well filmed, it got it all : good story, good actors, good film direction!<br /><br />If you like simple, slow paced, real life, urban movies, like maybe Jersey Girl from Kevin Smith, you'll love Victor Vargas! It's better!"

In [22]:
# convert training data to sequences
# for example : "bad movie" gets converted to
# [24, 27] where 24 is the index for bad and 27 is the
# index for movie
xtrain = tokenizer.texts_to_sequences(train_df.review.values)

# similarly convert validation data to sequences
xtest = tokenizer.texts_to_sequences(valid_df.review.values)

In [24]:
type(xtrain)

list

In [25]:
xtrain[7]

[10,
 436,
 11,
 17,
 13,
 14,
 72,
 42,
 395,
 3041,
 14,
 231,
 329,
 723,
 4888,
 38,
 21,
 73,
 23,
 81,
 606,
 85,
 1874,
 2,
 24737,
 9,
 6,
 15,
 378,
 21,
 5,
 26,
 1578,
 5,
 138,
 3,
 34989,
 648,
 1560,
 1992,
 7462,
 1,
 16993,
 4,
 260,
 5578,
 5022,
 6410,
 10,
 894,
 36,
 2271,
 117,
 378,
 1575,
 189,
 1,
 203,
 5,
 2058,
 3599,
 153,
 604,
 2,
 654,
 1893,
 18746,
 23,
 3,
 1171,
 6136,
 26281,
 8,
 1967,
 70,
 281,
 29,
 910,
 1,
 965,
 181,
 13,
 456,
 122,
 3965,
 15,
 378,
 8,
 946,
 366,
 248,
 1023,
 2,
 378,
 8,
 2625,
 66,
 65,
 2000,
 2764,
 34,
 33,
 413,
 2238,
 2000,
 573,
 193,
 5,
 1047,
 65,
 6257,
 7306,
 6,
 6129,
 166,
 140,
 11,
 926,
 8,
 260,
 55,
 2,
 73,
 76,
 3,
 49,
 163,
 30,
 85,
 646,
 73,
 25,
 29,
 588,
 30,
 27,
 55,
 38,
 156,
 141,
 5,
 1,
 1772,
 4,
 3041,
 73,
 23,
 90,
 5,
 582,
 48,
 9,
 200,
 26,
 21751,
 42,
 378,
 12,
 94,
 93,
 20185,
 15,
 3,
 4763,
 465,
 1,
 4133,
 284,
 6,
 12,
 1,
 345,
 23,
 74,
 261,
 138,
 12335,
 535,
 

In [27]:
xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=128)

# zero pad the validation sequences
xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=128)

In [68]:
len(xtrain[7])

128

In [69]:
xtrain[7]

array([   95,   904,     5,  1389,    16,     1,  5028,  1870,   103,
         121,    93,   132,    66,  3629,  1016,    42,   378,     2,
       22124,    60,    70,   109,  4754,   408,    20,     1,   665,
           8,  2045,     6, 22441,    18,  4930,     7,     7,   234,
           1,    17,     6,  1061,  7306,    36,     1,    86,   565,
           9,   306,     3,   120, 61265,    82,    15,     1, 13780,
          18,     1,   102,    25,     3,   310,    95,     4,  3431,
         529,     5,    22,     2,    22,   127,    53,   386,     1,
         223,   435,     2,    57,  1759,     1,  7227, 12794,     2,
         197,    93,  9283,     1,   442,  1051,     6,     1,  5124,
       31430,     4,     1, 61266,    60,  2208,  5124,     2, 10915,
           2,  1279, 45543,     7,     7,  1332,    12,    11,    17,
          13,   748,   300,     1,   823,  1560,     4,     1,  7306,
       22294,  1027,   411,    68,     3,   558,     4,  4417,    15,
          29,   571]

In [28]:
class IMDBDataset:
    def __init__(self, reviews, targets):
        """
        :param reviews: this is a numpy array
        :param targets: a vector, numpy array
        """
        self.reviews = reviews
        self.target = targets

    def __len__(self):
        # returns length of the dataset
        return len(self.reviews)

    def __getitem__(self, item):
        # for any given item, which is an int,
        # return review and targets as torch tensor
        # item is the index of the item in concern
        review = self.reviews[item, :]
        target = self.target[item]

        return{"review": torch.tensor(review, dtype=torch.long),
               "target": torch.tensor(target, dtype=torch.float)}


In [29]:
train_dataset = IMDBDataset(reviews=xtrain,targets=train_df.sentiment.values)

In [32]:
train_dataset.__len__()

40000

In [36]:
len(train_dataset.reviews[7])

128

In [37]:
train_dataset.reviews[7]

array([   95,   904,     5,  1389,    16,     1,  5028,  1870,   103,
         121,    93,   132,    66,  3629,  1016,    42,   378,     2,
       22124,    60,    70,   109,  4754,   408,    20,     1,   665,
           8,  2045,     6, 22441,    18,  4930,     7,     7,   234,
           1,    17,     6,  1061,  7306,    36,     1,    86,   565,
           9,   306,     3,   120, 61265,    82,    15,     1, 13780,
          18,     1,   102,    25,     3,   310,    95,     4,  3431,
         529,     5,    22,     2,    22,   127,    53,   386,     1,
         223,   435,     2,    57,  1759,     1,  7227, 12794,     2,
         197,    93,  9283,     1,   442,  1051,     6,     1,  5124,
       31430,     4,     1, 61266,    60,  2208,  5124,     2, 10915,
           2,  1279, 45543,     7,     7,  1332,    12,    11,    17,
          13,   748,   300,     1,   823,  1560,     4,     1,  7306,
       22294,  1027,   411,    68,     3,   558,     4,  4417,    15,
          29,   571]

In [38]:
# create torch dataloader for training
# torch dataloader loads the data using dataset
# class in batches specified by batch size
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=16, num_workers=2
)

In [40]:
# initialize dataset class for validation
valid_dataset = IMDBDataset(reviews=xtest,
                                    targets=valid_df.sentiment.values)

# create torch dataloader for validation
valid_data_loader = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=8,
                                                num_workers=1)

In [41]:
embedding_dict = load_vectors("../input/crawl-300d-2M.vec")

In [42]:
type(embedding_dict)

dict

In [44]:
len(embedding_dict['happy'])

300

In [45]:
def create_embedding_matrix(word_index, embedding_dict):
    """
    This function creates the embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :return: a numpy array with embedding vectors for all known words
    """
    # initialize matrix with zeros
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    # loop over all the words
    for word, i in word_index.items():
        # if word is found in pre-trained embeddings,
        # update the matrix. if the word is not found,
        # the vector is zeros!
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
    # return embedding matrix
    return embedding_matrix

In [47]:
type(tokenizer.word_index)

dict

In [48]:
len(tokenizer.word_index)

124252

In [51]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [52]:
tokenizer.word_index['happy']

674

In [54]:
for word, i in tokenizer.word_index.items():
    print(word,i)

the 1
and 2
a 3
of 4
to 5
is 6
br 7
in 8
it 9
i 10
this 11
that 12
was 13
as 14
for 15
with 16
movie 17
but 18
film 19
on 20
not 21
you 22
are 23
his 24
have 25
be 26
one 27
he 28
all 29
at 30
by 31
an 32
they 33
so 34
who 35
from 36
like 37
or 38
just 39
her 40
out 41
about 42
if 43
it's 44
has 45
there 46
some 47
what 48
good 49
when 50
more 51
very 52
up 53
no 54
time 55
my 56
even 57
would 58
she 59
which 60
only 61
really 62
see 63
story 64
their 65
had 66
can 67
me 68
well 69
were 70
than 71
much 72
we 73
bad 74
been 75
get 76
do 77
great 78
other 79
will 80
also 81
into 82
people 83
because 84
how 85
first 86
him 87
most 88
don't 89
made 90
then 91
its 92
them 93
make 94
way 95
too 96
movies 97
could 98
any 99
after 100
think 101
characters 102
watch 103
films 104
two 105
many 106
seen 107
character 108
being 109
never 110
plot 111
love 112
acting 113
life 114
did 115
best 116
where 117
know 118
show 119
little 120
over 121
off 122
ever 123
does 124
your 125
better 126
end 127
m

unwieldy 35377
equalled 35378
beeb 35379
who' 35380
bolshevik 35381
pontificate 35382
consulting 35383
sadomasochism 35384
chavez's 35385
udder 35386
ruckus 35387
mimouni 35388
balling 35389
'special' 35390
ag 35391
lashley 35392
uruguay 35393
excalibur 35394
kunis 35395
midge 35396
kneeling 35397
counterbalance 35398
carnby 35399
mitra 35400
scrupulously 35401
churlish 35402
ney 35403
cuba's 35404
'deliverance' 35405
xi 35406
towels 35407
hergé 35408
bogard 35409
embezzled 35410
madhavan 35411
cologne 35412
codger 35413
shatters 35414
generation' 35415
dolemite's 35416
merges 35417
legolas 35418
bambino 35419
3rds 35420
brocéliande 35421
unappetizing 35422
rivalled 35423
salman's 35424
inflicts 35425
conquerer 35426
accordion 35427
faith's 35428
appreciable 35429
persisted 35430
macek 35431
megazone 35432
romance' 35433
fruitcake 35434
nationalistic 35435
fusing 35436
preliminary 35437
favoring 35438
barbers 35439
brockwell 35440
synchronize 35441
minnelli's 35442
ariana 35443
animani

superflous 68578
raiting 68579
brunson 68580
legislative 68581
dinsdale 68582
indochina 68583
lizard's 68584
priming 68585
fdr's 68586
imbibed 68587
videoasia 68588
maschera 68589
virgine 68590
roddam 68591
segement 68592
madhavi's 68593
hatin' 68594
piano's 68595
unrestricted 68596
surviver 68597
overmuch 68598
'shore 68599
trailed 68600
stoppingly 68601
purcell's 68602
pettyfer 68603
'prestige' 68604
maudy 68605
'crimes 68606
vena 68607
nowheresville 68608
idling 68609
lampoonery 68610
'impossible' 68611
north' 68612
parties' 68613
internet's 68614
variety's 68615
doane 68616
redressed 68617
nicolae 68618
thins 68619
spritely 68620
greenest 68621
veli 68622
vandervoort 68623
thomason 68624
'distant 68625
fez' 68626
malina 68627
franpyscho 68628
lost' 68629
wips 68630
thewlis' 68631
maude's 68632
mfers 68633
subdues 68634
watchword 68635
diorama 68636
suggestiveness 68637
ferdie's 68638
intersperses 68639
carli 68640
thorne's 68641
192 68642
bernson 68643
bellum 68644
'slice 68645
gla

nepolean 104316
lipton's 104317
frakking 104318
whateverness 104319
h2g2 104320
necheyev 104321
39th 104322
seemless 104323
nighclub 104324
poupees 104325
expetations 104326
gaillardia's 104327
humilitated 104328
interment 104329
kamikazes 104330
laudably 104331
bloss' 104332
casares' 104333
synapse's 104334
spirit 104335
underclothing 104336
gaetani 104337
grouchowho 104338
mendl 104339
flanders's 104340
'vtm' 104341
programmation 104342
renaldo's 104343
frisch 104344
the 104345
jerry 104346
    film 104347
blindspot's 104348
 journey 104349
 as 104350
  with 104351
 a 104352
 astounding 104353
swng 104354
tsubaki 104355
yojiro 104356
takita 104357
heyijustleftmycoatbehind 104358
balder 104359
cherub's 104360
discotheques 104361
knitwear 104362
'danger' 104363
negotiable 104364
abhorred 104365
bellarmine 104366
scourged 104367
careerism 104368
furbellowed 104369
phosphates 104370
inversely 104371
sorrentino's 104372
drvn 104373
professorly 104374
depricating 104375
totentanz 10437

In [55]:
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict)


In [56]:
embedding_matrix.shape

(124253, 300)

In [57]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [59]:
# lstm.py
import torch
import torch.nn as nn


class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        """
        :param embedding_matrix: numpy array with vectors for all words
        """
        super(LSTM, self).__init__()

        # number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]

        # dimension of embedding is num of columns in the matrix
        embed_dim = embedding_matrix.shape[1]

        # we define an input embedding layer
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=embed_dim)

        # embedding matrix is used as weights of
        # the embedding layer
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))

        # we dont want to train the pretrained embeddings
        self.embedding.weight.requires_grad = False

        # a simple bidirectional LSTM with hidden size of 128
        self.lstm = nn.LSTM(
            embed_dim,
            128,
            bidirectional=True,
            batch_first=True,)

        # output layer which is a linear layer
        # we have only one output
        # input (512) = 128 + 128 for mean and same for max pooling
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        # pass data through embedding layer
        # the input is just the tokens
        x = self.embedding(x)

        # move embedding output to lstm
        x, _ = self.lstm(x)

        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        # concatenate mean and max pooling
        # this is why size is 512
        # 128 for each direction = 256
        # avg_pool = 256 and max_pool = 256
        out = torch.cat((avg_pool, max_pool), 1)

        # pass through the output layer and return the output
        out = self.out(out)
        # return linear output
        return out

In [60]:
type(embedding_matrix)

numpy.ndarray

In [61]:
embedding_matrix

array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.0231,  0.017 ,  0.0157, ...,  0.0744, -0.1118,  0.0963],
       [-0.1081,  0.0191,  0.0354, ...,  0.1104,  0.0475, -0.0599],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [-0.24  ,  0.2371, -0.0136, ..., -0.1706, -0.1648,  0.1903],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [63]:
model = LSTM(embedding_matrix)
model.to(device)

# initialize Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("Training Model")
# set the best accuracy to zero
best_accuracy = 0
# set early stopping counter to zero
early_stopping_counter = 0

Training Model


In [64]:
# engine.py
import torch
import torch.nn as nn


def train(data_loader, model, optimizer, device):
    """
    This is the main training function that trains model for one epoch
    :param data_loader: this is the torch dataloader
    :param model: model (lstm model)
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param device: this can be "cuda" or "cpu"
    """

    # set model to training mode
    model.train()

    # go through batches of data in data loader
    for data in data_loader:
        # fetch review and target from the dict
        reviews = data["review"]
        targets = data["target"]

        # move the data to device that we want to use
        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        # clear the gradients
        optimizer.zero_grad()

        # make predictions from the model
        predictions = model(reviews)

        # calculate the loss
        loss = nn.BCEWithLogitsLoss()(predictions,
                                      targets.view(-1, 1))

        # compute gradient of loss w.r.t.
        # all parameters of the model that are trainable
        loss.backward()

        # single optimization step
        optimizer.step()


def evaluate(data_loader, model, device):
    # initialize empty lists to store predictions and targets
    final_predictions = []
    final_targets = []

    # put the model in eval mode
    model.eval()

    # disable gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"]
            targets = data["target"]
            reviews = reviews.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            # make predictions
            predictions = model(reviews)

            # move predictions and targets to list
            # we need to move predictions and targets to cpu too
            predictions = predictions.cpu().numpy().tolist()
            targets = data["target"].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    # return final predictions and targets
    return final_predictions, final_targets


In [66]:
for epoch in range(1):
    # train one epoch
    train(train_data_loader, model, optimizer, device)
    # validate
    outputs, targets = evaluate(
        valid_data_loader, model, device
    )
    # use threshold of 0.5
    # please note we are using linear layer and no sigmoid
    # you should do this 0.5 threshold after sigmoid
    outputs = np.array(outputs) >= 0.5

    # calculate accuracy
    accuracy = metrics.accuracy_score(targets, outputs)
    print(f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}")

    # simple early stopping
    if accuracy > best_accuracy:
        best_accuracy = accuracy
    else:
        early_stopping_counter += 1
    if early_stopping_counter > 2:
        break


python(17989) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(17990) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19262) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


FOLD:0, Epoch: 0, Accuracy Score = 0.8978


In [71]:
def load_embeddings(word_index, embedding_file, vector_length=300): 
    """
    A general function to create embedding matrix :param word_index: word:index dictionary :param embedding_file: path to embeddings file :param vector_length: length of vector
    """
    max_features = len(word_index) + 1 
    words_to_find = list(word_index.keys()) 
    more_words_to_find = []
    for wtf in words_to_find: 
        more_words_to_find.append(wtf) 
        more_words_to_find.append(str(wtf).capitalize())
    more_words_to_find = set(more_words_to_find) 
    
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')
    
    embeddings_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_file) if o.split(" ")[0]
    in more_words_to_find
    and len(o) > 100
    )
    embedding_matrix = np.zeros((max_features, vector_length))
    for word, i in word_index.items():
        if i >= max_features: 
            continue
        embedding_vector = embeddings_index.get(word) 
        if embedding_vector is None:
            embedding_vector = embeddings_index.get( 
                str(word).capitalize()
        )
        if embedding_vector is None:
            embedding_vector = embeddings_index.get( str(word).upper())
        if (embedding_vector is not None and len(embedding_vector) == vector_length):
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [77]:
dfx = pd.read_csv( "../input/IMDB_Dataset.csv").fillna("none")[:100]

In [78]:
dfx

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
95,Daniel Day-Lewis is the most versatile actor a...,positive
96,My guess would be this was originally going to...,negative
97,"Well, I like to watch bad horror B-Movies, cau...",negative
98,"This IS the worst movie I have ever seen, as w...",negative


In [79]:
import torch
import torch.nn.functional as F

# 假设我们有两个句子的嵌入表示
sentence_1 = torch.tensor([[0.1]*128]*4 + [[0]*128]*4) # 长度为4，后面填充4
sentence_2 = torch.tensor([[0.2]*128]*7 + [[0]*128]*1) # 长度为7，后面填充1

# 将两个句子放入一个批次
batch = torch.stack([sentence_1, sentence_2])

# 创建对应的掩码
mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], 
                     [1, 1, 1, 1, 1, 1, 1, 0]])

# 在计算注意力时应用掩码
attention_scores = torch.matmul(batch, batch.transpose(1, 2)) # 自注意力计算
attention_scores = attention_scores.masked_fill(mask.unsqueeze(1) == 0, float('-inf')) # 应用掩码

# 使用 softmax 得到注意力权重
attention_weights = F.softmax(attention_scores, dim=-1)
print(attention_weights)


tensor([[[0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1429, 0.142