In [1]:
import numpy as np

import torch
import torch.nn as tnn
import torch.optim as topti

from torchtext import data
from torchtext.vocab import GloVe

# 检视数据集

In [2]:
from imdb_dataloader import IMDB

# Field definitions. With include_lengths=True each batch's `.text` is a
# (token_ids, lengths) pair; batch_first=True puts the batch dimension first.
textField = data.Field(
    lower=True,
    include_lengths=True,
    batch_first=True,
)
# Labels are single (non-sequential) values, one per example.
labelField = data.Field(sequential=False)

# Load the train / dev splits of the IMDB data through the project loader.
train, dev = IMDB.splits(
    textField,
    labelField,
    train="train",
    validation="dev",
)

# Build both vocabularies over train + dev; the text vocabulary also gets
# 50-dimensional GloVe "6B" vectors attached.
glove_vectors = GloVe(name="6B", dim=50)
textField.build_vocab(train, dev, vectors=glove_vectors)
labelField.build_vocab(train, dev)

# Batches of 64 examples, bucketed by text length and sorted within a batch.
# NOTE(review): `testLoader` iterates over the *dev* split, not a held-out
# test set - the name is kept because later cells may refer to it.
trainLoader, testLoader = data.BucketIterator.splits(
    (train, dev),
    shuffle=True,
    batch_size=64,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
)

In [3]:
def sample_des(loader=None, vectors=None):
    """Print a shape / label summary for the first batch drawn from a loader.

    Args:
        loader: Iterable of batches. Each batch must expose ``.text`` as a
            ``(token_ids, lengths)`` pair and ``.label`` as a tensor.
            Defaults to the notebook-level ``trainLoader``.
        vectors: Embedding table indexed by token id. Defaults to
            ``textField.vocab.vectors``.

    Returns:
        None. The summary is written to stdout.

    Raises:
        StopIteration: if ``loader`` yields no batches.
    """
    # Resolve the defaults at call time so the function keeps working when the
    # notebook-level objects are rebuilt after it was defined.
    if loader is None:
        loader = trainLoader
    if vectors is None:
        vectors = textField.vocab.vectors

    batch = next(iter(loader))
    inputs = vectors[batch.text[0]]  # look up one embedding vector per token id
    length = batch.text[1]
    labels = batch.label.type(torch.FloatTensor)

    print(f"inputs shape --> {inputs.shape}")
    print(f"\t Samples in batch -> {inputs.shape[0]}")
    print(f'\t Each sample has {inputs.shape[1]} words')
    print(f"\t Each word is represented by {inputs.shape[2]} length vector")

    # Work on plain Python numbers: a set of NumPy scalars prints as
    # `{np.int64(1), ...}` on NumPy >= 2, which would change the report text.
    label_values = labels.tolist()
    label_ids = {int(value) for value in label_values}

    print()
    print(f"labels shape --> {labels.shape}")
    print(f"\tlabels Counts --> {len(set(label_values))}")
    print(f"\tlabel types --> {label_ids}")
    # Author's note: binary classification, so log_sigmoid + MSE should be enough.

    print()
    print(f"Samples in this batch's ture length: \n{length}")

In [4]:
# Print the shape / label summary for one batch drawn from trainLoader.
sample_des()

inputs shape --> torch.Size([64, 557, 50])
	 Samples in batch -> 64
	 Each sample has 557 words
	 Each word is represented by 50 length vector

labels shape --> torch.Size([64])
	labels Counts --> 2
	label types --> {1, 2}

Samples in this batch's ture length: 
tensor([557, 557, 556, 556, 556, 555, 555, 555, 554, 554, 554, 554, 554, 553,
        553, 552, 552, 552, 551, 551, 550, 550, 549, 549, 549, 546, 546, 546,
        545, 545, 544, 544, 544, 544, 543, 543, 543, 542, 541, 541, 540, 540,
        539, 539, 538, 538, 537, 537, 535, 534, 533, 531, 530, 530, 529, 529,
        528, 527, 526, 526, 526, 526, 525, 525])


In [115]:
# Draw a batch in this cell instead of relying on an `inputs` variable left
# over from an earlier kernel session: no earlier cell defines it at notebook
# level, so the cell would raise NameError after Restart & Run All.
inspect_batch = next(iter(trainLoader))
inputs = textField.vocab.vectors[inspect_batch.text[0]]  # one embedding vector per token id

last_sample = inputs[-1]

# Show the embedding rows for token positions 300..334 of the last sample.
# A plain slice is enough: re-indexing the slice with [0, 1, ..., 34] selected
# the same rows and failed on sequences shorter than 335 tokens.
print(last_sample[300:335])

# Number of non-zero embedding components at every token position of the last
# sample (all-zero rows count 0). Left as the final expression so the notebook
# displays it.
(last_sample.numpy() != 0).astype(int).sum(axis=1)

tensor([[ 0.8794, -0.1118,  0.4338,  ...,  0.3718, -0.7137,  0.3018],
        [ 0.4270,  0.2086, -0.4812,  ...,  1.0771,  0.1696,  0.1380],
        [ 0.6805, -0.0393,  0.3019,  ..., -0.0733, -0.0647, -0.2604],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


array([50, 50, 50,  0, 50, 50, 50, 50, 50, 50, 50, 50, 50,  0, 50, 50,  0,
       50, 50, 50, 50, 50,  0,  0, 50,  0, 50, 50, 50, 50, 50, 50, 50,  0,
        0, 50, 50, 50, 50, 50, 50, 50,  0, 50, 50, 50, 50, 50, 50, 50, 50,
       50,  0, 50,  0, 50, 50, 50, 50, 50, 50,  0, 50, 50, 50, 50, 50, 50,
       50, 50,  0, 50,  0, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,  0, 50,
       50, 50,  0, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,  0, 50, 50, 50,
        0, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,  0,  0, 50,  0, 50, 50,
       50, 50, 50,  0, 50, 50, 50, 50, 50,  0,  0, 50, 50,  0, 50, 50, 50,
       50, 50,  0, 50, 50,  0, 50, 50, 50,  0, 50, 50, 50, 50, 50, 50,  0,
       50,  0, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,  0,  0, 50, 50,
       50, 50, 50, 50, 50, 50,  0, 50, 50, 50, 50,  0, 50,  0, 50, 50, 50,
       50, 50, 50, 50,  0, 50, 50, 50, 50, 50, 50, 50,  0, 50, 50, 50,  0,
       50, 50, 50,  0, 50, 50, 50, 50,  0, 50, 50,  0,  0, 50, 50, 50, 50,
       50, 50, 50,  0, 50

In [106]:
# `labels` is not defined by any earlier cell at notebook level, so take the
# labels of a freshly drawn training batch rather than relying on leftover
# kernel state.
labels = next(iter(trainLoader)).label

# Distinct label ids as sorted plain Python ints: deterministic order, and the
# display does not depend on NumPy's scalar repr.
sorted({int(value) for value in labels.tolist()})

[1, 2]