In [58]:
import pandas as pd
from torchtext import data
from torch.nn import init
import spacy

In [59]:
# Load the spaCy 'en_core_web_sm' pipeline once so tokenizer() can reuse it on every call.
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    """Split ``text`` into a list of token strings using the spaCy tokenizer.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with the ``.text`` of every token produced by ``spacy_en.tokenizer``.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [60]:
# Text field: sequential data, split with the spaCy-backed tokenizer() and lower-cased.
TEXT = data.Field(tokenize=tokenizer, lower=True, sequential=True)

# Label field: one non-sequential value per example.
# use_vocab: whether to use a Vocab object. If False, the data in this field
# should already be numerical. Default: True.
# NOTE: if use_vocab=True is set, build_vocab must be called for the field,
# otherwise an error is raised.
LABEL = data.Field(use_vocab=False, sequential=False)

In [61]:
# Load the training CSV with pandas purely to preview it; the torchtext dataset
# is built separately from the same file.
train_df = pd.read_csv('test_text/train.csv', sep=',')
train_df.head() # inspect the data format

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,99737,5233,The darker elements of misogyny and unprovoked...,1
1,10281,432,playing Malcolm McDowell,2
2,39662,1893,'re in luck .,3
3,72011,3677,these guys ' superhuman capacity,3
4,99388,5211,it 's waltzed itself into the art film pantheon .,4


In [62]:
# Build the training and validation datasets from the CSV files under 'test_text'.
# A field paired with None is ignored, so PhraseId and SentenceId do not appear
# on the resulting examples.
train, val = data.TabularDataset.splits(
    path='test_text',
    train='train.csv',
    validation='val.csv',
    format='csv',
    skip_header=True,
    fields=[
        ('PhraseId', None),    # ignored: its Field is None
        ('SentenceId', None),  # ignored: its Field is None
        ('Phrase', TEXT),
        ('Sentiment', LABEL),
    ],
)

In [63]:
# Show the first training example as a dict of field name -> stored value.
train.examples[0].__dict__

{'Phrase': ['the',
  'darker',
  'elements',
  'of',
  'misogyny',
  'and',
  'unprovoked',
  'violence',
  'suffocate',
  'the',
  'illumination',
  'created',
  'by',
  'the',
  'two',
  'daughters',
  'and'],
 'Sentiment': '1'}

In [64]:
train[0].__dict__.keys() # does not contain the PhraseId and SentenceId fields

dict_keys(['Phrase', 'Sentiment'])

In [65]:
# The bare string below is a no-op note kept verbatim. In English it reads:
# "Available pretrained word vectors: ..." followed by
# "# pretrained models provided by the GloVe embeddings".
'''
预训练词向量有:
charngram.100d
fasttext.en.300d
fasttext.simple.300d

# glove词嵌入提供的预训练模型
glove.42B.300d
glove.840B.300d
glove.twitter.27B.25d
glove.twitter.27B.50d
glove.twitter.27B.100d
glove.twitter.27B.200d
glove.6B.50d
glove.6B.100d
glove.6B.200d
glove.6B.300d
'''
# Construct the Vocab object for this field from one or more datasets.
# The keyword arguments are the parameters of the torchtext.vocab.Vocab class.
TEXT.build_vocab(
    train,
    # --- vocabulary contents ---
    # The list of special tokens (e.g., padding or eos) that will be prepended
    # to the vocabulary. Default: ['<unk>', '<pad>'].
    specials=['<unk>', '<pad>'],
    # The maximum size of the vocabulary, or None for no maximum. Default: None.
    max_size=2000,
    # The minimum frequency needed to include a token in the vocabulary.
    # Values less than 1 will be set to 1. Default: 1.
    min_freq=1,
    # --- pretrained vectors ---
    # One of either the available pretrained vectors or custom pretrained
    # vectors (see Vocab.load_vectors). Default: None.
    vectors='glove.6B.100d',
    # unk_init must be a function; if `vectors` is not set, unk_init must be None.
    # By default, out-of-vocabulary word vectors are initialized to zero vectors;
    # it can be any function that takes in a Tensor and returns a Tensor of the
    # same size. Default: torch.Tensor.zero_.
    unk_init=init.zeros_,
    # Directory given as the vectors cache (presumably where the pretrained
    # vector files are stored -- confirm against the torchtext docs).
    vectors_cache='vector_cache/',
)

In [66]:
TEXT.vocab # its type is torchtext.vocab.Vocab

<torchtext.vocab.Vocab at 0x1754d3ad730>

In [67]:
TEXT.vocab.stoi # string-to-index mapping: {word: id}

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001754D3AD730>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             'a': 4,
             'of': 5,
             'and': 6,
             '-': 7,
             'to': 8,
             '.': 9,
             "'s": 10,
             'in': 11,
             'is': 12,
             'that': 13,
             'it': 14,
             'as': 15,
             'with': 16,
             'for': 17,
             'its': 18,
             'film': 19,
             'an': 20,
             'movie': 21,
             'this': 22,
             '`': 23,
             'but': 24,
             'be': 25,
             'on': 26,
             'you': 27,
             'by': 28,
             'more': 29,
             "n't": 30,
             "'": 31,
             'his': 32,
             'one': 33,
             'about': 34,
             'not': 35,
             'at': 36,
             'than': 37,

In [68]:
TEXT.vocab.itos # index-to-string list: [word], where the list index is the id

['<unk>',
 '<pad>',
 'the',
 ',',
 'a',
 'of',
 'and',
 '-',
 'to',
 '.',
 "'s",
 'in',
 'is',
 'that',
 'it',
 'as',
 'with',
 'for',
 'its',
 'film',
 'an',
 'movie',
 'this',
 '`',
 'but',
 'be',
 'on',
 'you',
 'by',
 'more',
 "n't",
 "'",
 'his',
 'one',
 'about',
 'not',
 'at',
 'than',
 'from',
 'or',
 'all',
 '--',
 'like',
 'have',
 'are',
 'has',
 'so',
 'out',
 'story',
 '-rrb-',
 'up',
 'who',
 'good',
 'too',
 'most',
 'into',
 '-lrb-',
 'if',
 'their',
 'what',
 'time',
 'no',
 'characters',
 '...',
 'much',
 "''",
 'comedy',
 'i',
 'can',
 'your',
 'just',
 'life',
 'some',
 'does',
 'funny',
 'even',
 'little',
 'will',
 'well',
 'way',
 'very',
 'any',
 'been',
 'make',
 'only',
 'which',
 'he',
 'movies',
 'love',
 'bad',
 'do',
 'there',
 'new',
 'director',
 'work',
 'own',
 'enough',
 'her',
 'was',
 'they',
 'us',
 'old',
 'made',
 'other',
 'something',
 'action',
 'two',
 'would',
 'never',
 'best',
 'we',
 'many',
 'through',
 'people',
 'when',
 'off',
 'self'

In [69]:
TEXT.vocab.freqs # word frequencies

Counter({'the': 41360,
         'darker': 58,
         'elements': 170,
         'of': 26108,
         'misogyny': 23,
         'and': 25674,
         'unprovoked': 7,
         'violence': 191,
         'suffocate': 4,
         'illumination': 6,
         'created': 74,
         'by': 3190,
         'two': 926,
         'daughters': 22,
         'playing': 159,
         'malcolm': 8,
         'mcdowell': 4,
         "'re": 650,
         'in': 11309,
         'luck': 40,
         '.': 14161,
         'these': 534,
         'guys': 165,
         "'": 3127,
         'superhuman': 15,
         'capacity': 45,
         'it': 9419,
         "'s": 13595,
         'waltzed': 6,
         'itself': 419,
         'into': 1745,
         'art': 386,
         'film': 5354,
         'pantheon': 30,
         'new': 1031,
         'to': 18267,
         'see': 825,
         'pretty': 348,
         'funny': 1284,
         'stomp': 7,
         'could': 764,
         'be': 4041,
         'this': 4559,
    

In [70]:
# Vocabulary size: the max_size=2000 words plus the two special tokens ('<unk>', '<pad>').
len(TEXT.vocab.itos)

2002

In [71]:
TEXT.vocab.vectors # word-vector matrix

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.3620,  0.4363,  0.1054,  ...,  0.1954,  0.3780,  0.4060],
        [ 0.0186,  0.5835,  0.4911,  ..., -0.6450, -0.4011,  1.0333],
        [ 0.1054, -0.1024,  0.3480,  ..., -0.5547, -0.2831, -0.1007]])

In [72]:
TEXT.vocab.vectors.shape # 2002 words, word-vector dimension 100

torch.Size([2002, 100])

In [73]:
TEXT.vocab.stoi['good'] # id of the word 'good' in the vocabulary

52

In [74]:
# Look up the id of 'good' in the vocabulary and use it to index the row of the
# word-vector matrix that holds its embedding.
word_vec = TEXT.vocab.vectors[TEXT.vocab.stoi['good']]
print(word_vec.shape)
print(word_vec) # the word vector corresponding to the word 'good'


torch.Size([100])
tensor([-0.0308,  0.1199,  0.5391, -0.4370, -0.7394, -0.1534,  0.0811, -0.3856,
        -0.6880, -0.4163, -0.1318, -0.2492,  0.4410,  0.0859,  0.2087, -0.0636,
         0.0622, -0.0512, -0.1340,  1.1418,  0.0365,  0.4903, -0.2457, -0.4120,
         0.1235,  0.4134, -0.4840, -0.5424, -0.2779, -0.2601, -0.3848,  0.7866,
         0.1023, -0.2071,  0.4075,  0.3203, -0.5105,  0.4836, -0.0099, -0.3868,
         0.0350, -0.1670,  0.4237, -0.5416, -0.3032, -0.3698,  0.0828, -0.5254,
        -0.0645, -1.3980, -0.1487, -0.3533, -0.1118,  1.0912,  0.0959, -2.8129,
         0.4524,  0.4621,  1.6012, -0.2084, -0.2738,  0.7120, -1.0754, -0.0470,
         0.6748, -0.0658,  0.7582,  0.3941,  0.1551, -0.6472,  0.3280, -0.0317,
         0.5290, -0.4389,  0.6740,  0.4214, -0.1198, -0.2178, -0.2976, -0.1351,
         0.5990,  0.4653, -0.5826, -0.0232, -1.5442,  0.0190, -0.0159,  0.0245,
        -0.5802, -0.6766, -0.0404, -0.4404,  0.0833,  0.2004, -0.7550,  0.1692,
        -0.2657, -0.52