In [2]:
import torch
import torchtext
import jieba
import re
import pandas as pd
from tqdm.notebook import tqdm

# 1.训练Glove中文词向量

### 1.1将文本分词并去除停用词后按行写入文件

In [3]:
# Load the training split; rows containing missing values are dropped and the
# index is renumbered from 0.
df = pd.read_csv('../../datasets/THUCNews/train.csv').dropna().reset_index(drop=True)

# Read the stopword list (one entry per line). The `with` block closes the
# handle as soon as the list is built.
with open('../stopwords/cn_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]
# Set copy of the list so the per-token membership test in the loop below is
# O(1) instead of a linear scan over the whole list.
stopword_set = set(stopwords)

# Write the corpus: one title per line, tokens separated by single spaces.
# The directory name is the lowercase `stanford-glove` used by the training and
# loading cells (the mixed-case spelling only resolves on case-insensitive
# filesystems). UTF-8 is requested explicitly so the Chinese text does not
# depend on the platform's default encoding.
with open('./stanford-glove/THUCNews.txt', 'w', encoding='utf-8') as corpus_file:
    for title in tqdm(df['title']):
        # Keep only characters in the range U+4E00–U+9FA5; this removes
        # punctuation and also digits, Latin letters and whitespace.
        title = re.sub(r'[^\u4e00-\u9fa5]', '', title)
        # Segment with jieba and drop stopwords.
        tokens = [token for token in jieba.cut(title.strip()) if token not in stopword_set]
        # A title with no remaining tokens still produces an (empty) line.
        corpus_file.write(' '.join(tokens) + '\n')

  0%|          | 0/501644 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/d1/4_gsqv2176z583_7rmpm27lh0000gn/T/jieba.cache
Loading model cost 0.300 seconds.
Prefix dict has been built successfully.


### 1.2使用stanford/GloVe工具训练

In [13]:
# One-time setup (run manually in a shell before executing this cell):
# clone the Stanford GloVe training repo from GitHub and rename the directory.
# git clone https://github.com/stanfordnlp/GloVe.git
# mv GloVe stanford-glove
# Then edit demo.sh so that the CORPUS= line reads CORPUS=THUCNews.txt

# Enter the repo directory, build the tools with make, then run demo.sh.
# The && chaining stops at the first step that fails.
!cd stanford-glove && make && sh demo.sh

mkdir -p build
mkdir -p build

$ build/vocab_count -min-count 5 -verbose 2 < THUCNews.txt > vocab.txt
BUILDING VOCABULARY
Processed 0 tokens.[11G100000 tokens.[11G200000 tokens.[11G300000 tokens.[11G400000 tokens.[11G500000 tokens.[11G600000 tokens.[11G700000 tokens.[11G800000 tokens.[11G900000 tokens.[11G1000000 tokens.[11G1100000 tokens.[11G1200000 tokens.[11G1300000 tokens.[11G1400000 tokens.[11G1500000 tokens.[11G1600000 tokens.[11G1700000 tokens.[11G1800000 tokens.[11G1900000 tokens.[11G2000000 tokens.[11G2100000 tokens.[11G2200000 tokens.[11G2300000 tokens.[11G2400000 tokens.[11G2500000 tokens.[11G2600000 tokens.[11G2700000 tokens.[11G2800000 tokens.[11G2900000 tokens.[11G3000000 tokens.[11G3100000 tokens.[11G3200000 tokens.[11G3300000 tokens.[11G3400000 tokens.[11G3500000 tokens.[11G3600000 tokens.[11G3700000 tokens.[0GProcessed 3745492 tokens.
Counted 192967 unique words.
Truncating vocabulary at min count 5.
Using vocabulary of size 52133.

# 2.加载Glove词向量

In [17]:
# Load the word vectors produced by the GloVe training run into a torchtext
# Vectors object.
glove_vectors_path = './stanford-glove/vectors.txt'
embeddings = torchtext.vocab.Vectors(name=glove_vectors_path)
# Cell result: size of the loaded vector matrix.
embeddings.vectors.size()

torch.Size([52134, 50])

In [21]:
# Inspect the vectors: look up and print the embedding of two sample words.
for sample_word in ('中国', '自然'):
    print(embeddings.get_vecs_by_tokens(sample_word))

tensor([-0.0794, -1.5969, -0.3386,  0.5643, -0.2867, -0.1013, -0.8174,  1.5820,
        -0.5419, -0.0539, -0.6309, -1.0643, -0.1052,  0.0314,  2.1225,  1.0641,
        -0.2695, -0.9872, -0.5653, -1.5975,  0.1454,  0.0120,  0.3458,  0.3492,
        -0.0757,  1.1362, -1.1327, -0.5551, -1.0531,  1.4729,  0.0657, -1.4755,
        -2.0678,  0.5270, -0.9490,  1.6898,  0.4204, -2.2277, -0.3642, -0.6742,
        -1.1886,  0.3295,  0.2152, -0.1416, -0.9151,  0.2209,  0.0389, -0.1031,
        -0.8291, -0.8683])
tensor([-0.3979,  0.1319, -0.3189,  0.5688, -0.4871,  0.7348, -0.2840, -0.2068,
        -0.0486, -0.4415, -0.4795,  0.2905, -0.6084, -0.0958, -0.2738, -0.3969,
        -0.6119,  0.2108, -0.0191, -0.3205, -0.2233,  0.0657,  0.1794, -0.0613,
        -0.4956,  0.3792, -0.0049,  0.0338, -0.1669,  0.4913, -0.6773, -0.1883,
         0.5105,  0.1810, -0.7138,  0.0232, -0.5813, -0.1872, -0.4647, -0.6754,
         0.3009, -0.1071,  0.3422, -0.5923, -0.1766,  0.0090,  0.6996,  0.2216,
         0.44

In [22]:
# Build an Embedding layer initialised from the pretrained vectors.
# With freeze=True the layer's parameters are frozen.
pretrained_weights = embeddings.vectors
embed = torch.nn.Embedding.from_pretrained(pretrained_weights, freeze=True)
# Show the weight matrix, then whether it will receive gradients.
for shown in (embed.weight, embed.weight.requires_grad):
    print(shown)

Parameter containing:
tensor([[-1.1892,  0.0564, -0.9085,  ..., -1.6788,  0.6869, -0.0925],
        [-1.8635,  0.6784, -0.2948,  ..., -1.6540,  0.8157, -0.4789],
        [ 0.2817,  0.2448, -0.2553,  ..., -0.4893, -0.5367,  0.1656],
        ...,
        [-0.1367, -0.1138, -0.0513,  ...,  0.1718,  0.0350,  0.0153],
        [-0.0685, -0.4505, -0.0600,  ...,  0.3472,  0.0804,  0.1158],
        [ 0.0308,  0.0091,  0.0297,  ...,  0.0334, -0.0045,  0.0352]])
False
