# word2vec训练词向量

In [2]:
import functools
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
#nltk.download()
#from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

In [2]:
def load_dataset(name, nrows=None):
    """Read one of the review TSV files from ``../data`` into a DataFrame.

    Args:
        name: Dataset key: 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns:
        The parsed ``pandas.DataFrame``. The number of rows is also printed.

    Raises:
        ValueError: If ``name`` is not one of the known dataset keys.
    """
    filenames = dict(
        unlabeled_train='unlabeledTrainData.tsv',
        labeled_train='labeledTrainData.tsv',
        test='testData.tsv',
    )
    filename = filenames.get(name)
    if filename is None:
        raise ValueError(name)
    # The files are tab-separated and use a backslash as the escape character.
    reviews = pd.read_csv(
        os.path.join('..', 'data', filename),
        sep='\t',
        escapechar='\\',
        nrows=nrows,
    )
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

### 读入无标签数据
用于训练生成word2vec词向量

In [22]:
# Load the unlabeled reviews (the corpus used to train the word vectors)
# and preview the first rows.
df = load_dataset('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


### 和第一个ipython notebook一样做数据的预处理
稍稍有一点不一样的是，我们留了一个选项（`remove_stopwords` 参数），可以去除停用词，也可以不去除停用词

In [3]:
# Stop words are read from a local file, one entry per line
# (alternative: set(stopwords.words('english')) from nltk.corpus).
# The file is opened in a `with` block so the handle is closed right after reading.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = {}.fromkeys([line.rstrip() for line in stopwords_file])

def clean_text(text, remove_stopwords=False):
    """Turn a raw (possibly HTML) text fragment into a list of lowercase words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, and the result is lowercased and
    split on whitespace.

    Args:
        text: The raw text to clean.
        remove_stopwords: When true, words found in ``eng_stopwords`` are dropped.

    Returns:
        A list of word strings (possibly empty).
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [tok for tok in tokens if tok not in eng_stopwords]

# English Punkt sentence tokenizer, loaded from the NLTK data resources
# (the 'punkt' resource must be available locally).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorator that reports how many times ``f`` has been called.

    A progress line is printed on the 1st, 1001st, 2001st, ... call, just
    before ``f`` itself runs. Arguments and the return value are passed
    through unchanged.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper around ``f`` that keeps ``f``'s name and docstring.
    """
    n = 0

    # functools.wraps copies __name__/__doc__ onto the wrapper so the
    # decorated function does not show up as "wrapped".
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped

@print_call_counts
def split_sentences(review):
    """Split one review into sentences and clean each of them.

    The stripped review is cut into sentences by the module-level
    ``tokenizer``; every non-empty sentence is passed through ``clean_text``.

    Args:
        review: The raw review text.

    Returns:
        A list with one word list per sentence.
    """
    cleaned = []
    for raw in tokenizer.tokenize(review.strip()):
        if raw:
            cleaned.append(clean_text(raw))
    return cleaned

In [None]:
%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

### 用gensim训练词嵌入模型

In [12]:
# Show timestamped INFO-level log records, so that progress messages emitted
# through the logging module during training become visible.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# Hyperparameters for word-vector training
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# File name that encodes the key hyperparameters,
# e.g. '300features_40minwords_10context.model'.
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

In [14]:
print('Training model...')
# Only the Word2Vec class is imported at the top of the notebook, so it is
# called directly (the module name `word2vec` is not in scope).
# NOTE: `size` and `init_sims` are the gensim < 4.0 spellings used throughout
# this notebook; gensim >= 4.0 renamed `size` to `vector_size`.
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
# The target directory is created first so a long training run is not lost
# to a missing folder.
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)
model.save(os.path.join(models_dir, model_name))

Training model...


### 看看训练的词向量结果如何

In [6]:
# Sanity check: for each group, print the word the model considers the odd one out.
for word_group in ("man woman child kitchen", 'france england germany berlin'):
    print(model.doesnt_match(word_group.split()))

kitchen
berlin


In [10]:
# Words the model ranks as most similar to "man", with their similarity scores.
model.most_similar("man")

[('woman', 0.6256189346313477),
 ('lady', 0.5953349471092224),
 ('lad', 0.576863169670105),
 ('person', 0.5407935380935669),
 ('farmer', 0.5382746458053589),
 ('chap', 0.536788821220398),
 ('soldier', 0.5292650461196899),
 ('men', 0.5261573791503906),
 ('monk', 0.5237958431243896),
 ('guy', 0.5213091373443604)]

In [11]:
# Words the model ranks as most similar to "queen", with their similarity scores.
model.most_similar("queen")

[('princess', 0.6749982833862305),
 ('maid', 0.6223365068435669),
 ('bride', 0.6201028227806091),
 ('belle', 0.6200867891311646),
 ('temple', 0.6171057224273682),
 ('stripper', 0.608874499797821),
 ('catherine', 0.6072724461555481),
 ('eva', 0.6019693613052368),
 ('dancer', 0.594109833240509),
 ('sylvia', 0.5933606624603271)]

In [12]:
# Words the model ranks as most similar to "awful", with their similarity scores.
model.most_similar("awful")

[('terrible', 0.7551683187484741),
 ('atrocious', 0.7340768575668335),
 ('horrible', 0.7315883040428162),
 ('dreadful', 0.7080680131912231),
 ('abysmal', 0.7010548114776611),
 ('horrendous', 0.6951696872711182),
 ('appalling', 0.691646933555603),
 ('horrid', 0.6708598136901855),
 ('amateurish', 0.6481891870498657),
 ('embarrassing', 0.6306308507919312)]