# word2vec训练词向量

In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
# nltk.download()
# from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

In [2]:
def load_dataset(name, nrows=None):
    """Read one of the review TSV files from ``../data`` into a DataFrame.

    Args:
        name: Dataset key: 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Optional maximum number of rows to read; ``None`` reads all.

    Returns:
        The loaded ``pandas.DataFrame``. Its row count is also printed.

    Raises:
        ValueError: If *name* is not one of the known dataset keys.
    """
    file_names = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    file_name = file_names.get(name)
    if file_name is None:
        raise ValueError(name)

    # The files are tab-separated and use a backslash as the escape character.
    path = os.path.join('..', 'data', file_name)
    frame = pd.read_csv(path, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(frame)))
    return frame

### 读入无标签数据
用于训练生成word2vec词向量

In [3]:
# Load the unlabeled reviews; they are the corpus used to train the word vectors.
df = load_dataset('unlabeled_train')
df.head()  # preview the first rows

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


### 和第一个ipython notebook一样做数据的预处理
唯一不同的是，这里增加了一个可选参数 `remove_stopwords`：可以选择去除停用词，也可以保留停用词（默认保留）。

In [5]:
# Stop words are read from a local file, one word per line. An alternative
# source is set(stopwords.words('english')) from nltk.corpus.
# The file is opened in a `with` block so the handle is closed after reading,
# and the words are kept in a set because they are only used for membership tests.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = {line.rstrip() for line in stopwords_file}

def clean_text(text, remove_stopwords=False):
    """Turn a piece of raw review text into a list of lowercase words.

    HTML markup is stripped, every character that is not an ASCII letter is
    replaced by a space, and the result is lowercased and split on whitespace.

    Args:
        text: Raw text, possibly containing HTML.
        remove_stopwords: When true, drop words found in ``eng_stopwords``.

    Returns:
        The list of remaining words, in their original order.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

# Load the English model used to split text into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorate *f* so that a progress line is printed every 1000 calls.

    A line is printed on calls 1, 1001, 2001, ... The wrapper forwards all
    positional and keyword arguments to *f* and returns its result unchanged.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper around *f* that keeps *f*'s ``__name__`` and ``__doc__``.
    """
    import functools  # local import keeps this notebook cell self-contained

    n = 0  # number of calls made so far, shared with the closure below

    @functools.wraps(f)  # keep the wrapped function's name and docstring
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)

    return wrapped

@print_call_counts
def split_sentences(review):
    """Split one review into sentences and clean each of them.

    The review is stripped of surrounding whitespace and split with the
    module-level ``tokenizer``; every non-empty sentence is passed through
    ``clean_text`` with its default arguments.

    Args:
        review: The raw review text.

    Returns:
        A list with one word list per sentence.
    """
    cleaned = []
    for raw_sentence in tokenizer.tokenize(review.strip()):
        if raw_sentence:
            cleaned.append(clean_text(raw_sentence))
    return cleaned

In [6]:
%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

method split_sentences called 1 times


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)


method split_sentences called 1001 times
method split_sentences called 2001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 3001 times


  ' Beautiful Soup.' % markup)


method split_sentences called 4001 times
method split_sentences called 5001 times
method split_sentences called 6001 times
method split_sentences called 7001 times


  ' Beautiful Soup.' % markup)


method split_sentences called 8001 times
method split_sentences called 9001 times


  ' Beautiful Soup.' % markup)


method split_sentences called 10001 times
method split_sentences called 11001 times
method split_sentences called 12001 times
method split_sentences called 13001 times
method split_sentences called 14001 times
method split_sentences called 15001 times
method split_sentences called 16001 times
method split_sentences called 17001 times
method split_sentences called 18001 times
method split_sentences called 19001 times
method split_sentences called 20001 times
method split_sentences called 21001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 22001 times
method split_sentences called 23001 times
method split_sentences called 24001 times
method split_sentences called 25001 times
method split_sentences called 26001 times
method split_sentences called 27001 times
method split_sentences called 28001 times
method split_sentences called 29001 times
method split_sentences called 30001 times
method split_sentences called 31001 times
method split_sentences called 32001 times
method split_sentences called 33001 times
method split_sentences called 34001 times
method split_sentences called 35001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 36001 times
method split_sentences called 37001 times
method split_sentences called 38001 times
method split_sentences called 39001 times
method split_sentences called 40001 times
method split_sentences called 41001 times
method split_sentences called 42001 times
method split_sentences called 43001 times
method split_sentences called 44001 times


  ' Beautiful Soup.' % markup)


method split_sentences called 45001 times
method split_sentences called 46001 times
method split_sentences called 47001 times
method split_sentences called 48001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 49001 times
Wall time: 31min 45s
50000 reviews -> 537851 sentences


### 用gensim训练词嵌入模型

In [7]:
import logging

# Show INFO-level log records, each prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [10]:
# Hyper-parameters for word-vector training.
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

# Encode the main hyper-parameters in the file name of the saved model.
name_template = '{}features_{}minwords_{}context.model'
model_name = name_template.format(num_features, min_word_count, context)
print("model_name = ", model_name)

model_name =  300features_40minwords_10context.model


In [13]:
print('Training model...')
# NOTE(review): `size` and `init_sims` are the gensim 3.x spellings; newer
# gensim releases are understood to rename/deprecate them - confirm against
# the installed version before upgrading.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save under a descriptive name so the model can be loaded later with
# Word2Vec.load(). The target directory is created first so that a missing
# ../models folder cannot discard a finished training run.
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)
model.save(os.path.join(models_dir, model_name))

2019-03-16 22:32:25,699 : INFO : collecting all words and their counts
2019-03-16 22:32:25,701 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2019-03-16 22:32:25,939 : INFO : PROGRESS: at sentence #10000, processed 225072 words, keeping 17237 word types
2019-03-16 22:32:26,130 : INFO : PROGRESS: at sentence #20000, processed 443536 words, keeping 24570 word types
2019-03-16 22:32:26,287 : INFO : PROGRESS: at sentence #30000, processed 666343 words, keeping 29785 word types
2019-03-16 22:32:26,430 : INFO : PROGRESS: at sentence #40000, processed 886903 words, keeping 33939 word types
2019-03-16 22:32:26,558 : INFO : PROGRESS: at sentence #50000, processed 1103863 words, keeping 37503 word types
2019-03-16 22:32:26,687 : INFO : PROGRESS: at sentence #60000, processed 1327231 words, keeping 40738 word types
2019-03-16 22:32:26,840 : INFO : PROGRESS: at sentence #70000, processed 1550828 words, keeping 43603 word types
2019-03-16 22:32:26,983 : INFO : PROGRESS: at sentence #80000, processed 1772824 words, keeping 46155 word types
2019-03-16 22:32:27,136 : INFO : PROGRESS: at sentence #90000, processed 1987492 words, keeping 4832

2019-03-16 22:32:49,942 : INFO : EPOCH 1 - PROGRESS: at 35.48% examples, 242693 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:32:50,955 : INFO : EPOCH 1 - PROGRESS: at 38.28% examples, 242239 words/s, in_qsize 8, out_qsize 1
2019-03-16 22:32:51,960 : INFO : EPOCH 1 - PROGRESS: at 41.55% examples, 244416 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:32:52,979 : INFO : EPOCH 1 - PROGRESS: at 44.11% examples, 241986 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:32:54,000 : INFO : EPOCH 1 - PROGRESS: at 48.07% examples, 247145 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:32:55,013 : INFO : EPOCH 1 - PROGRESS: at 50.74% examples, 245744 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:32:56,046 : INFO : EPOCH 1 - PROGRESS: at 53.69% examples, 245384 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:32:57,056 : INFO : EPOCH 1 - PROGRESS: at 56.73% examples, 245695 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:32:58,070 : INFO : EPOCH 1 - PROGRESS: at 59.32% examples, 244199 words/s, in_qsiz

2019-03-16 22:33:56,320 : INFO : EPOCH 3 - PROGRESS: at 41.48% examples, 241957 words/s, in_qsize 8, out_qsize 1
2019-03-16 22:33:57,380 : INFO : EPOCH 3 - PROGRESS: at 44.70% examples, 242728 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:33:58,409 : INFO : EPOCH 3 - PROGRESS: at 47.30% examples, 240854 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:33:59,443 : INFO : EPOCH 3 - PROGRESS: at 50.57% examples, 242318 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:34:00,452 : INFO : EPOCH 3 - PROGRESS: at 52.97% examples, 239801 words/s, in_qsize 6, out_qsize 1
2019-03-16 22:34:01,510 : INFO : EPOCH 3 - PROGRESS: at 55.97% examples, 239828 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:34:02,550 : INFO : EPOCH 3 - PROGRESS: at 59.49% examples, 242108 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:34:03,567 : INFO : EPOCH 3 - PROGRESS: at 62.11% examples, 240826 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:34:04,589 : INFO : EPOCH 3 - PROGRESS: at 64.66% examples, 239300 words/s, in_qsiz

2019-03-16 22:35:03,560 : INFO : EPOCH 5 - PROGRESS: at 33.33% examples, 249316 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:35:04,581 : INFO : EPOCH 5 - PROGRESS: at 36.20% examples, 248662 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:35:05,605 : INFO : EPOCH 5 - PROGRESS: at 39.45% examples, 250237 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:35:06,647 : INFO : EPOCH 5 - PROGRESS: at 41.99% examples, 246756 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:35:07,654 : INFO : EPOCH 5 - PROGRESS: at 44.61% examples, 244813 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:35:08,667 : INFO : EPOCH 5 - PROGRESS: at 47.31% examples, 243467 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:35:09,670 : INFO : EPOCH 5 - PROGRESS: at 49.92% examples, 242015 words/s, in_qsize 7, out_qsize 0
2019-03-16 22:35:10,682 : INFO : EPOCH 5 - PROGRESS: at 51.86% examples, 237512 words/s, in_qsize 8, out_qsize 0
2019-03-16 22:35:11,726 : INFO : EPOCH 5 - PROGRESS: at 54.54% examples, 236358 words/s, in_qsiz

### 看看训练的词向量结果如何

In [15]:
# Query the trained vectors through model.wv; calling doesnt_match on the
# model object itself is deprecated in gensim and emits a warning.
print(model.wv.doesnt_match("man woman child kitchen".split()))
print(model.wv.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


  """Entry point for launching an IPython kernel.
  


In [17]:
# Use model.wv: most_similar on the model object itself is deprecated in gensim.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6647160053253174),
 ('lady', 0.6140276193618774),
 ('lad', 0.5904245376586914),
 ('guy', 0.5348434448242188),
 ('person', 0.5302190780639648),
 ('chap', 0.5207657814025879),
 ('men', 0.5132219791412354),
 ('priest', 0.5081614255905151),
 ('soldier', 0.5078833103179932),
 ('monk', 0.5074741840362549)]

In [18]:
# Use model.wv: most_similar on the model object itself is deprecated in gensim.
model.wv.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6432243585586548),
 ('maid', 0.6326397657394409),
 ('temple', 0.6207026243209839),
 ('stripper', 0.6190794706344604),
 ('belle', 0.6153777837753296),
 ('eva', 0.614830493927002),
 ('bride', 0.6097807884216309),
 ('housekeeper', 0.6069656014442444),
 ('rose', 0.6029557585716248),
 ('katherine', 0.599675178527832)]

In [20]:
# Use model.wv: most_similar on the model object itself is deprecated in gensim.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7801300287246704),
 ('atrocious', 0.7467272281646729),
 ('horrible', 0.7357441186904907),
 ('abysmal', 0.7168046832084656),
 ('dreadful', 0.7146291732788086),
 ('horrid', 0.6934999227523804),
 ('lousy', 0.6796210408210754),
 ('horrendous', 0.678307294845581),
 ('appalling', 0.6761072278022766),
 ('laughable', 0.6340218782424927)]