In [3]:
import pandas as pd
from bs4 import BeautifulSoup 
import re
import nltk
from nltk.corpus import stopwords
import nltk.data

In [4]:
# Stop-word sets keyed by language name, filled lazily on first use so the
# NLTK corpus is read at most once instead of once per call.
_STOPWORD_CACHE = {}


def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one document into a list of lower-case, letters-only words.

    Steps:
      1. Strip HTML markup with BeautifulSoup (lxml parser).
      2. Replace every character outside a-z / A-Z with a space, which
         drops digits and punctuation.
      3. Lower-case the text and split it on whitespace.
      4. Optionally drop English stop words.

    Parameters:
        review: the raw document text (may contain HTML).
        remove_stopwords: when true, words found in NLTK's English
            stop-word list are removed. Defaults to False.

    Returns:
        A list of word strings, in document order.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review, "lxml").get_text()

    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = _STOPWORD_CACHE.get("english")
        if stops is None:
            # First request: build the set once and remember it.
            stops = frozenset(stopwords.words("english"))
            _STOPWORD_CACHE["english"] = stops
        words = [w for w in words if w not in stops]

    # 5. Return a list of words
    return words

In [5]:
# Sentence splitter handed to review_to_sentences: NLTK's English Punkt
# model, loaded through nltk.data. If the resource is not installed yet,
# run this once first:
#     nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    The review is stripped of surrounding whitespace and handed to
    tokenizer.tokenize() to obtain raw sentence strings. Every non-empty
    sentence is then passed to review_to_wordlist together with the
    remove_stopwords flag.

    Returns a list of lists: one inner word list per non-empty sentence,
    in the order the tokenizer produced them.
    """
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if chunk  # empty sentences are skipped
    ]

In [6]:
# Load the unlabeled reviews: tab-separated, column names taken from the
# first row. quoting=3 is the value of csv.QUOTE_NONE, so quote characters
# inside the review text are treated as ordinary characters.
train = pd.read_csv(
    "unlabeledTrainData1.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [7]:
# Accumulates one word list per sentence across every review in `train`.
sentences = []

# print(...) with a single argument produces the same output on Python 2
# and also parses on Python 3.
print("Parsing sentences from training set")
print(len(train["review"]))
for review_index, review in enumerate(train["review"]):
    # Progress marker every 1000 reviews.
    if review_index % 1000 == 0:
        print("line: %s" % review_index)
    sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set
24996
line:
0


  'Beautiful Soup.' % markup)


line:
1000


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


line:
2000
line:
3000
line:
4000
line:
5000
line:
6000
line:
7000
line:
8000
line:
9000
line:
10000
line:
11000
line:
12000
line:
13000
line:
14000
line:
15000
line:
16000
line:
17000
line:
18000
line:
19000
line:
20000
line:
21000
line:
22000
line:
23000
line:
24000


In [9]:
# Total number of parsed sentences collected from the training reviews.
# Call form prints identically on Python 2 and is valid Python 3.
print(len(sentences))

266440


In [11]:
import logging

# Configure logging before gensim is imported/used so its INFO-level
# progress messages are shown with a timestamp.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300     # Word vector dimensionality
min_word_count = 160   # Minimum word count
num_workers = 4        # Number of threads to run in parallel
context = 10           # Context window size
downsampling = 1e-3    # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

# Call-form print: same output on Python 2, valid syntax on Python 3.
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)


2017-05-24 19:31:19,423 : INFO : collecting all words and their counts
2017-05-24 19:31:19,430 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-24 19:31:19,559 : INFO : PROGRESS: at sentence #10000, processed 225818 words, keeping 17776 word types
2017-05-24 19:31:19,617 : INFO : PROGRESS: at sentence #20000, processed 451941 words, keeping 24951 word types


Training model...


2017-05-24 19:31:19,674 : INFO : PROGRESS: at sentence #30000, processed 671563 words, keeping 30034 word types
2017-05-24 19:31:19,729 : INFO : PROGRESS: at sentence #40000, processed 898078 words, keeping 34352 word types
2017-05-24 19:31:19,785 : INFO : PROGRESS: at sentence #50000, processed 1117403 words, keeping 37765 word types
2017-05-24 19:31:19,841 : INFO : PROGRESS: at sentence #60000, processed 1338704 words, keeping 40724 word types
2017-05-24 19:31:19,895 : INFO : PROGRESS: at sentence #70000, processed 1561868 words, keeping 43334 word types
2017-05-24 19:31:19,953 : INFO : PROGRESS: at sentence #80000, processed 1781509 words, keeping 45720 word types
2017-05-24 19:31:20,007 : INFO : PROGRESS: at sentence #90000, processed 2005541 words, keeping 48138 word types
2017-05-24 19:31:20,062 : INFO : PROGRESS: at sentence #100000, processed 2227527 words, keeping 50213 word types
2017-05-24 19:31:20,112 : INFO : PROGRESS: at sentence #110000, processed 2447452 words, keeping 

In [13]:
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
# replace=True is requested here, so do this only once training is finished.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
# NOTE(review): the name advertises "40minwords", but the training cell sets
# min_word_count = 160. The name is left unchanged because other code may load
# the file by this literal -- confirm which value is intended and align them.
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-05-24 19:31:45,927 : INFO : precomputing L2-norms of word weight vectors
2017-05-24 19:31:46,038 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2017-05-24 19:31:46,040 : INFO : not storing attribute syn0norm
2017-05-24 19:31:46,043 : INFO : not storing attribute cum_table
2017-05-24 19:31:46,147 : INFO : saved 300features_40minwords_10context


In [12]:
# model.doesnt_match("man woman child kitchen".split())

2017-05-24 19:31:41,155 : INFO : precomputing L2-norms of word weight vectors


'kitchen'

In [14]:
# model.doesnt_match("france england germany berlin".split())

'berlin'

In [15]:
# model.doesnt_match("paris berlin london austria".split())



'paris'

In [16]:
# model.most_similar("man")

[(u'woman', 0.6395095586776733),
 (u'boy', 0.6255266666412354),
 (u'doctor', 0.6172419786453247),
 (u'soldier', 0.6160985231399536),
 (u'cop', 0.5751296281814575),
 (u'scientist', 0.5625811219215393),
 (u'businessman', 0.5614330768585205),
 (u'lady', 0.5572729110717773),
 (u'guy', 0.5551478266716003),
 (u'person', 0.5418464541435242)]

In [17]:
# model.most_similar("queen")

[(u'princess', 0.8047722578048706),
 (u'aunt', 0.7661514282226562),
 (u'bride', 0.7654191851615906),
 (u'sophie', 0.7429999113082886),
 (u'widow', 0.7419686317443848),
 (u'victoria', 0.7381371259689331),
 (u'ann', 0.7119592428207397),
 (u'femme', 0.7118977904319763),
 (u'elizabeth', 0.7108350396156311),
 (u'nun', 0.7031704187393188)]

In [18]:
# model.most_similar("awful")

[(u'terrible', 0.8109188079833984),
 (u'horrible', 0.8041103482246399),
 (u'atrocious', 0.7122700214385986),
 (u'dreadful', 0.7075834274291992),
 (u'laughable', 0.7063020467758179),
 (u'lame', 0.6759577393531799),
 (u'pathetic', 0.6611124277114868),
 (u'bad', 0.6580395698547363),
 (u'amateurish', 0.6485338807106018),
 (u'horrendous', 0.6412642002105713)]

In [19]:
# model['awful']

array([ -1.50569500e-02,   9.38825607e-02,   1.98630095e-02,
         5.87078594e-02,  -5.75170256e-02,  -4.14183224e-03,
         4.93640155e-02,   1.46144824e-02,   6.17245138e-02,
         4.84266318e-02,  -2.91249584e-02,   5.47946291e-03,
         1.51135445e-01,  -2.22954825e-02,   4.33592536e-02,
        -6.61959201e-02,   1.73671376e-02,   5.40489890e-02,
         4.44530249e-02,   1.23167880e-01,   5.67670316e-02,
        -1.92467757e-02,  -5.59160672e-02,  -9.72371250e-02,
        -4.79278862e-02,   5.89737147e-02,   7.31465369e-02,
         4.14881222e-02,   9.73541569e-03,  -3.57583873e-02,
         8.78614001e-03,  -6.01675734e-02,  -1.74424481e-02,
        -8.02147090e-02,   1.60583537e-02,   9.05642435e-02,
         3.74523434e-03,  -2.48872135e-02,  -1.79199856e-02,
        -1.12637408e-01,   1.08669341e-01,  -4.69968393e-02,
        -8.93272832e-02,   3.11536132e-03,   6.28254563e-02,
         6.59570694e-02,  -7.91858435e-02,  -3.38165946e-02,
         1.74626932e-02,

In [20]:
# Status message (Russian; roughly "word points obtained").
# Call-form print: same output on Python 2, valid syntax on Python 3.
print("Получены точки слов")

Получены точки слов


In [33]:
# Display the trained model's vector matrix as the cell output
# (presumably one row per vocabulary word -- confirm against the gensim
# version in use). numpy abbreviates large arrays in the repr.
model.wv.syn0

array([[ 0.16232468,  0.15843681, -0.10505293, ..., -0.01522027,
         0.037686  ,  0.03744352],
       [ 0.15215264,  0.018271  ,  0.07339182, ..., -0.02757786,
         0.03397735, -0.11259236],
       [ 0.03003093,  0.0047795 , -0.11992312, ..., -0.04843698,
         0.07112975, -0.07356728],
       ..., 
       [ 0.13182379,  0.03662432, -0.03617692, ...,  0.0137189 ,
        -0.11438576,  0.09479734],
       [ 0.04299678,  0.06154632, -0.06060285, ...,  0.09284201,
        -0.04943876,  0.03279798],
       [ 0.02506608,  0.09877607, -0.02216223, ...,  0.08128311,
        -0.04876532,  0.04317164]], dtype=float32)