#### Word2Vec

Word2Vec is a neural network implementation that learns distributed representations for words / distributed word vectors.

__import data__

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [3]:
test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)

In [4]:
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [5]:
test["review"].size

25000

In [6]:
train["review"].size

25000

In [7]:
unlabeled_train.size

100000

__clean data__

In [8]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [16]:
def review_to_wordlist(review, remove_stopwords=False):
    # Remove HTML
    review_text = BeautifulSoup(review).get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert words to lower case and split them
    words = review_text.lower().split()
    # Remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    return words

_Word2Vec expects sentences as lists of words_

In [10]:
import nltk.data

In [11]:
# Load punkt tokenizer to break up a paragraph into sentences

In [12]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [17]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    '''
    Function to split a review into parsed sentences. Returns a list of sentences, 
    where each sentence is a list of words. 
    '''
    # Creates list of strings (sentences)
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # convert list of strings into list of list of words
    for raw_sentence in raw_sentences:
        if(len(raw_sentence) > 0):
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    
    return sentences

In [18]:
sentences = []

In [20]:
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [21]:
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [22]:
len(sentences)

1062089

all reviews have been converted to sentences; one sentences is one list of words

In [23]:
print(sentences[:10])

[['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again'], ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent'], ['moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released'], ['some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'all', 'a

#### training and saving the model

In [24]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [25]:
num_features = 300 # word vector dim
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

In [26]:
from gensim.models import word2vec

2017-02-08 14:48:36,677 : INFO : 'pattern' package not found; tag filters are not available for English


In [27]:
model = word2vec.Word2Vec(sentences, workers=num_workers, 
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling)

2017-02-08 14:48:36,687 : INFO : collecting all words and their counts
2017-02-08 14:48:36,688 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-02-08 14:48:36,740 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-02-08 14:48:36,792 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2017-02-08 14:48:36,845 : INFO : PROGRESS: at sentence #30000, processed 671314 words, keeping 30034 word types
2017-02-08 14:48:36,899 : INFO : PROGRESS: at sentence #40000, processed 897814 words, keeping 34348 word types
2017-02-08 14:48:36,953 : INFO : PROGRESS: at sentence #50000, processed 1116962 words, keeping 37761 word types
2017-02-08 14:48:37,012 : INFO : PROGRESS: at sentence #60000, processed 1338403 words, keeping 40723 word types
2017-02-08 14:48:37,067 : INFO : PROGRESS: at sentence #70000, processed 1561579 words, keeping 43333 word types
2017-02-08 14:48:37,123 : INFO : PROGRESS: 

In [28]:
model.init_sims(replace=True)

2017-02-08 14:50:04,824 : INFO : precomputing L2-norms of word weight vectors


In [29]:
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-02-08 14:50:04,979 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2017-02-08 14:50:04,980 : INFO : not storing attribute cum_table
2017-02-08 14:50:04,981 : INFO : not storing attribute syn0norm
2017-02-08 14:50:05,563 : INFO : saved 300features_40minwords_10context


#### Exploring the model

In [30]:
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [31]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [32]:
model.doesnt_match("paris berlin london austria".split())

'paris'

In [33]:
model.most_similar("man")

[('woman', 0.6129059791564941),
 ('lad', 0.5809935927391052),
 ('lady', 0.5616000294685364),
 ('monk', 0.4992210268974304),
 ('farmer', 0.49889662861824036),
 ('men', 0.49541759490966797),
 ('guy', 0.4946671724319458),
 ('millionaire', 0.4924411475658417),
 ('businessman', 0.4894394278526306),
 ('person', 0.487354576587677)]

In [34]:
model.most_similar("queen")

[('princess', 0.6044710874557495),
 ('bride', 0.5868712663650513),
 ('mistress', 0.5574051141738892),
 ('goddess', 0.5508238673210144),
 ('duchess', 0.5441114902496338),
 ('victoria', 0.5438050031661987),
 ('stepmother', 0.5378952026367188),
 ('maid', 0.5345040559768677),
 ('showgirl', 0.5275346040725708),
 ('nun', 0.5255606174468994)]

In [35]:
model.most_similar("awful")

[('terrible', 0.7490795850753784),
 ('abysmal', 0.7023686170578003),
 ('atrocious', 0.7018066644668579),
 ('horrible', 0.7017901539802551),
 ('dreadful', 0.6742359399795532),
 ('horrendous', 0.6662415862083435),
 ('appalling', 0.6474866271018982),
 ('horrid', 0.6434177160263062),
 ('lousy', 0.5972024202346802),
 ('bad', 0.5899964570999146)]