# Purpose
The purpose of this kernel is to learn a vector for each word from a large corpus of text by training a word2vec model.

# Importing Libraries

In [1]:
# importing dataframes and array operations
import pandas as pd
import numpy as np
# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup 
import re # for regular expression

# Stopwords can be useful to understand the semantics of the sentence.
# Therefore stopwords are not removed while creating the word2vec model.
# But they will be removed while averaging feature vectors.
from nltk.corpus import stopwords

# Reading the data

In [2]:
# Load the tab-separated review file; the first row holds the column names
# and quoting=3 makes the parser treat quote characters as ordinary text.
train = pd.read_csv(
    "../input/word2vec/unlabeledTrainData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [3]:
# Preview the first few rows of the loaded data
train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [4]:
# Count missing (NaN/None) values per column.
# Note: isnull() does not flag empty strings, so empty reviews are not detected here.
train.isnull().sum()

id        0
review    0
dtype: int64

In [5]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    """Convert a raw piece of review text into a list of lowercase words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stopwords from
            the result. Defaults to False.

    Returns:
        list of str: the lowercase, letters-only tokens in their original
        order. No lemmatization or stemming is applied.
    """
    # 1. Removing html tags. The parser is named explicitly so that the result
    #    does not depend on which optional parsers happen to be installed and
    #    Beautiful Soup does not warn about having to guess one.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Replacing every non-letter character with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Converting to lower case and splitting on whitespace.
    words = review_text.lower().split()
    # 4. Optionally remove stopwords (a set gives fast membership tests).
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

In [6]:
# word2vec expects a list of lists (one list of words per sentence).
# Using the punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
# Uncomment the next line on a first run if the NLTK data files are not installed yet.
#nltk.download('popular')

# Load the English punkt sentence tokenizer from the NLTK data files.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text.
        tokenizer: Object with a ``tokenize(text)`` method that returns the
            sentences of ``text`` as strings.
        remove_stopwords: Passed through to ``review_wordlist``.

    Returns:
        list of list of str: one word list per non-empty sentence.
    """
    # Surrounding whitespace is stripped before sentence splitting.
    pieces = tokenizer.tokenize(review.strip())
    # Empty sentences are skipped; every other one becomes a list of words.
    return [
        review_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

# Pre-processing the reviews

In [8]:
# Collect the tokenized sentences of every review into one flat list
# (each element is the word list of a single sentence).
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_sentences(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


# Model creation  

In [9]:
# Train a Word2Vec model on the parsed sentences, then normalise and save it.
from gensim.models import word2vec

# Hyperparameters
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of parallel threads
context = 10          # context window size
downsampling = 1e-3   # (0.001) downsample setting for frequent words

print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Make the model more memory efficient. NOTE(review): per the gensim docs,
# replace=True keeps only the normalised vectors, so the model presumably
# cannot be trained further after this call - confirm before reusing it.
model.init_sims(replace=True)

# Persist the model for later use; it can be loaded with Word2Vec.load().
print("Saving the model")
model_name = "300features_40minwords_10context"
model.save(model_name)


Training model....
Saving the model


# Model evaluation

In [10]:
# A few sanity checks: show the word that fits least well with the others.
model.wv.doesnt_match(["man", "woman", "king", "queen", "princess", "dog"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'king'

In [11]:
# Every token is lowercased during preprocessing, so query words must be
# lowercase too; an uppercase "USA" can never be in the model's vocabulary.
model.wv.doesnt_match("europe africa usa turkey".split())

'turkey'

In [12]:
# Words whose vectors are most similar to "best", with their similarity scores.
model.wv.most_similar("best")

[('finest', 0.7775355577468872),
 ('worst', 0.7266291975975037),
 ('funniest', 0.7038780450820923),
 ('greatest', 0.6990966200828552),
 ('weakest', 0.678032398223877),
 ('scariest', 0.5999232530593872),
 ('dumbest', 0.5607209205627441),
 ('strongest', 0.5182027816772461),
 ('poorest', 0.5136731266975403),
 ('coolest', 0.5106767416000366)]

In [13]:
# Words whose vectors are most similar to "boring", with their similarity scores.
model.wv.most_similar("boring")

[('dull', 0.8061192631721497),
 ('tedious', 0.7716568112373352),
 ('pointless', 0.7284297943115234),
 ('uninteresting', 0.6732499599456787),
 ('predictable', 0.6696414351463318),
 ('repetitive', 0.6669901609420776),
 ('bland', 0.6216309070587158),
 ('pretentious', 0.6124736070632935),
 ('confusing', 0.6114941835403442),
 ('lame', 0.6083453893661499)]

In [14]:
# Analogy-style query: rank words that score high against 'man' and 'woman'
# and low against 'princess'.
# NOTE(review): gensim documents this as the multiplicative combination
# objective, which is presumably why scores can exceed 1 - confirm.
model.wv.most_similar_cosmul(positive=['man', 'woman'], negative=['princess'])

[('person', 1.0251609086990356),
 ('guy', 0.9356861710548401),
 ('men', 0.9032775163650513),
 ('someone', 0.9017495512962341),
 ('lady', 0.8675357699394226),
 ('people', 0.8628487586975098),
 ('lad', 0.8474027514457703),
 ('women', 0.8137201070785522),
 ('murderer', 0.802432656288147),
 ('teenager', 0.7992427945137024)]

In [15]:
# Shape of the learned embedding matrix: (vocabulary size, vector dimensionality).
# `vectors` replaces the deprecated `syn0` alias, which emits a deprecation warning.
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(13056, 300)