In [8]:
import pandas as pd
#from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from gensim.models import word2vec
import pickle
import nltk.data
import os
# Load the English punkt tokenizer through nltk.data.load.
# The resource must already be available to NLTK — presumably installed with
# nltk.download('punkt'); confirm for your environment.
# NOTE(review): no cell in this notebook uses `tokenizer` to split text
# (comments are split on whitespace) — confirm before removing it.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Read the labelled (train) and unlabelled (test) comment files.
path = '../input/'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'


train = pd.read_csv(TRAIN_DATA_FILE, header=0)
test = pd.read_csv(TEST_DATA_FILE, header=0)

# Verify the number of comments that were read
print("Read %d labeled train reviews and  %d unlabelled test reviews" % (len(train),len(test)))

# Train and test comments are concatenated; missing comments become the
# placeholder "_na_" (the regex below then turns its underscores into spaces).
all_comments = train['comment_text'].fillna("_na_").tolist() + test['comment_text'].fillna("_na_").tolist() 


# Write one comment per line. Every character that is not an ASCII letter
# (including embedded newlines) is replaced by a space, so each comment is
# guaranteed to occupy exactly one line of the output file.
# The file is only written here, so it is opened write-only, with an explicit
# encoding so the result does not depend on the platform default.
with open("all_comments.csv", "w", encoding="utf-8") as comments_file:
    for comment in all_comments:
        comment = re.sub("[^a-zA-Z]"," ",str(comment))
        comments_file.write("%s\n" % comment)
        

##### 1. Preprocess Text Reviews For Creating Word Vectors

The <b>sentences</b> iterable can be simply a list, but for larger corpora, consider a generator that streams the sentences directly from disk/network, without storing everything in RAM. See BrownCorpus, Text8Corpus or LineSentence in the gensim.models.word2vec module for such examples.

<b>min_count</b> ignore all words and bigrams with total collected count lower than this.

<b>threshold</b> represents a score threshold for forming the phrases (higher means fewer phrases). A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold. see the scoring setting.

<b>max_vocab_size</b> is the maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. The default of 40M needs about 3.6GB of RAM; increase/decrease max_vocab_size depending on how much available memory you have.

<b>delimiter</b> is the glue character used to join collocation tokens, and should be a byte string (e.g. `b'_'`).

<b>scoring</b> specifies how potential phrases are scored for comparison to the threshold setting. scoring can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting scoring to a string:

‘default’: from “Efficient Estimation of Word Representations in Vector Space” by
Mikolov, et al.: `(count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold`, where <b>N</b> is the total vocabulary size.
<b>npmi</b>: normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Collocation Extraction” by Gerlof Bouma: `ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / -ln(prop(worda followed by wordb))`, where prop(n) is the count of n / the count of everything in the entire corpus.
‘npmi’ is more robust when dealing with common words that form part of common bigrams, and ranges from -1 to 1, but is slower to calculate than the default.

To use a custom scoring function, create a function with the following parameters and set the scoring parameter to the custom function. You must use all the parameters in your function call, even if the function does not require all the parameters.

- <b>worda_count</b>: number of occurrences in sentences of the first token in the phrase being scored
- <b>wordb_count</b>: number of occurrences in sentences of the second token in the phrase being scored
- <b>bigram_count</b>: number of occurrences in sentences of the phrase being scored
- <b>len_vocab</b>: the number of unique tokens in sentences
- <b>min_count</b>: the min_count setting of the Phrases class
- <b>corpus_word_count</b>: the total number of (non-unique) tokens in sentences

A scoring function without any of these parameters (even if the parameters are not used) will raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.

<b>common_terms</b> is an optional list of “stop words” that won’t affect frequency count of expressions containing them.

In [10]:
class FileToComments(object):
    """Re-iterable stream of tokenized comments read from a text file.

    Every line of ``filename`` is treated as one comment and is yielded as a
    list of lower-cased, whitespace-separated tokens.  The file is re-opened
    on each ``__iter__`` call, so one instance can be iterated several times
    without keeping the whole corpus in memory.
    """

    def __init__(self, filename):
        """Store the path to read and load the NLTK English stop-word set.

        Args:
            filename: Path of a text file holding one comment per line.
        """
        self.filename = filename
        # Not consulted by __iter__ (stop words are left in the stream); kept
        # so that code reading the ``stop`` attribute keeps working.
        self.stop = set(nltk.corpus.stopwords.words('english'))

    @staticmethod
    def _comment_to_wordlist(comment):
        """Return ``comment`` lower-cased and split on whitespace."""
        return str(comment).lower().split()

    def __iter__(self):
        """Yield one token list per line of ``self.filename``.

        Yields:
            list[str]: the tokens of a line; an empty list for a blank line.
        """
        # The context manager closes the handle once the generator is
        # exhausted or discarded, instead of leaking one handle per pass.
        with open(self.filename, 'r') as comments_file:
            for line in comments_file:
                yield self._comment_to_wordlist(line)
        
# Rebind `all_comments` to a lazy, re-iterable stream over the one-comment-per-line file.
all_comments = FileToComments(filename='all_comments.csv')

In [11]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Learn bigram statistics from every comment, then wrap the trained model in
# the Phraser that is used to transform comments further down.
bigram = Phrases(
    all_comments,
    min_count=30,   # pairs seen fewer than 30 times are ignored
    threshold=15,   # score a pair must exceed to be joined into one token
)
bigram_phraser = Phraser(bigram)

In [12]:
# Apply the bigram phraser to every comment; the result is held in memory.
all_tokens = [bigram_phraser[token_list] for token_list in all_comments]

stops = set(stopwords.words("english"))

# Filter English stop words out of each phrase-merged comment, keeping one
# (possibly empty) token list per comment.
clean_all_tokens = [
    [word for word in comment_tokens if word not in stops]
    for comment_tokens in all_tokens
]
print('tokens cleaned')

In [13]:
import pickle

# Serialise the cleaned token lists to disk so a later cell can reload them
# instead of recomputing the tokenization.
with open('tokenized_all_comments.pickle', 'wb') as pickle_file:
    pickle.dump(
        clean_all_tokens,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
print('files saved to tokenized_all_comments.pickle...')

##### 2. Training and Saving Your Model

With the list of nicely parsed sentences, we're ready to train the model. There are a number of parameter choices that affect the run time and the quality of the final model that is produced. For details on the algorithms below, see the word2vec API documentation as well as the Google documentation. 

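 - Architecture: Architecture options are skip-gram or continuous bag of words (CBOW). We found that skip-gram was very slightly slower but produced better results. Note that gensim's `Word2Vec` defaults to CBOW (`sg=0`); pass `sg=1` to train skip-gram. The code below does not set `sg`, so it trains CBOW.
 - Training algorithm: Hierarchical softmax or negative sampling. gensim's default is negative sampling (`hs=0`, `negative=5`), which is what the code below uses. For us, the default worked well.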
 - Downsampling of frequent words: The Google documentation recommends values between .00001 and .001. For us, values closer to 0.001 seemed to improve the accuracy of the final model.
 - Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300. 
 - Context / window size: How many words of context should the training algorithm take into account? 10 seems to work well for hierarchical softmax (more is better, up to a point).
 - Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems.
 - Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. In this case, since each movie occurs 30 times, we set the minimum word count to 40, to avoid attaching too much importance to individual movie titles. This resulted in an overall vocabulary size of around 15,000 words. Higher values also help limit run time. (Note: the movie-related figures in this item do not match this notebook — the code below trains on comments with `min_word_count = 20`.)

In [15]:
import logging

# Emit gensim's progress messages with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Reload the tokenized comments written by the pickling cell.
# NOTE(review): pickle.load can execute code embedded in the file — only load
# pickles produced by this notebook.
with open('tokenized_all_comments.pickle', 'rb') as pickle_file:
    all_comments = pickle.load(pickle_file)

# Training hyper-parameters.
num_features = 300     # dimensionality of each word vector
min_word_count = 20    # words seen fewer times than this are dropped
num_workers = 16       # training threads — tune to the machine's core count
context = 10           # context window size
downsampling = 1e-3    # down-sampling setting for frequent words

# Build the vocabulary and train in a single call (this will take some time).
print("Training model...")
model = word2vec.Word2Vec(
    all_comments,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Keep only the normalized vectors to reduce memory use.
# NOTE(review): per the gensim docs a model cannot be trained further after
# init_sims(replace=True) — confirm before relying on later retraining.
model.init_sims(replace=True)

# The file name encodes the main hyper-parameters, e.g. 300features_20minwords_10context.
model_name = "%sfeatures_%sminwords_%scontext" % (num_features,min_word_count,context)
model.save(model_name)

In [16]:
# You can load the model later using this:
#from gensim.models import Word2Vec
#import gensim
#w2v_model = Word2Vec.load("300features_20minwords_10context")

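# You can also continue training an existing model by loading it and calling train() again.
# NOTE: this only works for a model saved WITHOUT init_sims(replace=True); a model whose
# raw vectors were replaced by normalized ones is effectively read-only.
# I'll probably publish another iteration in the next few days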