In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup

import nltk.data
from nltk.corpus import stopwords
nltk.download()

from gensim.models import word2vec

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Importing training and testing data.

In [2]:
train = pd.read_csv("labeledTrainData.tsv", header=0,  delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

print(f"Read {train['review'].size} labeled train reviews, {test['review'].size} labeled test reviews, " \
 "and {unlabeled_train['review'].size} unlabeled reviews\n")

Read 25000 labeled train reviews, 25000 labeled test reviews, and {unlabeled_train['review'].size} unlabeled reviews



Function for preprocessing raw reviews. \
Removes HTML elements, non-alphanumeric characters, and stopwords.

In [3]:
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z0-9_]"," ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

Function for iterating over raw data and cleaning is using the “review_to_wordlist” function.

In [4]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist( raw_sentence, remove_stopwords))
    return sentences

Cleaning data

In [None]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Define model parameters

In [6]:
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

Training model

In [7]:
model = word2vec.Word2Vec(sentences, 
            workers=num_workers,
            vector_size=num_features, 
            min_count = min_word_count,
            window = context, 
            sample = downsampling)

init_sims will make the model much more memory-efficient, but prevents further training.

In [8]:
model.init_sims(replace=True)

  model.init_sims(replace=True)


save model

In [9]:
model_name = "300features_40minwords_10context"
model.save(model_name)