In [35]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [66]:
file = pd.read_csv("../data.csv")
file_cleaned = file.dropna().drop_duplicates().reset_index(drop=True).rename(columns={'review':'title'})

In [67]:
file_cleaned.rating.value_counts()/len(file_cleaned)

5.0    0.676095
4.0    0.203063
3.0    0.075295
2.0    0.024383
1.0    0.021165
Name: rating, dtype: float64

In [68]:
file_cleaned[file_cleaned.rating==0]

Unnamed: 0,title,rating


In [69]:
file_cleaned = file_cleaned[file_cleaned.rating!=0]

In [70]:
file_cleaned.rating.value_counts()/len(file_cleaned)

5.0    0.676095
4.0    0.203063
3.0    0.075295
2.0    0.024383
1.0    0.021165
Name: rating, dtype: float64

In [71]:
def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = remove_polish_letters(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text  

In [72]:
def preprocess_text(sen):
    # Remove all the special characters
    sentence = sub(r'\W', ' ', str(sen))

    # remove all single characters
    sentence= sub(r'\s+[a-zA-Z]\s+', ' ', sentence)

    # Remove single characters from the start
    sentence = sub(r'\^[a-zA-Z]\s+', ' ', sentence) 

    # Substituting multiple spaces with single space
    sentence = sub(r'\s+', ' ', sentence, flags=re.I)

    # Removing prefixed 'b'
    sentence = sub(r'^b\s+', '', sentence)

    # Converting to Lowercase
    sentence= sentence.lower()


    return sentence

In [73]:
file_cleaned.title = file_cleaned.title.apply(lambda x: preprocess_text(x))

In [74]:
file_model = file_cleaned.copy()
file_model = file_model[file_model.title.str.len()>1]

In [75]:
sent = [row for row in file_model.title]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

INFO - 12:21:25: collecting all words and their counts
INFO - 12:21:25: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:21:43: collected 1219 word types from a corpus of 4718372 words (unigram + bigrams) and 10253 sentences
INFO - 12:21:43: using 1219 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 12:21:43: source_vocab length 1219
INFO - 12:21:43: Phraser built with 0 phrasegrams


'the product does exactly as it should and is quite affordable did not realized it was double screened until it arrived so it was even better than had expected as an added bonus one of the screens carries small hint of the smell of an old grape candy used to buy so for reminiscent sake cannot stop putting the pop filter next to my nose and smelling it after recording dif you needed pop filter this will work just as well as the expensive ones and it may even come with pleasing aroma like mine did buy this product '

- min count = 3 - remove most unusual words from training embeddings, like words 'ssssuuuuuuuppppppeeeeeerrrr', which actually stands for 'super', and doesn't need additional training
- window = 4 - Word2Vec model will learn to predict given word from up to 4 words to the left, and up to 4 words to the right
- size = 300 - size of hidden layer used to predict surroundings of embedded word, which also stands for dimensions of trained embeddings
- sample = 1e-5 - probability baseline for subsampling most frequent words from surrounding of embedded word
- negative = 20 - number of negative (ones that shouldn't have been predicted while modeling selected pair of words) words that will have their corresponding weights updated while training on specific training example, along with positive word 

In [76]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 12:21:53: collecting all words and their counts
INFO - 12:21:53: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:21:55: collected 38 word types from a corpus of 4718372 raw words and 10253 sentences
INFO - 12:21:55: Loading a fresh vocabulary
INFO - 12:21:55: effective_min_count=3 retains 38 unique words (100% of original 38, drops 0)
INFO - 12:21:55: effective_min_count=3 leaves 4718372 word corpus (100% of original 4718372, drops 0)
INFO - 12:21:55: deleting the raw counts dictionary of 38 items
INFO - 12:21:55: sample=1e-05 downsamples 37 most-common words
INFO - 12:21:55: downsampling leaves estimated 73296 word corpus (1.6% of prior 4718372)
INFO - 12:21:55: estimated required memory for 38 words and 300 dimensions: 110200 bytes
INFO - 12:21:55: resetting layer weights


Time to build vocab: 0.03 mins


In [77]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 12:22:00: training model with 3 workers on 38 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 12:22:01: EPOCH 1 - PROGRESS: at 64.23% examples, 42931 words/s, in_qsize 3, out_qsize 2
INFO - 12:22:02: worker thread finished; awaiting finish of 2 more threads
INFO - 12:22:02: worker thread finished; awaiting finish of 1 more threads
INFO - 12:22:02: worker thread finished; awaiting finish of 0 more threads
INFO - 12:22:02: EPOCH - 1 : training on 4718372 raw words (72972 effective words) took 1.5s, 47098 effective words/s
INFO - 12:22:03: EPOCH 2 - PROGRESS: at 87.46% examples, 62171 words/s, in_qsize 6, out_qsize 1
INFO - 12:22:03: worker thread finished; awaiting finish of 2 more threads
INFO - 12:22:03: worker thread finished; awaiting finish of 1 more threads
INFO - 12:22:03: worker thread finished; awaiting finish of 0 more threads
INFO - 12:22:03: EPOCH - 2 : training on 4718372 raw words (74142 effective words) took 1.3s, 58995 effectiv

INFO - 12:22:26: worker thread finished; awaiting finish of 1 more threads
INFO - 12:22:26: worker thread finished; awaiting finish of 0 more threads
INFO - 12:22:26: EPOCH - 19 : training on 4718372 raw words (73178 effective words) took 1.2s, 61331 effective words/s
INFO - 12:22:27: EPOCH 20 - PROGRESS: at 89.41% examples, 63653 words/s, in_qsize 6, out_qsize 1
INFO - 12:22:27: worker thread finished; awaiting finish of 2 more threads
INFO - 12:22:27: worker thread finished; awaiting finish of 1 more threads
INFO - 12:22:27: worker thread finished; awaiting finish of 0 more threads
INFO - 12:22:27: EPOCH - 20 : training on 4718372 raw words (73486 effective words) took 1.2s, 61333 effective words/s
INFO - 12:22:28: EPOCH 21 - PROGRESS: at 89.19% examples, 62818 words/s, in_qsize 6, out_qsize 1
INFO - 12:22:28: worker thread finished; awaiting finish of 2 more threads
INFO - 12:22:28: worker thread finished; awaiting finish of 1 more threads
INFO - 12:22:28: worker thread finished; aw

Time to train the model: 0.69 mins


In [78]:
w2v_model.save("word2vec4.model")

INFO - 12:22:52: saving Word2Vec object under word2vec4.model, separately None
INFO - 12:22:52: not storing attribute vectors_norm
INFO - 12:22:52: not storing attribute cum_table
INFO - 12:22:52: saved word2vec4.model


Exporting preprocessed dataset for further steps (with replaced bigrams)

In [79]:
file_export = file_model.copy()
file_export['old_title'] = file_export.title
file_export.old_title = file_export.old_title.str.join(' ')
file_export.title = file_export.title.apply(lambda x: ' '.join(bigram[x]))
file_export.rating = file_export.rating.astype('int8')

In [80]:
file_export[['title', 'rating']].to_csv('dataset4.csv', index=False)