In [10]:
import logging
from time import time

# Timestamped INFO-level logging so library progress messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s %(asctime)s: %(message)s',
    datefmt='%H:%M:%S',
)

In [11]:
import spacy
# English pipeline with the 'ner' and 'parser' components switched off; the cells
# visible in this notebook only read token.lemma_ and token.is_stop.
# NOTE(review): 'en' is a shortcut-link model name. spaCy v3 dropped shortcut links,
# so this call presumably requires spaCy v2.x — confirm the pinned version.
nlp = spacy.load('en', disable=['ner', 'parser'])

In [12]:
from gensim.parsing.preprocessing import strip_tags, strip_non_alphanum, strip_multiple_whitespaces, strip_short

def preprocess(text, minsize=5):
    '''
    Clean one raw review so it is ready for tokenisation.

    Steps, in order:
        - coerce the input to str (so NaN / None cells do not raise)
        - strip HTML/XML tags
        - replace non-alphanumeric characters (punctuation included) with spaces;
          digits are kept
        - drop tokens shorter than `minsize` characters
        - trim surrounding whitespace and lowercase

    Parameters
    ----------
    text : any
        Raw review body; converted with str() before processing.
    minsize : int, optional
        Minimum token length to keep. Defaults to 5, the value that used to be
        hard-coded. Note that with a value of 3 or less, a missing cell
        (str(NaN) == 'nan') would survive as the token "nan".

    Returns
    -------
    str
        The cleaned, lowercased text (may be empty).
    '''
    cleaned = strip_tags(str(text))
    cleaned = strip_non_alphanum(cleaned)
    # Short tokens are removed before lowercasing; token length does not depend on case.
    cleaned = strip_short(cleaned, minsize=minsize)
    return cleaned.strip().lower()

def lemmatize(text):
    '''
    Turn pre-processed text into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`; tokens
    flagged as stop words are skipped and the lemma of every remaining token
    is kept, in document order.
    '''
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

In [13]:
import os
from pathlib import Path

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Folder holding the Amazon cell-phone review CSVs. Defaults to the original
# location; set AMAZON_REVIEW_DATA_DIR to run the notebook on another machine.
DATA_DIR = Path(os.environ.get('AMAZON_REVIEW_DATA_DIR', '/mnt/d/data/amazon-review-data'))

# Rows read from each CSV. 5 keeps the notebook fast for a smoke test but is far
# too little text to train useful embeddings; set to None to load every row.
N_ROWS = 5

dataset_review_meta = pd.read_csv(DATA_DIR / 'amazon-cell-phone-review-meta.csv', nrows=N_ROWS)
dataset_review = pd.read_csv(DATA_DIR / 'amazon-cell-phone-reviews.csv', nrows=N_ROWS)
# Lowercase the column names so the review text can be addressed as dataset_review['body'].
dataset_review.columns = [name.lower() for name in dataset_review.columns]

In [14]:
# Clean the reviews on the fly; nlp.pipe consumes this generator batch by batch.
preprocess_text = (preprocess(row) for row in dataset_review['body'])

# nlp.pipe already yields fully processed Doc objects, so the lemmas are read from
# them directly. The previous version called lemmatize(str(doc)), which converted
# each Doc back to text and ran the whole spaCy pipeline a second time per review.
# The filter is the same as in lemmatize(): skip stop words, keep each lemma.
# The deprecated n_threads argument is no longer passed.
lemmatize_text = [
    " ".join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(preprocess_text, batch_size=100)
]

In [15]:
df = pd.DataFrame({'lemmatize_text' : lemmatize_text})
# Every value produced upstream is a str, so dropna() on its own never removes a
# row: a review whose tokens were all filtered out arrives here as '' rather than
# NaN. Keep dropna() as a safeguard and additionally drop the blank documents so
# they are not fed to the phrase and embedding models as empty sentences.
df = df.dropna()
df = df[df['lemmatize_text'].str.strip() != '']
df

Unnamed: 0,lemmatize_text
0,samsung awhile absolute review detect stupid thing finally nokia phone buy garage wonder cheap take forever scroll endlessly usually phone number category simply press silent vibrate class ring immediately silence thing remember silent learn mission break nail process damage reason phone start give problem succeed open button big vibration strong reception shabby elevator remarkable consider phone service simply put pocket compare samsung phone work tone phone actually charge quickly great battery doesn potatoe phone convos bright large screen customize scroll purple orange overall phone serve purpose definitely pale comparison phone come sprint great
1,software issue nokia sprint phone messaging capability sprint system software patch come month spend sprint award win customer service admit problem nokia design phone incoming message retrieve quickly view offline provider sprint like people server connected burn minute check inbox compose reply sprint server respond innovation money make fine
2,great reliable phone purchase phone samsung easily comprehendable speed dialing available number voice dialing feature take long speed dialing thing bother game nokia take snake phone skydive bowl tennis ringer feature available choose different ringer person call ringtone available online download phone pretty stuck vibrate ringtone regular polyphonic tone cover reasonable price range
3,phone expect price receive phone little month receive accessory suppose phone company tell couple week receive shortly phone talk make phone call
4,phone great purpose offer buy not picture jaket super store employee mean barely close snap case dirty clear casing look dirty charge


In [62]:
from gensim.models.phrases import Phrases, Phraser

# One list of tokens per review.
sent = list(map(str.split, df['lemmatize_text']))

# Collect unigram/bigram statistics over the corpus, then wrap them in a Phraser
# that is used to rewrite sentences.
phrases = Phrases(
    sent,
    min_count=2,
    threshold=1,
    progress_per=1,
)
bigram = Phraser(phrases)

INFO 16:50:36: collecting all words and their counts
INFO 16:50:36: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO 16:50:36: PROGRESS: at sentence #1, processed 93 words and 171 word types
INFO 16:50:36: PROGRESS: at sentence #2, processed 142 words and 253 word types
INFO 16:50:36: PROGRESS: at sentence #3, processed 195 words and 336 word types
INFO 16:50:36: PROGRESS: at sentence #4, processed 217 words and 368 word types
INFO 16:50:36: collected 405 word types from a corpus of 239 words (unigram + bigrams) and 5 sentences
INFO 16:50:36: using 405 counts as vocab in Phrases<0 vocab, min_count=2, threshold=1, max_vocab_size=40000000>
INFO 16:50:36: source_vocab length 405
INFO 16:50:36: Phraser built with 0 phrasegrams


In [63]:
import multiprocessing
from gensim.models import Word2Vec

# Number of CPUs reported for this machine; used further down to choose the
# number of Word2Vec worker threads.
cores = multiprocessing.cpu_count()

In [64]:
from collections import defaultdict

# Apply the phrase model to the tokenised reviews held in `sent`.
sentences = bigram[sent]

# Count how often each token occurs across all sentences.
word_frequency = defaultdict(int)
# The loop variables must not be called `sent`: rebinding that name to a single
# review would make a re-run of this cell pass one token list (instead of the
# whole corpus) to bigram[...] and silently count characters.
for tokens in sentences:
    for token in tokens:
        word_frequency[token] += 1

# Ten most frequent tokens, most frequent first.
print(sorted(word_frequency, key=word_frequency.get, reverse=True)[:10])

['phone', 'sprint', 'nokia', 'great', 'samsung', 'thing', 'take', 'dialing', 'available', 'receive']


In [65]:
# Word2Vec hyper-parameters:
#   min_count - words with a total absolute frequency lower than this are ignored
#   window    - maximum distance between the current and predicted word within a sentence
#   size      - dimensionality of the feature vectors
#   sample    - threshold for configuring which higher-frequency words are randomly downsampled
#   alpha     - initial learning rate - (0.01, 0.05)
#   min_alpha - learning rate will linearly drop to min_alpha as training progresses
#   negative  - negative sampling rate. If set to 0, no negative sampling is used.
#   workers   - number of worker threads to train the model
wordvec_model = Word2Vec(
    min_count=3,
    window=2,
    size=300,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    # Leave one core free for the notebook, but never request fewer than one
    # worker: on a single-core machine `cores - 1` would be 0.
    workers=max(1, cores - 1),
)

In [66]:
# Build the vocabulary from the sentences and report the wall-clock time taken.
t = time()
wordvec_model.build_vocab(sentences, progress_per=1)
vocab_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))

INFO 16:50:37: collecting all words and their counts
INFO 16:50:37: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO 16:50:37: PROGRESS: at sentence #1, processed 93 words, keeping 79 word types
INFO 16:50:37: PROGRESS: at sentence #2, processed 142 words, keeping 113 word types
INFO 16:50:37: PROGRESS: at sentence #3, processed 195 words, keeping 145 word types
INFO 16:50:37: PROGRESS: at sentence #4, processed 217 words, keeping 156 word types
INFO 16:50:37: collected 172 word types from a corpus of 239 raw words and 5 sentences
INFO 16:50:37: Loading a fresh vocabulary
INFO 16:50:37: effective_min_count=3 retains 10 unique words (5% of original 172, drops 162)
INFO 16:50:37: effective_min_count=3 leaves 53 word corpus (22% of original 239, drops 186)
INFO 16:50:37: deleting the raw counts dictionary of 172 items
INFO 16:50:37: sample=6e-05 downsamples 10 most-common words
INFO 16:50:37: downsampling leaves estimated 1 word corpus (2.3% of prior 53)
INFO 16:50:3

Time to build vocab: 0.0 mins


In [67]:
# Train for two passes over the corpus and report the wall-clock time taken.
t = time()
wordvec_model.train(
    sentences,
    total_examples=wordvec_model.corpus_count,
    epochs=2,
    report_delay=1,
)
train_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(train_minutes))

INFO 16:50:37: training model with 3 workers on 10 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO 16:50:37: worker thread finished; awaiting finish of 2 more threads
INFO 16:50:37: worker thread finished; awaiting finish of 1 more threads
INFO 16:50:37: worker thread finished; awaiting finish of 0 more threads
INFO 16:50:37: EPOCH - 1 : training on 239 raw words (1 effective words) took 0.0s, 200 effective words/s
INFO 16:50:37: worker thread finished; awaiting finish of 2 more threads
INFO 16:50:37: worker thread finished; awaiting finish of 1 more threads
INFO 16:50:37: worker thread finished; awaiting finish of 0 more threads
INFO 16:50:37: EPOCH - 2 : training on 239 raw words (1 effective words) took 0.0s, 173 effective words/s
INFO 16:50:37: training on a 478 raw words (2 effective words) took 0.0s, 78 effective words/s


Time to train the model: 0.0 mins


In [68]:
# Precompute the L2-normalised word vectors.
# NOTE(review): per the gensim documentation, replace=True keeps only the
# normalised vectors to save memory, after which the model can no longer be
# trained — confirm that no later cell calls train() again.
wordvec_model.init_sims(replace=True)

INFO 16:50:37: precomputing L2-norms of word weight vectors


In [69]:
# Print every word kept in the trained model's vocabulary, one per line.
for word in wordvec_model.wv.vocab.keys():
    print(word)

samsung
thing
nokia
phone
take
great
sprint
dialing
available
receive


In [70]:
# Words whose vectors are closest to "phone".
# NOTE(review): the training log in this notebook reports roughly 1 effective word
# per epoch, so the vectors are presumably still close to their random
# initialisation; similarity scores near zero are expected until the model is
# trained on far more text.
wordvec_model.wv.most_similar(positive=["phone"])

[('dialing', 0.07674398273229599),
 ('available', 0.023674942553043365),
 ('thing', -0.005188621114939451),
 ('great', -0.008474580012261868),
 ('sprint', -0.034669507294893265),
 ('samsung', -0.037437330931425095),
 ('take', -0.04031497612595558),
 ('receive', -0.06873968988656998),
 ('nokia', -0.07171442359685898)]