In [1]:
import logging
# Emit INFO-and-above records as "<LEVEL> <HH:MM:SS>: <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s %(asctime)s: %(message)s',
    datefmt='%H:%M:%S',
)

In [2]:
import spacy 
# Load the English pipeline registered under the 'en' shortcut with the
# named-entity recogniser and dependency parser switched off; the cells below
# only read `token.lemma_` and `token.is_stop`.
nlp = spacy.load(
    'en',
    disable=['ner', 'parser'],
)

In [3]:
from gensim.parsing.preprocessing import strip_tags, strip_non_alphanum, strip_multiple_whitespaces, strip_short

def preprocess(text, minsize=5):
    '''
    Clean one raw review so that it is ready for lemmatization.

    Steps, applied in this order:
        - coerce the input to ``str`` (so non-string cells are accepted)
        - strip HTML-style tags (gensim ``strip_tags``)
        - strip non-alphanumeric characters such as punctuation
          (gensim ``strip_non_alphanum``); digits are NOT removed
        - drop tokens shorter than ``minsize`` characters (gensim ``strip_short``)
        - trim leading/trailing whitespace and lowercase the result

    Stop words are not removed here; that is handled during lemmatization.

    Parameters
    ----------
    text : object
        Raw review text; converted with ``str()`` before processing.
    minsize : int, optional
        Minimum token length to keep. Defaults to 5, the value this notebook
        has always used.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    '''
    cleaned = strip_tags(str(text))
    cleaned = strip_non_alphanum(cleaned)
    cleaned = strip_short(cleaned, minsize=minsize)
    return cleaned.strip().lower()

def lemmatize(text):
    '''
    Lemmatize pre-processed text with the module-level spaCy pipeline ``nlp``.

        - stop-word tokens are skipped
        - every remaining token is replaced by its lemma

    Returns the kept lemmas joined by single spaces.
    '''
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

In [5]:
from pathlib import Path

import pandas as pd

# Show full cell contents instead of truncating long review text.
pd.set_option('display.max_colwidth', None)

# Directory holding the Amazon cell-phone review CSVs; change this one line
# when running on a different machine.
DATA_DIR = Path('/mnt/d/data/amazon-review-data')
# Number of rows read from each CSV (a small sample for quick iteration).
# pandas reads the whole file when nrows is None.
N_ROWS = 5

dataset_review_meta = pd.read_csv(DATA_DIR / 'amazon-cell-phone-review-meta.csv', nrows=N_ROWS)
# Lower-case the column names as part of the load so that later cells can
# refer to e.g. 'body' regardless of the header's original casing.
dataset_review = (
    pd.read_csv(DATA_DIR / 'amazon-cell-phone-reviews.csv', nrows=N_ROWS)
    .rename(columns=str.lower)
)

In [6]:
# Lazily clean every review body; nlp.pipe consumes this generator in batches.
preprocess_text = (preprocess(row) for row in dataset_review['body'])
# Each `doc` yielded by nlp.pipe is already parsed, so the lemma string is built
# straight from its tokens (stop words skipped) instead of converting the doc
# back to text and parsing it a second time.
lemmatize_text = [
    " ".join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(preprocess_text, batch_size=100)
]

In [8]:
# One row per review, holding its lemmatized text.
# NOTE: the values are always strings produced by " ".join(...), so dropna()
# only guards against missing values; it does not remove empty strings.
df = pd.DataFrame({'lemmatize_text': lemmatize_text}).dropna()
df

Unnamed: 0,lemmatize_text
0,samsung awhile absolute review detect stupid thing finally nokia phone buy garage wonder cheap take forever scroll endlessly usually phone number category simply press silent vibrate class ring immediately silence thing remember silent learn mission break nail process damage reason phone start give problem succeed open button big vibration strong reception shabby elevator remarkable consider phone service simply put pocket compare samsung phone work tone phone actually charge quickly great battery doesn potatoe phone convos bright large screen customize scroll purple orange overall phone serve purpose definitely pale comparison phone come sprint great
1,software issue nokia sprint phone messaging capability sprint system software patch come month spend sprint award win customer service admit problem nokia design phone incoming message retrieve quickly view offline provider sprint like people server connected burn minute check inbox compose reply sprint server respond innovation money make fine
2,great reliable phone purchase phone samsung easily comprehendable speed dialing available number voice dialing feature take long speed dialing thing bother game nokia take snake phone skydive bowl tennis ringer feature available choose different ringer person call ringtone available online download phone pretty stuck vibrate ringtone regular polyphonic tone cover reasonable price range
3,phone expect price receive phone little month receive accessory suppose phone company tell couple week receive shortly phone talk make phone call
4,phone great purpose offer buy not picture jaket super store employee mean barely close snap case dirty clear casing look dirty charge


In [45]:
from gensim.models.phrases import Phrases, Phraser

# Tokenise each lemmatized review on whitespace: one list of words per review.
sent = [row.split() for row in df['lemmatize_text']]
# Collect unigram/bigram counts over the tokenised reviews.
phrases = Phrases(sent, min_count=2, threshold=1, progress_per=1)
# Wrap the trained Phrases model in a Phraser for applying it to the corpus.
bigram = Phraser(phrases)
# Corpus with detected bigrams applied. The word-frequency cell below iterates
# over `sentences`, which was previously not defined by any cell.
sentences = bigram[sent]

INFO 16:00:01: collecting all words and their counts
INFO 16:00:01: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO 16:00:01: PROGRESS: at sentence #1, processed 93 words and 171 word types
INFO 16:00:01: PROGRESS: at sentence #2, processed 142 words and 253 word types
INFO 16:00:01: PROGRESS: at sentence #3, processed 195 words and 336 word types
INFO 16:00:01: PROGRESS: at sentence #4, processed 217 words and 368 word types
INFO 16:00:01: collected 405 word types from a corpus of 239 words (unigram + bigrams) and 5 sentences
INFO 16:00:01: using 405 counts as vocab in Phrases<0 vocab, min_count=2, threshold=1, max_vocab_size=40000000>
INFO 16:00:01: source_vocab length 405
INFO 16:00:01: Phraser built with 0 phrasegrams


In [50]:
import multiprocessing
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()  # number of CPUs reported for this machine; used below as `cores-1` Word2Vec workers

4

In [71]:
from collections import defaultdict

# Count how often each token occurs across the whole corpus.
word_frequency = defaultdict(int)
# The loop variables are deliberately not called `sent`: that name holds the
# tokenised corpus built in the Phrases cell and must not be overwritten.
for sentence in sentences:
    for word in sentence:
        word_frequency[word] += 1

# The ten most frequent tokens, most common first.
print(sorted(word_frequency, key=word_frequency.get, reverse=True)[:10])

['phone', 'sprint', 'nokia', 'great', 'samsung', 'thing', 'take', 'dialing', 'available', 'receive']


In [73]:
# Word2Vec hyper-parameters used below:
#   min_count - words with a total absolute frequency lower than this are ignored
#   window    - maximum distance between the current and predicted word within a sentence
#   size      - dimensionality of the feature vectors
#   sample    - threshold for configuring which higher-frequency words are randomly downsampled
#   alpha     - initial learning rate - (0.01, 0.05)
#   min_alpha - learning rate will linearly drop to min_alpha as training progresses
#   negative  - negative sampling rate. If set to 0, no negative sampling is used.
#   workers   - number of worker threads to train the model
#
# `cores - 1` leaves one CPU free for the rest of the system; max(1, ...) keeps
# at least one worker on a single-CPU machine, where `cores - 1` would be 0.
wordvec_model = Word2Vec(
    min_count=20,
    window=2,
    size=300,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=max(1, cores - 1),
)