In [25]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import chunk
from nltk.tag.perceptron import PerceptronTagger
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
import statsmodels.api as sm
import scipy.sparse as sp
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
import gensim
import re
import string
from toolz import frequencies
from multiprocessing import Pool
wordnet_lemmatizer = WordNetLemmatizer()
from amazon_utils.keywords_crawler import query_page
from amazon_utils.crawler_config import CrawlConfig

In [3]:
import pickle
import numpy as np
import time

def data_load(filepath):
    '''
    Load a pickled object from disk and print how long the load took.

    INPUT
    filepath - path of the pickle file to read

    OUTPUT
    dataload - the object stored in the pickle file
    '''
    # perf_counter is monotonic, so the reported duration cannot go negative
    # or jump when the system clock is adjusted during the load.
    start_time = time.perf_counter()
    with open(filepath, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only use this on files that come from a trusted source.
        dataload = pickle.load(f)
    elapsed = time.perf_counter() - start_time
    print('loaddata use time：' + str(elapsed) + 's')
    return dataload

In [4]:
# Short English sample sentence bound to `text`
# (presumably input for trying out the helpers in this notebook - confirm).
text = "This is a sample English sentence"

In [5]:
def read_data(path):
    '''
    Read a UTF-8 text file into a list of lines.

    INPUT
    path - path of the text file to read

    OUTPUT
    a list with one string per line; each line keeps its trailing newline
    '''
    with open(path, 'r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines as readlines().
        return list(handle)

In [6]:
def sent_tokenizer(text):
    '''
    Split a string into sentences by delegating to sent_tokenize.

    INPUT
    text - a string

    OUTPUT
    sentences - the list of sentences produced by sent_tokenize
    '''
    sentences = sent_tokenize(text)
    return sentences

In [7]:
def word_tokenizer(text):
    '''
    Split a string into word tokens by delegating to word_tokenize.

    INPUT
    text - a string

    OUTPUT
    tokens - the list of tokens produced by word_tokenize
    '''
    tokens = word_tokenize(text)
    return tokens

In [8]:
def remove_numbers(text):
    '''
    Delete every run of digits from a string.

    INPUT
    text - a string

    OUTPUT
    the same string with all digit runs removed (other characters,
    including the whitespace around the digits, are left as they are)
    '''
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [9]:
def remove_punctuations(text):
    '''
    Drop punctuation tokens from a string.

    The text is split with word_tokenizer, every token that consists only of
    characters from string.punctuation is discarded, and the remaining tokens
    are joined with single spaces. Punctuation inside a longer token
    (e.g. an apostrophe within a word) is left in place.

    INPUT
    text - a string

    OUTPUT
    a string without punctuation tokens
    '''
    punctuation_chars = set(string.punctuation)
    # Check each character instead of `token in string.punctuation`: on a str
    # that is a substring test, so multi-character tokens such as '...' or
    # '--' were never recognised as punctuation and stayed in the output.
    kept = [
        token for token in word_tokenizer(text)
        if not all(ch in punctuation_chars for ch in token)
    ]
    return ' '.join(kept)

In [10]:
def remove_stopwords(text, lang='english'):
    '''
    Drop stopwords from a string.

    INPUT
    text - a string
    lang - name of the stopword list requested from stopwords.words
           (default 'english')

    OUTPUT
    a string without stopwords; tokens are compared case-insensitively
    (via lower()) and the survivors are joined with single spaces
    '''
    words = word_tokenizer(text)
    # Build the lookup once as a set: testing every token against the list
    # returned by stopwords.words() is a linear scan per token.
    lang_stopwords = set(stopwords.words(lang))
    stopwords_removed = [w for w in words if w.lower() not in lang_stopwords]
    return ' '.join(stopwords_removed)

In [11]:
def remove_extra_whitespace(text):
    '''
    Collapse whitespace runs and trim both ends of a string.

    INPUT
    text - a string

    OUTPUT
    the string with every whitespace run replaced by one space and no
    leading or trailing whitespace
    '''
    pieces = text.split()
    single_space = ' '
    return single_space.join(pieces)

In [12]:
def pos_tagger(text, tagger='M'):
    '''
    Tag every token of a string with its part of speech.

    INPUT
    text - a string
    tagger - 'M' (default) tags with nltk.pos_tag,
             'P' tags with a freshly built PerceptronTagger

    OUTPUT
    a list of tuples, each tuple holding a word and its pos tag

    RAISES
    ValueError - if tagger is neither 'M' nor 'P'
    '''
    if tagger == 'M':
        # 'M' was labelled "Maximum Entropy" by the original author; it simply
        # delegates to NLTK's default tagger behind nltk.pos_tag.
        return nltk.pos_tag(word_tokenizer(text))
    if tagger == 'P':
        # 'P' selects the averaged perceptron tagger explicitly.
        return PerceptronTagger().tag(word_tokenizer(text))
    # An unknown code used to fall off the end and return None, which only
    # surfaced later as a confusing error when the result was indexed.
    raise ValueError("unsupported tagger %r; expected 'M' or 'P'" % (tagger,))
    

In [13]:
def check_pos_means(pos_name):
    '''
    Print the Penn Treebank tagset documentation for a POS tag.

    INPUT
    pos_name - tag name (or pattern) handed to nltk.help.upenn_tagset

    OUTPUT
    None - the description is written to stdout by upenn_tagset itself
    '''
    # upenn_tagset prints its report and returns None, so wrapping the call in
    # print() only appended a stray "None" line after the description.
    nltk.help.upenn_tagset(pos_name)

In [14]:
def words_stemmer(words, type="PS", lang="english", encoding="utf8"):
    '''
    Stem a list of words with one of three NLTK stemmers.

    INPUT
    words - a list of words
    type - "PS" for PorterStemmer, "LS" for LancasterStemmer,
           "SS" for SnowballStemmer; False or any other value returns
           words unchanged
    lang - language handed to SnowballStemmer (only used when type is "SS")
    encoding - codec used to encode every stem

    OUTPUT
    stem_words - a list of stems, each one encoded to bytes with encoding
    '''
    if type is False or type not in ("PS", "LS", "SS"):
        return words

    # Build only the stemmer that was asked for.
    if type == "PS":
        stemmer = PorterStemmer()
    elif type == "LS":
        stemmer = LancasterStemmer()
    else:
        stemmer = SnowballStemmer(lang)

    return [stemmer.stem(word).encode(encoding) for word in words]

In [15]:
def words_lemmatizer(text, encoding="utf8"):
    '''
    Lemmatize every token of a string with WordNetLemmatizer.

    Each token is looked up with the part of speech that find_pos reports
    for it.

    INPUT
    text - a string
    encoding - codec used to encode every lemma

    OUTPUT
    lemma_words - a list of lemmas, each one encoded to bytes with encoding
    '''
    tokens = word_tokenizer(text)
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, find_pos(token)).encode(encoding)
        for token in tokens
    ]


def find_pos(word):
    '''
    Map the POS tag of a word to a WordNet part-of-speech constant.

    INPUT
    word - a string; only the tag of its first token is used

    OUTPUT
    'a' for adjective tags (JJ, JJR, JJS),
    'r' for tags starting with R (RB, RBR, RBS),
    'v' for verb tags (VB, VBD, VBG, VBN, VBP, VBZ),
    'n' for everything else, including input that yields no tagged token
    '''
    tagged = pos_tagger(word)
    # If tagging produced no token (e.g. for an empty string) or an empty tag,
    # fall back to noun instead of failing with IndexError.
    if not tagged or not tagged[0][1]:
        return 'n'
    # Only the first letter of the tag decides the WordNet class.
    initial = tagged[0][1].lower()[0]
    return {'j': 'a', 'r': 'r', 'v': 'v'}.get(initial, 'n')

In [16]:
def find_synonums(word):
    '''
    Collect the lemma names of every WordNet synset of a word.

    INPUT
    word - a word

    OUTPUT
    synonyms - a set with the lemma names found across all synsets of the
               given word (empty when the word has no synsets)
    '''
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

In [17]:
def get_ngrams(text, n):
    '''
    Build the n-gram phrases of a string.

    INPUT
    text - a string
    n - number of consecutive tokens per phrase

    OUTPUT
    a list of phrases, each one made of n consecutive tokens joined by a
    single space
    '''
    tokens = word_tokenizer(text)
    return list(map(' '.join, ngrams(tokens, n)))

In [26]:
# Call query_page once per entry of CrawlConfig.init_sequence, spread over a
# pool of worker processes (Pool() with no argument picks the default worker
# count). Pool.map blocks until every call has finished and returns the
# results in input order; leaving the `with` block shuts the pool down.
# NOTE(review): query_page presumably performs network requests, so this cell
# may be slow and its output may differ between runs - confirm.
with Pool() as p:
    result = p.map(query_page, list(CrawlConfig.init_sequence))

In [24]:
# Display the list collected by Pool.map: one query_page return value per
# entry of CrawlConfig.init_sequence.
result

[['aloe vera gel',
  'alcohol',
  'alcohol wipes',
  'airpod case',
  'antibacterial hand soap',
  'airpods',
  'angel soft toilet paper',
  'animal crossing new horizons',
  'aa batteries',
  'aloe vera gel 100 percent pure'],
 ['baby wipes',
  'bidet',
  'board games',
  'bleach',
  'bidet toilet seat',
  'brita filter',
  'brita pitcher',
  'bounty paper towel',
  'bluetooth earbuds',
  'beef jerky'],
 ['clorox wipes',
  'charmin toilet paper',
  'clorox disinfecting wipes',
  'clorox',
  'cottonelle toilet paper',
  'cat litter',
  'charmin',
  'canned food',
  'car accessories',
  'computer desk'],
 ['disinfectant wipes',
  'disinfectant spray',
  'dish soap',
  'disinfecting wipes',
  'disposable gloves',
  'dust mask',
  'dishwasher pods',
  'dumbell set',
  'dawn dish soap',
  'disinfectant'],
 ['emergency c',
  'elderberry syrup',
  'emergen c',
  'earbuds',
  'electric guitar',
  'ethernet cable',
  'electric scooter',
  'epsom salt',
  'emergency food supply',
  'excedrin mi