In [157]:
import gzip
import os
import multiprocessing as mp
import pickle
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
import math
import numpy as np
import time
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [10]:
data_name = 'Clothing_Shoes_and_Jewelry_5'
data_file = 'data/%s.json.gz' % data_name
batches_folder = 'data/%s' % data_name
num_threads = mp.cpu_count()
batch_size = 8192

In [11]:
all_files = os.listdir(batches_folder)
all_files = [ os.path.join(batches_folder, name) for name in all_files ]
pkl_files = [ name for name in all_files if '.pkl' in name ]
pkl_files.sort()
print(len(pkl_files))

1377


In [90]:
def count_entries(pkl_file):
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
    return len(items)

In [91]:
with mp.Pool(num_threads) as p:
    batch_lens = p.map(count_entries, pkl_files)
dataset_len = sum(batch_lens)
print(dataset_len)

11280384


In [14]:
punctuation = set(string.punctuation)

In [59]:
def count_words(pkl_file):
    word_count_batch = defaultdict(int)
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
        for item in items:
            try:
                review = item['reviewText']
            except KeyError:
                continue
            review = ''.join([c for c in review.lower() if not c in punctuation])
            for w in review.split():
                word_count_batch[w] += 1
    return word_count_batch

In [60]:
with mp.Pool(num_threads) as p:
    word_count_batches = p.map(count_words, pkl_files)

In [61]:
len(word_count_batches)

1377

In [62]:
word_count = defaultdict(int)
for word_count_batch in word_count_batches:
    for w in word_count_batch:
        word_count[w] += word_count_batch[w]

In [67]:
len(word_count)

755779

In [68]:
len(word_count_batch)

10533

In [69]:
counts = [(word_count[w], w) for w in word_count]
counts.sort()
counts.reverse()

In [165]:
def count_df(pkl_file):
    word_count_batch = defaultdict(int)
    with open(pkl_file, 'rb') as file_read:
        items = pickle.load(file_read)
        for item in items:
            try:
                review = item['reviewText']
            except KeyError:
                continue
            review = ''.join([c for c in review.lower() if not c in punctuation])
            for w in set(review.split()):
                word_count_batch[w] += 1
    return word_count_batch

In [166]:
with mp.Pool(num_threads) as p:
    df_batches = p.map(count_words, pkl_files)

In [167]:
df = defaultdict(int)
for df_batch in df_batches:
    for w in df_batch:
        df[w] += df_batch[w]

In [226]:
# only first batch bc way too much data
with open(pkl_files[0], 'rb') as file_read:
    items = pickle.load(file_read)
    rev = items[91]
rev

{'overall': 4.0,
 'verified': True,
 'reviewTime': '11 19, 2014',
 'reviewerID': 'A3F0HOL1IZJMO3',
 'asin': '3979050432',
 'reviewerName': 'michele',
 'reviewText': "I rated this rating because I had been looking for some of these face ski mask last year for my husband, but couldn't find one anywhere. He was very happy when I gave these to him, because he works loading semi trucks and it is very cold. this face ski mask will block the cold from his face and neck area. He started wearing one last week and he likes it and I would recommend these to anyone who works in very cold weather.",
 'summary': 'He was very happy when I gave these to him',
 'unixReviewTime': 1416355200}

In [228]:
tf = defaultdict(int)
r = ''.join([c for c in rev['reviewText'].lower() if not c in punctuation])
for w in r.split():
    # Note = rather than +=, different versions of tf could be used instead
    tf[w] = 1
    
tfidf = dict(zip(words,[tf[w] * math.log2(dataset_len / df[w]) for w in words]))
tfidfQuery = [tf[w] * np.log2(dataset_len / df[w]) for w in words]

In [230]:
maxTf = [(tf[w],w) for w in words]
maxTf.sort(reverse=True)
maxTfIdf = [(tfidf[w],w) for w in words]
maxTfIdf.sort(reverse=True)
maxTfIdf[:20]

[(14.098637916807016, 'trucks'),
 (13.84423007663203, 'loading'),
 (11.788876930144493, 'semi'),
 (11.462971976342546, 'ski'),
 (10.885974186164137, 'block'),
 (10.507890819799911, 'rated'),
 (9.021705984774108, 'mask'),
 (8.837252165174101, 'rating'),
 (8.693767840377888, 'anywhere'),
 (7.428524110335686, 'started'),
 (7.346994951177826, 'neck'),
 (7.21443653240131, 'face'),
 (7.097253801279108, 'anyone'),
 (7.076977903847522, 'weather'),
 (7.0059711839600105, 'likes'),
 (6.963932160023441, 'week'),
 (6.856093489696655, 'couldnt'),
 (6.736987421598345, 'cold'),
 (6.567596082334974, 'gave'),
 (6.560842854286377, 'area')]

In [231]:
def Cosine(x1,x2):
    x1 = np.array(x1)
    x2 = np.array(x2)
    numer = 0
    norm1 = 0
    norm2 = 0
    numer = np.sum(x1*x2)
    norm1 = np.sum(np.square(x1))
    norm2 = np.sum(np.square(x2))
    if norm1*norm2:
        return numer / np.sqrt(norm1*norm2)
    return 0

In [232]:
with open(pkl_files[0], 'rb') as file_read:
    items = pickle.load(file_read)

In [235]:
def get_similarities(rev2):
    if 'reviewText' not in rev2:
        return 0.0, ''
    tf = defaultdict(int)
    r = ''.join([c for c in rev2['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * np.log2(dataset_len / df[w]) for w in words]
    return Cosine(tfidfQuery, tfidf2), rev2['reviewText']

In [236]:
len(items)

8192

In [237]:
start_time = time.time()
with mp.Pool(num_threads) as p:
    similarities = p.map(get_similarities, items)
print("--- %s seconds ---" % (time.time() - start_time))

--- 593.2944765090942 seconds ---


In [238]:
similarities.sort(reverse=True)
similarities[:20]

[(1.0,
  "I rated this rating because I had been looking for some of these face ski mask last year for my husband, but couldn't find one anywhere. He was very happy when I gave these to him, because he works loading semi trucks and it is very cold. this face ski mask will block the cold from his face and neck area. He started wearing one last week and he likes it and I would recommend these to anyone who works in very cold weather."),
 (0.1938419680951804,
  "Love them so much! Expensive but I couldn't find them anywhere else."),
 (0.19031869125109166,
  "The color is just as it shows. I'm just so happy to have these, couldn't for the life of me find them anywhere else"),
 (0.18957987918129757, "Great couldn't have been more happy"),
 (0.18102312304951182, 'Fits great and looks great. Semi Dressy shirt..'),
 (0.18080340220426322,
  'My daughter loves these shoes and would recommend to anyone looking.'),
 (0.17672856492207226,
  "It's a ski mask.  It covers your face, minus your eyes an