# Initial Data ETL and EDA

## 2. Data Exploration

In [None]:
import warnings

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)

In [None]:
import json
# import pyspark
# from pyspark.sql import SparkSession

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, norm, skew, kurtosis

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# from common import vocabulary

import html
import re
import string
# import spacy
import pickle

import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from collections import Counter
# from wordcloud import WordCloud
from unicodedata import normalize

import gensim
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, Dense, LSTM


### 2.1 Load data using PySpark

In [None]:
# NOTE(review): `SparkSession` is only imported in a commented-out line of the
# import cell (`# from pyspark.sql import SparkSession`); re-enable that import
# before running this cell.
spark = SparkSession.builder.appName('Test').getOrCreate()

In [None]:
mags_meta = spark.read.json('data/magazine_subscriptions_meta.json')
mags = spark.read.json('data/magazine_subscriptions.json')

In [None]:
books = spark.read.json('data/books.json')
books_meta = spark.read.json('data/books_meta.json')

In [None]:
mags.printSchema()

In [None]:
mags_meta.printSchema()

In [None]:
books_meta.printSchema()

In [None]:
# Schema of the book reviews frame, which is loaded above under the name `books`
books.printSchema()

In [None]:
books_meta.count()

In [None]:
books.count()

In [None]:
mag_categories = mags_meta.groupBy('category')
mag_categories.count().show()

In [None]:
mags_pd = mags.toPandas()
mags_meta_pd = mags_meta.toPandas()

### 2.2 Load data using pandas

In [None]:
# Note - runtime is untenable on the books data in local jupyter notebook

# books_meta = pd.read_json('data/books_meta.json', lines=True)
# books = pd.read_json('data/books.json', lines=True)
meta = pd.read_json('data/magazine_subscriptions_meta.json', lines=True)
reviews = pd.read_json('data/magazine_subscriptions.json', lines=True)

In [None]:
# Drop duplicate records of a unique ASIN
meta.drop_duplicates(subset='asin', inplace=True)

# Drop reviews with no reviewText; copy so that the column assignment below
# writes to an independent frame rather than to a slice of the original
reviews = reviews[reviews['reviewText'].notna()].copy()

# Convert vote column to numeric: strip thousands separators when the column
# holds text, then treat missing values as 0 votes
votes = reviews['vote']
if not pd.api.types.is_numeric_dtype(votes):
    votes = votes.str.replace(',', '', regex=False)
reviews['vote'] = pd.to_numeric(votes).fillna(0).astype(int)

In [None]:
print("\033[1m" + "Dataframe Shape" + "\033[0m")
print(reviews.shape)
print("\n")

print("\033[1m" + "Column Information" + "\033[0m")
reviews.info()
print("\n")

In [None]:
# Positional lookup: row labels are no longer contiguous once rows without
# reviewText have been dropped, so looking up the label 1 may raise KeyError
reviews['reviewText'].iloc[1]

In [None]:
reviews.iloc[0]

In [None]:
print("\033[1m" + "Dataframe Shape" + "\033[0m")
print(meta.shape)
print("\n")

print("\033[1m" + "Column Information" + "\033[0m")
meta.info()
print("\n")

In [None]:
meta.iloc[0].category

### 2.3 Meta data transformation

In [None]:
# Split the per-product category list into one column per hierarchy level
categories = ['category', 'subcat1', 'subcat2', 'subcat3', 'subcat4']
# Reuse meta's own index: the assignment aligns on row labels, and those are no
# longer 0..n-1 after duplicate ASINs were dropped, so a default RangeIndex
# would attach category levels to the wrong products
meta[categories] = pd.DataFrame(meta['category'].to_list(), index=meta.index)
# Strip the residue of HTML-escaped ampersands ('&amp;' -> '&') across the frame
meta.replace('amp;', '', regex=True, inplace=True)

In [None]:
meta['category'].value_counts(dropna=False)

In [None]:
subcat = 'subcat1'
meta[subcat].value_counts(dropna=False)

In [None]:
meta['brand'].value_counts(dropna=False)

In [None]:
def count_if(group):
    """Return how many entries of ``group`` compare equal to ``True``."""
    is_true = group == True  # noqa: E712 - element-wise comparison is intended
    return np.sum(is_true)

# Use groupby and agg to summarize relevant statistics from review data by each ASIN,
# then rename the aggregated columns and turn the ASIN index back into a column
asin_stats = (
    reviews.groupby('asin')
    .agg({'asin': 'count',          # number of reviews
          'overall': 'mean',        # mean of the `overall` column
          'verified': count_if})    # number of reviews with verified == True
    .rename(columns={'asin': 'reviews',
                     'overall': 'avgRating',
                     'verified': 'reviewsVerified'})
    .reset_index()
)

# Merge ASIN statistics df with original metadata; the left join keeps every
# row of `meta`, including ASINs without any review statistics
asins = pd.merge(meta, asin_stats, on='asin', how='left')

# Drop irrelevant columns; .copy() makes `asins` an independent frame so that
# adding pctVerified below is not an assignment on a slice of the merge result
asins = asins[['asin', 'subcat1', 'brand', 'reviews', 'reviewsVerified', 'avgRating']].copy()
asins['pctVerified'] = asins['reviewsVerified'] / asins['reviews']
asins.head()

In [None]:
print("\033[1m" + "Column Information" + "\033[0m")
print(asins.describe())

### 2.4 Meta data visualization

In [None]:
sns.lmplot(x='reviews', y='reviewsVerified', data=asins, hue='subcat1', fit_reg=False)\
   .set(title='Reviews vs. Verified Reviews by SubCategory')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,12))
sns.boxplot(x='avgRating', y='subcat1', orient='h', data=asins)
# sns.swarmplot(x='avgRating', y='subcat1', orient='h', data=asins, color='.1')
plt.show()

### 2.5 Filter and random sampling

In [None]:
# Filter ASINs to reviews >= 10
asin_samples = asins[asins['reviews'] >= 10]

# Filter ASINs to subcat1 >= 10
# Need to consider whether I want to do this or not
# cat_counts = asin_samples['subcat1'].value_counts(dropna=False)
# asin_samples = asin_samples.loc[asin_samples['subcat1'].isin(cat_counts[cat_counts >= 30].index), :]

asin_samples.shape

In [None]:
sns.lmplot(x='reviews', y='reviewsVerified', data=asin_samples, hue='subcat1', fit_reg=True)\
   .set(title='Reviews vs. Verified Reviews by SubCategory')

plt.show()

In [None]:
# Filter reviews based on asin_samples
review_samples = pd.merge(reviews,
                          asin_samples,
                          on='asin',
                          how='inner'
                         )

In [None]:
# Random sample 5 reviews from each ASIN
# This is likely over-sampling - DON'T USE 
# review_samples = review_samples.groupby('asin').sample(n=5, random_state=1)

In [None]:
print(reviews.shape)
print(review_samples.shape)

In [None]:
# Remove outliers greater than 3 std-dev above mean votes for usefulness
# This makes visualization more interpretable, but likely not a good idea for modeling
# Because we want the outliers to be considered the "most useful"

# outlier_threshold = review_samples.describe()['vote']['mean'] +\
#                     (review_samples.describe()['vote']['std'] * 3)

# review_samples = review_samples[review_samples['vote'] <= outlier_threshold]

In [None]:
# Keep only reviews that received at least `min_votes` votes

min_votes = 10
# .copy() so that columns added to review_samples later are written to an
# independent frame rather than to a filtered slice
review_samples = review_samples[review_samples['vote'] >= min_votes].copy()

In [None]:
print(reviews.shape)
print(review_samples.shape)

In [None]:
# Per-review vote count against the product's average rating, colored by subcategory
sns.lmplot(x='avgRating', y='vote', data=review_samples, hue='subcat1', fit_reg=True)\
   .set(title='Average Rating vs. Votes by SubCategory')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,12))
sns.boxplot(x='vote', y='subcat1', hue='verified', orient='h', data=review_samples)
# sns.swarmplot(x='vote', y='subcat1', hue='verified', orient='h', data=review_samples)
plt.show()

In [None]:
# NOTE(review): an earlier cell keeps only rows with vote >= min_votes
# (min_votes = 10); if that filter has been applied, the `vote < 10` subset
# below is empty and this histogram has nothing to draw.
sns.histplot(x='vote',
             hue='verified',
             data=review_samples[review_samples['vote'] < 10])
plt.show()

In [None]:
sns.histplot(x='vote',
             hue='verified',
             data=review_samples[(review_samples['vote'] >= 10) & (review_samples['vote'] < 100)])
plt.show()

In [None]:
sns.histplot(x='vote',
             hue='verified',
             data=review_samples[review_samples['vote'] >= 100])
plt.show()

In [None]:
# Explore skew and kurtosis functions in pandas and scipy
pd_skew    = review_samples['vote'].skew()
pd_kurt    = review_samples['vote'].kurt()
sci_skew   = skew(review_samples['vote'], bias=False)
sci_kurt   = kurtosis(review_samples['vote'], bias=False)
sci_skew_b = skew(review_samples['vote'], bias=True)
sci_kurt_b = kurtosis(review_samples['vote'], bias=True)

pd.DataFrame({'skew': [pd_skew, sci_skew, sci_skew_b],
              'kurtosis': [pd_kurt, sci_kurt, sci_kurt_b]},
             index=['pandas', 'scipy-unbiased', 'scipy-biased']
            )

In [None]:
# Normalize vote variable
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer

# Take the natural log of votes (Box-Cox with lambda fixed at 0)
review_samples['vote_norm'] = boxcox(review_samples['vote'], lmbda=0)

# Apply power transformation; the transformer takes a 2-D input and returns a
# single-column 2-D array, which is flattened to fit one DataFrame column
pt = PowerTransformer()
review_samples['vote_power'] = pt.fit_transform(review_samples[['vote']]).ravel()

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(20, 10))

# One 20-bin histogram per vote variant, filling the grid row by row
# (the fourth panel is left empty)
for column, axis in zip(['vote', 'vote_norm', 'vote_power'], ax.flat):
    sns.histplot(x=column, data=review_samples, bins=20, ax=axis)

plt.show()

In [None]:
# Define the usefulness classification based on vote_power variable:
# 1 when vote_power is above zero, otherwise 0

review_samples['useful'] = (review_samples['vote_power'] > 0).astype(int)
review_samples['useful'].value_counts()

## 3. Text pre-processing

In [None]:
review_samples = pd.read_csv('data/review_samples.csv')

In [None]:
# Use html library to decode special characters
has_entity = review_samples['reviewText'].str.contains('&#', na=False)
html_reviews = review_samples[has_entity]

# Doesn't appear to be an issue in this data; when matches do exist, show an
# example of the unescape function
if not html_reviews.empty:
    sample_review = html_reviews['reviewText'].iloc[0]
    decoded_review = html.unescape(sample_review)
    print(sample_review)
    print("")
    print(decoded_review)

In [None]:
# Remove special characters: delete decimal numeric HTML character references
# such as "&#34;" from the review text (they are dropped, not decoded)
pattern = r"\&\#[0-9]+\;"

review_samples["reviewText"] = review_samples["reviewText"].str.replace(pat=pattern, repl="", regex=True)

### 3.1 Lemmatization

In [None]:
# Use lemmatization to reduce words to their root form

# import nltk resources
# Each resource is mapped to the nltk_data category it is stored under, so the
# lookup can actually find an installed copy; only "punkt" lives under
# "tokenizers", the others were never found and re-requested on every run.
resources = {
    "wordnet": "corpora",
    "stopwords": "corpora",
    "punkt": "tokenizers",
    "averaged_perceptron_tagger": "taggers",
    "maxent_treebank_pos_tagger": "taggers",
}

for resource, category in resources.items():
    try:
        nltk.data.find(category + "/" + resource)
    except LookupError:
        # Not installed locally: fetch it once
        nltk.download(resource, quiet=True)

In [None]:
%%time
# create Lemmatizer object
lemma = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """Lemmatize each (word, POS tag) pair based on the tag's first letter.

    Tags starting with J, V, N or R are lemmatized as adjective, verb, noun or
    adverb respectively; words with any other tag are passed through unchanged.
    Returns the resulting words as a list, in input order.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    roots = []
    for token in tagged_token:
        word, initial = token[0], token[1][0]
        pos = pos_by_initial.get(initial)
        roots.append(word if pos is None else lemma.lemmatize(word, pos))
    return roots

def lemmatize_doc(document):
    """Tag the words of a document and return it with every word lemmatized.

    The document is split into sentences; in each sentence the characters
    ` ' " , . ! ? ( ) are replaced by spaces before the words are tokenized,
    POS-tagged and lemmatized. The lemmatized words of all sentences are
    joined with single spaces.

    Non-string input (e.g. the NaN pandas uses for a missing review) and blank
    strings yield an empty string instead of raising inside the tokenizer.
    """
    if not isinstance(document, str) or not document.strip():
        return ""
    lemmatized_list = []
    for sentence in sent_tokenize(document):
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tagged_token = pos_tag(word_tokenize(no_punctuation))
        lemmatized_list.extend(lemmatize_word(tagged_token))
    return " ".join(lemmatized_list)

# apply functions to review text
review_samples["reviewProcessed"] = review_samples["reviewText"].apply(lemmatize_doc)

# Example review: raw text followed by its lemmatized form
for column in ("reviewText", "reviewProcessed"):
    print(review_samples[column].iloc[1])
    print("")

### 3.2 General normalization

In [None]:
# Remove accents: decompose characters (NFKD), then drop everything non-ASCII

remove_accent = lambda text: normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")
review_samples["reviewProcessed"] = review_samples["reviewProcessed"].apply(remove_accent)

# Remove punctuation (anything that is neither a word character nor whitespace)
pattern = r"[^\w\s]"
review_samples["reviewProcessed"] = review_samples["reviewProcessed"].str.replace(pat=pattern, repl=" ", regex=True)

# Convert to lowercase
review_samples["reviewProcessed"] = review_samples["reviewProcessed"].str.lower()

# Remove stopwords
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
# Apostrophes were already stripped from the text, so strip them here as well
stop_words = [word.replace("\'", "") for word in stop_words]
# print(f"sample stop words: {stop_words[:15]} \n")

# A set keeps the per-token membership test constant-time; `stop_words` itself
# stays a list
stop_word_set = set(stop_words)
remove_stop_words = lambda row: " ".join([token for token in row.split(" ") if token not in stop_word_set])
review_samples["reviewProcessed"] = review_samples["reviewProcessed"].apply(remove_stop_words)

# Remove extra spaces (collapse each whitespace run into a single space)
pattern = r"[\s]+"
review_samples["reviewProcessed"] = review_samples["reviewProcessed"].str.replace(pat=pattern, repl=" ", regex=True)

# Example review
print(review_samples["reviewText"].iloc[1])
print("")
print(review_samples["reviewProcessed"].iloc[1])

### 3.3 Tokenization and vocabulary

In [None]:
# Create corpora from the pre-processed text (lemmatized, lowercased, without
# punctuation or stop words) rather than the raw review text, so the
# vocabulary and phrase models see the cleaned tokens
corpora = review_samples['reviewProcessed'].values
tokenized = [corpus.split(" ") for corpus in corpora]

In [None]:
# Setup phrase modeling
bi_gram = Phrases(tokenized, min_count=300, threshold=50)

tri_gram = Phrases(bi_gram[tokenized], min_count=50, threshold=50)

In [None]:
# Unigrams: every distinct non-empty token across all tokenized texts
uni_gram_tokens = {token for text in tokenized for token in text if token != ""}

print(list(uni_gram_tokens)[:200])

In [None]:
# Bigrams: vocabulary entries of the bigram model counted at least min_count times
bigram_min = bi_gram.min_count
bi_condition = lambda x: x[1] >= bigram_min

bi_gram_tokens = {entry[0] for entry in bi_gram.vocab.items() if bi_condition(entry)}

# Entries that are not already plain unigram tokens
bi_grams_only = bi_gram_tokens - uni_gram_tokens
print(list(bi_grams_only))

In [None]:
# Trigrams: vocabulary entries of the trigram model counted at least min_count times
trigram_min = tri_gram.min_count

tri_condition = lambda x: x[1] >= trigram_min

tri_gram_tokens = {entry[0] for entry in tri_gram.vocab.items() if tri_condition(entry)}

# Entries that were not already present in the bigram vocabulary
tri_grams_only = tri_gram_tokens - bi_gram_tokens
print(list(tri_grams_only)[:50])

### 3.4 Output sampled and processed data to CSV

In [None]:
# Write without the row index: this file is read back with a plain
# pd.read_csv, which would otherwise pick the index up as an extra unnamed column
review_samples.to_csv('data/review_samples.csv', index=False)

## 4. New data source

Data description: https://s3.amazonaws.com/amazon-reviews-pds/readme.html  
Data dictionary and download: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt

__Note:__ The potential advantage of this data is that usefulness can be calculated as a proportion rather than an absolute count. Whereas the original data only provides the number of votes for usefulness, this data provides both the number of helpful votes and the total number of votes each review received.

In [None]:
sample = pd.read_csv("data/sample_us.tsv", sep='\t')
sample.info()

__DATA COLUMNS:__  
__marketplace__       - 2 letter country code of the marketplace where the review was written.  
__customer_id__       - Random identifier that can be used to aggregate reviews written by a single author.  
__review_id__         - The unique ID of the review.  
__product_id__        - The unique Product ID the review pertains to. In the multilingual dataset the reviews for the same product in different countries can be grouped by the same product_id.  
__product_parent__    - Random identifier that can be used to aggregate reviews for the same product.  
__product_title__     - Title of the product.  
__product_category__  - Broad product category that can be used to group reviews (also used to group the dataset into coherent parts).  
__star_rating__       - The 1-5 star rating of the review.  
__helpful_votes__     - Number of helpful votes.  
__total_votes__       - Number of total votes the review received.  
__vine__              - Review was written as part of the Vine program.  
__verified_purchase__ - The review is on a verified purchase.  
__review_headline__   - The title of the review.  
__review_body__       - The review text.  
__review_date__       - The date the review was written.  

In [None]:
# Share of a review's votes that were helpful; missing ratios are set to 0
helpful_share = sample['helpful_votes'] / sample['total_votes']
sample['useful'] = helpful_share.fillna(0)

In [None]:
sns.lmplot(x='total_votes', y='useful', data=sample, hue='product_category', fit_reg=False)\
   .set(title='')

plt.show()