## Amazon Review Data : Data Exploration

### Setup spark

In [5]:
# Import libraries
import os, pickle, glob
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession, SQLContext, DataFrame
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.ml.feature import Word2Vec, Tokenizer, StringIndexer, OneHotEncoder, PCA, VectorAssembler

Matplotlib created a temporary cache directory at /tmp/matplotlib-2vvhgvya because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [6]:
sc = SparkSession.builder \
    .config("spark.driver.memory", "64g") \
	.config("spark.executor.memory", "32g") \
    .config('spark.executor.instances', 5) \
	.appName("Amazon Reviews") \
	.getOrCreate()

In [7]:
sqlContext = SQLContext(sc)



## Read Data

### Get files

In [8]:
#######################################################################################
###################### change path to: "../clin6/amazon_data" #########################
#######################################################################################
path = "../clin6/amazon_data"

In [9]:
def read(path):
    """
    Method that loads data file as df
    Takes in 1 parameter: path
    """
    return sc.read.csv(path, sep = "\t", header = True, inferSchema = True)

def get_path(file):
    """
    Method to create path
    Takes in 1 parameter: file name
    """
    return "amazon_data/%s" % file

In [10]:
dir = os.listdir(path)
files = [f for f in dir if os.path.isfile(os.path.join(path, f))]

def get_df(files):
    """
    Method that combines files into 1 big df
    Takes in 1 parameter: list of file names
    """
    df = None
    n = len(files)
    for i in range(n):
        data = read(get_path(files[i]))
        if df is None:
            df = data
        else:
            df = df.union(data)
    return df

# Data Preprocessing
* Finish major preprocessing, this includes scaling and/or transforming your data, imputing your data, encoding your data, feature expansion, Feature expansion (example is taking features and generating new features by transforming via polynomial, log multiplication of features).

In [11]:
product_category_column = 'product_category'
review_body_column = 'review_body'
review_date_column = 'review_date'
title_column = 'product_title'
category_column = 'product_category'
product_parent_column = 'product_parent'
review_body_column = 'review_body'
verified_purchase_column = 'verified_purchase'

product_category_col = F.col(product_category_column)
review_body_col = F.col(review_body_column)
review_date_col = F.col(review_date_column)
title_col = F.col(title_column)
category_col = F.col(category_column)
product_parent_col = F.col(product_parent_column)
review_body_col = F.col(review_body_column)
verified_purchase_col = F.col(verified_purchase_column)

## Load Data & Take care of missing categories

In [12]:
def get_imputed_df(files, category = True):
    """
    Method that combines files into 1 big df
    Takes in 1 parameter: list of file names
    """
    df = None
    n = len(files)
    categories = {}
    for i in range(n):
        data = read(get_path(files[i]))
        
        # Fill in null categories
        if category:
            cat = files[i][18:-10]
            categories[cat] = i
            data = data.withColumn(product_category_column,
                                   product_category_col).fillna(cat)
        
        if df is None:
            df = data
        else:
            df = df.union(data)
    return df, categories

## Get df & Remove Columns

In [13]:
df, categories = get_imputed_df(files)
df = df.drop('marketplace', 'vine').cache()

In [14]:
sqlContext.registerDataFrameAsTable(df, "df")

In [15]:
columns = df.columns
num_cols = len(columns)

## Filter out rows with missing body and date and verified purchase

In [22]:
df = df.filter(review_body_col.isNotNull() & review_date_col.isNotNull())
df = df.filter(verified_purchase_col == True)

## Filter out old data

In [28]:
df = df.filter(F.year(review_date_col) >= 2005)

## Check other missing values

In [31]:
for i in range(num_cols):
    missing = df.filter(df[columns[i]].isNull()).count()
    print("'%s' column has %d missing values" % (columns[i], missing))

'customer_id' column has 0 missing values
'review_id' column has 0 missing values
'product_id' column has 0 missing values
'product_parent' column has 0 missing values
'product_title' column has 0 missing values
'product_category' column has 0 missing values
'star_rating' column has 0 missing values
'helpful_votes' column has 0 missing values
'total_votes' column has 0 missing values
'verified_purchase' column has 0 missing values
'review_headline' column has 0 missing values
'review_body' column has 0 missing values
'review_date' column has 0 missing values


* No more missing values

## Extract month and year

In [32]:
month_column = 'month'
year_column = 'year'

In [34]:
df = df.withColumn(month_column, F.month(review_date_col)).withColumn(year_column, F.year(review_date_col))

## Encode Categorical Columns

### Match category to numbers

In [36]:
category_num_col = 'product_category_num'

In [37]:
def translate(dic): 
    return F.udf(lambda x: dic.get(x), StringType()) 

In [38]:
df = df.withColumn(category_num_col, translate(categories)(category_col))

### Change title into vectors

In [None]:
titleArray_column = 'titleArray'
titleVector_column = 'titleVector'

In [None]:
df = df.withColumn(titleArray_column, F.split(F.lower(F.col(title_column)), ' '))

In [None]:
word2vec = Word2Vec(inputCol = titleArray_column, outputCol = titleVector_column,
                    minCount = 100, vectorSize = 16, numPartitions = 4)
model = word2vec.fit(df)
df = model.transform(df)

### Change text into vectors

In [None]:
reviewArray_column = 'reviewArray'
reviewVector_column = 'reviewVector'

In [None]:
df = df.withColumn(reviewArray_column, F.split(F.lower(F.col(review_body_column)), ' '))

In [None]:
word2vec = Word2Vec(inputCol = reviewArray_column, outputCol = reviewVector_column,
                    minCount = 100, vectorSize = 16, numPartitions = 4)
model = word2vec.fit(df)
df = model.transform(df)

## Dataset Splitting
* Use last year as test and the rest as train

In [40]:
train = df.filter(F.year(review_date_col) < 2015)
test = df.filter(F.year(review_date_col) >= 2015)

In [41]:
print(train.count())
print(test.count())

58754087
27498612


## Count Product Reviews Per Day
* Group by unique product identifier and day to get reviews per day for each product

In [None]:
grouped_df = df.groupby(review_date_col, product_parent_col).count()

In [17]:
import os

def get_folder_size(folder_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            total_size += os.path.getsize(file_path)
    return total_size

folder_path = "../clin6/data"
folder_size_bytes = get_folder_size(folder_path)
folder_size_kb = folder_size_bytes / 1024
folder_size_mb = folder_size_kb / 1024
folder_size_gb = folder_size_mb / 1024

print("Folder size:", folder_size_bytes, "bytes")
print("Folder size:", folder_size_kb, "KB")
print("Folder size:", folder_size_mb, "MB")
print("Folder size:", folder_size_gb, "GB")

Folder size: 13204063462 bytes
Folder size: 12894593.224609375 KB
Folder size: 12592.376195907593 MB
Folder size: 12.297242378816009 GB


# Data modeling
* Train your first model

# Data Evaluation
* Evaluate your model and compare training vs. test error

# Answer the questions
* Where does your model fit in the fitting graph? and What are the next models you are thinking of and why?

# Conclusion section
* What is the conclusion of your 1st model? What can be done to possibly improve it?