## Amazon Review Data: Data Exploration

### Set up Spark

In [1]:
import os, pickle, glob
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the notebook's Spark session. Note that `sc` is a
# SparkSession, not a SparkContext. Memory is set to 64g for the driver and
# 32g per executor, with 2 executor instances requested.
sc = (
    SparkSession.builder
    .appName("Amazon Reviews")
    .config("spark.driver.memory", "64g")
    .config("spark.executor.memory", "32g")
    .config('spark.executor.instances', 2)
    .getOrCreate()
)

In [3]:
# SQLContext takes a SparkContext as its first argument, but `sc` is a
# SparkSession. Pass the session's underlying context and hand the session
# itself over explicitly so both refer to the same Spark application.
sqlContext = SQLContext(sc.sparkContext, sparkSession=sc)



## Read Data

### Get files

In [4]:
# Directory that is listed to discover the review data files.
# NOTE: change path to "../clin6/amazon_data" if it is set to anything else.
path = '../clin6/amazon_data'

In [5]:
def read(path):
    """
    Load a single delimited data file as a df
    Takes in 1 parameter: path
    The file is parsed as tab-separated with a header row, and column
    types are inferred from the data
    """
    csv_options = {"sep": "\t", "header": True, "inferSchema": True}
    return sc.read.csv(path, **csv_options)

def get_path(file, base_dir = "amazon_data"):
    """
    Method to create path
    Takes in 2 parameters: file name, base directory (optional)
    Returns "<base_dir>/<file>"; base_dir defaults to "amazon_data",
    which is the directory this method always used before
    """
    # NOTE(review): the file listing is taken from the module-level `path`,
    # which is a different directory than this default. Confirm both point at
    # the same data, or pass base_dir = path.
    return "%s/%s" % (base_dir, file)

In [6]:
# Names of the regular files directly inside `path` (sub-directories are skipped).
# The listing is sorted because os.listdir returns entries in arbitrary order,
# and code that reads only the first file(s) should pick the same ones on every
# run. The listing is not bound to the name `dir`, so the builtin dir() stays usable.
files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

def get_df(files):
    """
    Read every listed file and stack the results into 1 big df
    Takes in 1 parameter: list of file names
    Returns None when the list is empty
    """
    combined = None
    for name in files:
        frame = read(get_path(name))
        # The first file seeds the result; each later one is appended with union
        combined = frame if combined is None else combined.union(frame)
    return combined

# Data Preprocessing
* Finish major preprocessing. This includes scaling and/or transforming your data, imputing your data, encoding your data, and feature expansion (for example, generating new features from existing ones via polynomial or log transforms, or by multiplying features together).

### Take care of missing categories

In [None]:
# Name of the column that holds each review's product category
product_category_col = "product_category"

In [None]:
def impute_category(df, category = None):
    """
    Method that fills in missing product_category values
    Takes in 2 parameters: df, category (optional fill value)

    Only the product_category column is changed; nulls in all other columns
    are left untouched so they can still be detected later.

    When category is given, it replaces every null product_category.
    When it is omitted, the fill value is derived per row from the name of
    the file the row was read from, using the [18:-10] slice of the file
    name (assumes names shaped like
    "amazon_reviews_us_<Category>_v1_00.tsv" - TODO confirm). Rows for which
    nothing can be derived keep their null.
    """
    if category is not None:
        # subset restricts the fill to this one column; without it fillna
        # would overwrite nulls in every string column of the df
        return df.fillna(category, subset = [product_category_col])

    # Base name of each row's source file (the text after the last "/")
    file_name = F.regexp_extract(F.input_file_name(), r"([^/]+)$", 1)
    # Column.substr is 1-based: Python's [18:-10] starts at position 19 and
    # drops 28 characters in total (18 at the front, 10 at the back)
    derived = file_name.substr(F.lit(19), F.length(file_name) - F.lit(28))
    # An empty result (no source file, or a name that is too short) becomes
    # null so that it is never written into the column
    derived = F.when(F.length(derived) > 0, derived)
    return df.withColumn(product_category_col,
                         F.coalesce(F.col(product_category_col), derived))

In [None]:
def get_imputed_df(files, category = True, max_files = 1):
    """
    Method that combines files into 1 big df
    Takes in 3 parameters:
        files: list of file names
        category: whether to fill in null product categories (default True)
        max_files: how many files from the front of the list to read, as a
                   non-negative int; defaults to 1, which is what this method
                   was hard-coded to read before. Pass None to read every file
    Returns the combined df, or None when no file is read
    """
    df = None
    # Slicing instead of indexing files[0] means an empty list simply
    # yields None rather than raising IndexError
    for file in files[:max_files]:
        data = read(get_path(file))

        # Fill in null categories
        if category:
            data = impute_category(data)

        df = data if df is None else df.union(data)
    return df

### Get df and Removing Repetitive/Unnecessary Information

In [None]:
# Load the imputed data, drop the 'marketplace' and 'vine' columns, and cache
# the result for the cells that follow.
df = (
    get_imputed_df(files)
    .drop('marketplace', 'vine')
    .cache()
)

# Register the df under the table name "df"
sqlContext.registerDataFrameAsTable(df, "df")

In [None]:
# Column names of df, and how many columns there are
columns, num_cols = df.columns, len(df.columns)

### Filter out rows with missing body or date

In [16]:
# Names of the columns that hold the review text and the review date
review_body_col, review_date_col = "review_body", "review_date"

In [18]:
# Keep only the rows where both the review body and the review date are present.
# The result is stored under its own name so later cells can use it; `df`
# itself is left unchanged, which keeps this cell safe to re-run.
df_filtered = df.filter(df[review_body_col].isNotNull() & df[review_date_col].isNotNull())

# Last expression of the cell, so the filtered df is still displayed
df_filtered

+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+-----------------+--------------------+--------------------+-----------+
|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|verified_purchase|     review_headline|         review_body|review_date|
+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+-----------------+--------------------+--------------------+-----------+
|   36075342| RAB23OVFNCXZQ|B00LPRXQ4Y|     339193102|17" 2003-2006 For...|      Automotive|          1|            0|          0|                Y|     As it was used,|As it was used, t...| 2015-08-31|
|   42462164|R3NORADVJO6IE6|B000C7S0TO|     907684644|Spectra Premium C...|      Automotive|          5|            0|          0|                Y|          Five Stars|Put it in fine, n..

# Data modeling
* Train your first model

# Data Evaluation
* Evaluate your model and compare training vs. test error

# Answer the questions
* Where does your model fit in the fitting graph? What are the next models you are thinking of, and why?

# Conclusion section
* What is the conclusion of your 1st model? What can be done to possibly improve it?