# Data Cleaning

# Import Packages

In [1]:
import os
import json 
import gzip
import pandas as pd

# Load Data

In [2]:
### load the meta data

meta = []
reviews = []

# Beauty products metadata
with gzip.open('data/meta_All_Beauty.json.gz') as f:
    for l in f:
        meta.append(json.loads(l.strip()))

# Beauty products reviews
with gzip.open('data/All_Beauty.json.gz') as f:
    for l in f:
        reviews.append(json.loads(l.strip())
        
    
# total length of list, this number equals total number of products
print(len(meta), len(reviews))

# first row of the list
print(meta[0])
print(reviews[0])

SyntaxError: invalid syntax (<ipython-input-2-86a513e50e8a>, line 18)

In [None]:
df_meta = pd.DataFrame.from_dict(meta)
df_reviews = pd.DataFrame.from_dict(reviews)

# Inspect Data

## Metadata

In [None]:
df_meta.head(3)

In [None]:
df_meta.info()

*   title - product name
*   image - link of the source of image
*   brand - the name of the brand of the product
*   rank - information relevant to the rank of the sales
*   main_cat - the categories the product belong to
*   asin - Amazon product id
*   description - brief description of the product
*   also_view
*   also_buy
*   price
*   similar_item
*   details
*   feature
*   tech1
*    date 

### Inspecting item frequency in 'asin' column

In [None]:
print(len(df_meta['asin'].unique().tolist()))
df_meta['asin'].value_counts()

In [None]:
# Let's inspect the 'asin' with ID: B00008WMNI
id = "B0002CD01M"
df_meta.loc[df_meta['asin'] == id]

There are duplicate entries for the same product ID. The `asin` column needs to be compltely unique, so we will want to drop duplicates.

## Reviews

In [None]:
df_reviews.head(3)

In [None]:
df_reviews.info()

*   overall - the rating of the product ranging from 1 to 5

*   verified - if the reviewer is a verified customer in Amazon

*   reviewTime - the time of the review

*   reviewerID - the id of the user who has given the review

*   asin - Amazon product id

*   reviewerName - name of the user who has given the review

*   reviewText - the actual content of the review

*   summary - the title of the review

*   unixReviewTime - the time of the review in Unix format

*   vote - amount of votes regarding the review

*   style - 

*   image - 


### Inspect the Reviewer by reviewerID
This will give us the reviews by a certain reviewer

In [None]:
# Let's inspect the Reviewer by ID: A2GJX2KCUSR0EI
id = "A1KSC91G9AIY2Z"
df_reviews.loc[df_reviews['reviewerID'] == id]

### Show all of the reviewer ID for a specific name

Different reviewerIDs can have the same name. Let's who all the different reviewerDs for the same name.
Even uf they have the same, reviewers are mainly identified by their reviewerID.


In [None]:
# Let's inspect the reviewer with reviewerName: Sarah
name = "Sarah"
reviewer_name = df_reviews.loc[df_reviews['reviewerName'] == name]
print(reviewer_name['reviewerID'].value_counts())
reviewer_name

# Clean data before merge

## Drop duplicates of metadata df 
We need to make sure that the `asin` column is completely unique

In [None]:
df_meta = df_meta.drop_duplicates('asin', keep="first")
df_meta['asin'].value_counts()

## Fill missing values in the review and metadata df

In [None]:
def missing(dataset):
    columns = dataset.columns
    print('MISSING ROWS per COLUMN')
    for column in columns:
        percentage = (dataset[column].isnull().sum() / len(dataset)) * 100
        print('{}: {}, {:0.2f}%'.format(column, dataset[column].isnull().sum(), percentage))

In [None]:
## remove rows with unformatted title (i.e. some 'title' may still contain html style content)
df_meta = df_meta.fillna('NaN')
missing(df_meta)

In [None]:
df_reviews = df_reviews.fillna('NaN')
missing(df_reviews)

## Convert times to datetimes in the review df

In [None]:
# converting to 'reviewName' to datetime format
df_reviews['reviewTime'] = pd.to_datetime(df_reviews['reviewTime'])
df_reviews['unixReviewTime'] = pd.to_datetime(df_reviews['unixReviewTime'],unit='s')

print(type(df_reviews['reviewTime'][0]), type(df_reviews['unixReviewTime'][0]))

# Merge review and metadata by `asin` column

In [None]:
df = pd.merge(df_reviews, df_meta, on='asin', sort='reviewTime')
df.head(3)

In [None]:
df.info()

# Clean data after merge

## Drop and rename columns

In [None]:
df = df.drop(columns=['style', 'image_x', 'image_y', 'feature', 'tech1', 'date', 'unixReviewTime'])

rename_dict = {
    "overall": "rating", 
    "asin": "product_id",
    "reviewTime": "review_time",
    "reviewerID": "reviewer_id",
    "reviewerName": "reviewer_name",
    "reviewText": "reviewer_text"  
}

df = df.rename(columns=rename_dict)

In [None]:
df.head(3)

## Change `rating` to integer data type

In [None]:
df['rating'] = df['rating'].astype(int)
df['rating'].head(3)

# Export data

In [None]:
df.to_csv('./data/final_amazon_beauty.csv', index=False)