In [None]:
%%bash

# Install pinned versions of the analysis libraries used by this notebook.
# NOTE(review): scipy, scikit-learn, boto3 and sagemaker are imported further
# down but are not installed here -- presumably the kernel image provides
# them; confirm.
pip install -q pandas==0.23.0
pip install -q numpy==1.14.3
pip install -q matplotlib==3.0.3
pip install -q seaborn==0.8.1
pip install -q PyAthena==1.8.0

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.sparse import lil_matrix

import boto3
import botocore
import sagemaker

In [None]:
# Region comes from the default boto3 session; it is passed to the Athena
# connections created below.
session = boto3.session.Session()
region_name = session.region_name

# Bucket used below for the Athena staging directory, the sentiment table
# location and the uploaded training data.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

## Data Preparation

### Create New Table with Sentiment
We no longer need a separate table.

This will be used by the model training.

The following query takes a long time (approx. 5 minutes, scanning about 50 GB).
Alternatively, find the equivalent statement in the `sql/` directory and run it in the Athena console.

**[TODO] Show screenshot of Athena console where to put in the query with link to github sql statement**

In [None]:
from pyathena import connect
from pyathena.util import as_pandas

# Athena objects: the source database/table and the derived sentiment table
# (the table name doubles as its S3 key prefix).
database_name, table_name = 'dsoaws', 'amazon_reviews_parquet'
sentiment_table_name = table_prefix = 'amazon_reviews_with_sentiment'

# S3 location handed to PyAthena as its staging directory.
s3_staging_dir = f's3://{bucket}/athena/staging'

In [None]:
# Materialise the reviews together with a binary `sentiment` label
# (1 when star_rating > 3, otherwise 0) as a Parquet table partitioned by
# product_category. Reviews whose body is 20 characters or shorter are dropped.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()

# The target table is qualified with the database name so that it is created
# in the same database the later SELECT reads from; the connection itself is
# opened without a schema and would otherwise use its default one.
create_sentiment_table_sql = """
CREATE TABLE IF NOT EXISTS {database}.{sentiment_table}
WITH (
  format = 'PARQUET',
  external_location = 's3://{bucket}/{prefix}/',
  partitioned_by = ARRAY['product_category']
)
AS
SELECT customer_id,
       review_id,
       product_id,
       product_title,
       review_headline,
       review_body,
       review_date,
       year,
       star_rating,
       CASE
           WHEN star_rating > 3 THEN 1
           ELSE 0
       END AS sentiment,
       product_category
FROM {database}.{source_table}
WHERE LENGTH(review_body) > 20
""".format(database=database_name,
           sentiment_table=sentiment_table_name,
           bucket=bucket,
           prefix=table_prefix,
           source_table=table_name)

try:
    cursor.execute(create_sentiment_table_sql)
except Exception as error:
    # Stay best-effort (the table may already have been created from the
    # Athena console), but report the failure instead of hiding it, and let
    # KeyboardInterrupt through so the long-running query can be cancelled.
    print('Could not create {}.{}; continuing with any existing table. Reason: {}'
          .format(database_name, sentiment_table_name, error))
 

### View Sentiment

The following query can take a long time. Alternatively, find the equivalent statement in the `sql/` directory and run it in the Athena console.

In [None]:
# Fetch a 500-row sample of one product category, ordered by review_id,
# from the sentiment table and load it into a DataFrame.
sentiment_sample_query = 'SELECT customer_id, \
                         review_id, \
                         product_id, \
                         product_title, \
                         review_headline, \
                         review_body, \
                         review_date, \
                         year, \
                         star_rating, \
                         sentiment, \
                         product_category \
                FROM {}.{} \
                WHERE product_category = \'Digital_Video_Download\' \
                ORDER BY review_id \
                LIMIT 500'.format(database_name, sentiment_table_name)

cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(sentiment_sample_query)
df = as_pandas(cursor)

In [None]:
# Preview the first five sampled reviews.
df.head(n=5)

In [None]:
# Class balance of the sample: numeric counts first, then a bar chart.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)
sns.countplot(data=df, x='sentiment')

### Handling imbalanced datasets

Here you can see we have a larger number of `positive` samples than `negative` ones. There are a number of techniques to balance this dataset; the two most popular approaches are under-sampling and over-sampling. With under-sampling you remove rows from the majority class to balance the dataset. With over-sampling you duplicate entries from the minority class, which could lead to overfitting. A full discussion is beyond the scope of this lab. You will under-sample the data to balance the dataset, but you can find more information in the [imbalanced-learn documentation](https://imbalanced-learn.org/stable/).

In [None]:
from sklearn.utils import resample

# Split the sample by class label.
positive = df[df['sentiment'] == 1]
negative = df[df['sentiment'] == 0]

# Under-sample whichever class is larger. Sampling without replacement can
# only shrink a class, so always down-sampling `positive` would raise a
# ValueError whenever the negatives outnumber the positives.
if len(positive) >= len(negative):
    majority, minority = positive, negative
else:
    majority, minority = negative, positive

majority_downsampled = resample(majority,
                                replace=False,             # sample without replacement
                                n_samples=len(minority),   # match the minority class size
                                random_state=27)           # reproducible results

# Combine the down-sampled majority class with the untouched minority class.
df = pd.concat([majority_downsampled, minority])

# Check the resulting class counts.
print(df['sentiment'].value_counts())

sns.countplot(x='sentiment', data=df)

### Create Test, Train, and Validation Datasets

Depending on the framework you are leveraging in your AI/ML workloads you may decide to split the data into test, train, and validate splits before uploading to S3. You can leverage some built in functions in the sklearn package to do the split. To learn more about the sklearn framework click [here](https://scikit-learn.org/stable/).

In [None]:
from sklearn.model_selection import train_test_split

# 80% train; the remaining 20% is halved into test and validation (10% each).
train, holdout = train_test_split(df, test_size=0.2, random_state=0)
test, validate = train_test_split(holdout, test_size=0.5, random_state=0)

for split_name, split_frame in (('training', train),
                                ('testing', test),
                                ('validation', validate)):
    print(f'Number of {split_name} examples: {len(split_frame.index)}')


### Visualize the Train, Test, and Validation Split

In [None]:
# Pie chart of the split sizes; slices are drawn counter-clockwise in the
# order listed here, starting at 90 degrees.
named_splits = [('Train', train), ('Validation', validate), ('Test', test)]
labels = [split_name for split_name, _ in named_splits]
sizes = [len(split_frame.index) for _, split_frame in named_splits]
explode = (0.1, 0, 0)  # offset the first (Train) slice

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode, startangle=90, autopct='%1.1f%%')

# An equal aspect ratio makes the pie render as a circle.
ax1.axis('equal')

plt.show()

In [None]:
# df = df[['customer_id', 'product_id', 'product_title', 'star_rating', 'review_date']]

In [None]:
# df.head(5)

In [None]:
# df.shape

In [None]:
# Frequency tables: one entry per distinct customer / product id.
customers = df['customer_id'].value_counts()
products = df['product_id'].value_counts()

num_customers = customers.count()
num_products = products.count()
# Sum of the two distinct-id counts.
num_features = num_customers + num_products

for total in (num_customers, num_products, num_features):
    print(total)

TODO:  Filter out customers who haven't rated many movies

In [None]:
# Minimum-activity thresholds are currently disabled; enable them to drop
# rarely seen customers / products before the joins below.
#customers = customers[customers >= 5]
#products = products[products >= 10]

# Inner-join against the retained ids so only their rows remain.
kept_customers = pd.DataFrame({'customer_id': customers.index})
kept_products = pd.DataFrame({'product_id': products.index})
reduced_df = df.merge(kept_customers).merge(kept_products)
reduced_df

Create a sequential index for customers and movies

In [None]:
# Recompute the frequency tables from the filtered frame (rather than from
# the unfiltered `df`) so the indices built next only cover retained ids.
customers = reduced_df['customer_id'].value_counts()
products = reduced_df['product_id'].value_counts()

for id_counts in (customers, products):
    print(id_counts)

Preserve the index for later use

## Balancing dataset
[TODO] add this here

In [None]:
n_customers = len(customers)
n_products = len(products)

# Customers take column positions 0 .. n_customers - 1.
customer_index = pd.DataFrame({'customer_id': customers.index,
                               'user': np.arange(n_customers)})
print(customer_index.shape)

# Products follow directly after the customer block.
product_index = pd.DataFrame({'product_id': products.index,
                              'item': np.arange(n_customers, n_customers + n_products)})
print(product_index.shape)

In [None]:
# Attach the integer `user` and `item` columns from the two lookup tables.
with_user = reduced_df.merge(customer_index)
reduced_df = with_user.merge(product_index)
reduced_df.head()

Count days since first review (included as a feature to capture trend)

In [None]:
# Parse the dates, then find each customer's earliest review date.
reduced_df['review_date'] = pd.to_datetime(reduced_df['review_date'])
customer_first_date = (
    reduced_df.groupby('customer_id')['review_date']
    .min()
    .rename('first_review_date')
    .reset_index()
)

In [None]:
# Days elapsed between each review and that customer's first review;
# missing values are replaced with 0.
reduced_df = reduced_df.merge(customer_first_date)
elapsed = reduced_df['review_date'] - reduced_df['first_review_date']
reduced_df['days_since_first'] = elapsed.dt.days.fillna(0)

Split into train and test datasets

In [None]:
# One row per customer, built from the last entry of each customer's group in
# the current row order of reduced_df.
# NOTE(review): reduced_df is not sorted by review_date here, so "last" is
# presumably not the most recent review -- confirm. pandas' GroupBy.last() is
# also documented to pick the last non-null value per column, which may mix
# values from different rows when nulls are present -- verify.
# NOTE(review): this puts a single review per customer into the training set
# and everything else into the test set (next cell); confirm that this
# direction is intended.
train_df = reduced_df.groupby('customer_id').last().reset_index()
train_df

In [None]:
# Anti-join: keep the rows of reduced_df whose (customer_id, product_id)
# pair does not appear in train_df.
train_pairs = train_df[['customer_id', 'product_id']]
pair_keys = ['customer_id', 'product_id']
flagged = reduced_df.merge(train_pairs, on=pair_keys, how='outer', indicator=True)
not_in_train = flagged['_merge'] == 'left_only'
test_df = flagged[not_in_train]
test_df

- Factorization machines expects data to look something like:
  - Sparse matrix
  - Target variable is that user's rating for a movie
  - One-hot encoding for users ($N$ features)
  - One-hot encoding for movies ($M$ features)

|Rating|User1|User2|...|UserN|Movie1|Movie2|Movie3|...|MovieM|Feature1|Feature2|...|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|4|1|0|...|0|1|0|0|...|0|20|2.2|...|
|5|1|0|...|0|0|1|0|...|0|17|9.1|...|
|3|0|1|...|0|1|0|0|...|0|3|11.0|...|
|4|0|1|...|0|0|0|1|...|0|15|6.4|...|

In [None]:
from scipy.sparse import csr_matrix

def to_csr_matrix(df, num_users, num_items):
    """Encode interactions as a sparse feature matrix.

    Each row of ``df`` becomes one matrix row with three stored entries:
    a 1 in the column given by ``user``, a 1 in the column given by ``item``
    and the ``days_since_first`` value in the final column.

    Args:
        df: DataFrame with ``user``, ``item`` and ``days_since_first`` columns.
        num_users: Number of user columns.
        num_items: Number of item columns.

    Returns:
        A float32 ``csr_matrix`` of shape
        ``(len(df), num_users + num_items + 1)``.
    """
    n_rows = df.shape[0]
    n_cols = num_users + num_items + 1
    last_col = n_cols - 1  # column that holds days_since_first

    indicator = np.ones(n_rows, dtype=int)
    values = np.concatenate([indicator, indicator, df['days_since_first'].values])
    # Every row index appears three times: user flag, item flag, day count.
    row_ids = np.tile(np.arange(n_rows), 3)
    col_ids = np.concatenate([df['user'].values,
                              df['item'].values,
                              np.full(n_rows, last_col, dtype=int)])

    return csr_matrix((values, (row_ids, col_ids)),
                      shape=(n_rows, n_cols),
                      dtype=np.float32)

In [None]:
# Sparse training features: user block, item block, then the day-count column.
train_csr = to_csr_matrix(train_df,
                          num_users=customer_index.shape[0],
                          num_items=product_index.shape[0])
print(train_csr)

In [None]:
# Sparse test features, using the same column layout as the training matrix.
test_csr = to_csr_matrix(test_df,
                         num_users=customer_index.shape[0],
                         num_items=product_index.shape[0])
print(test_csr)

Convert to sparse recordIO-wrapped protobuf that SageMaker factorization machines expects

In [None]:
import io
import sagemaker.amazon.common as smac

fm_prefix = 'factorization-machines'

def upload_to_s3_as_protobuf(csr, label, bucket, prefix, channel, splits):
    """Serialise a sparse matrix and its labels and upload them to S3 in parts.

    The rows are divided into ``splits`` contiguous chunks. Each non-empty
    chunk is written with ``smac.write_spmatrix_to_sparse_tensor`` and
    uploaded to ``s3://<bucket>/<prefix>/<channel>/data-<i>``, where ``i`` is
    the chunk's position.

    Args:
        csr: Sparse feature matrix, one row per example.
        label: Array of labels, indexable by the same row numbers as ``csr``.
        bucket: Destination S3 bucket name.
        prefix: Key prefix under the bucket.
        channel: Sub-folder under the prefix (e.g. 'train' or 'test').
        splits: Number of chunks to divide the rows into.
    """
    # One client serves every part; creating it per iteration is wasted work.
    s3_client = boto3.client('s3')

    row_chunks = np.array_split(np.arange(csr.shape[0]), splits)
    for part, rows in enumerate(row_chunks):
        # array_split yields empty chunks when splits exceeds the row count;
        # skip them instead of uploading zero-record objects.
        if rows.size == 0:
            continue

        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(buf, csr[rows, ], label[rows])
        buf.seek(0)

        s3_client.upload_fileobj(buf, bucket, '{}/{}/data-{}'.format(prefix, channel, part))

In [None]:
# The star rating is the target; it is cast to float32 before upload.
train_labels = train_df['star_rating'].values.astype(np.float32)
upload_to_s3_as_protobuf(train_csr, train_labels, bucket, fm_prefix,
                         channel='train', splits=10)

test_labels = test_df['star_rating'].values.astype(np.float32)
upload_to_s3_as_protobuf(test_csr, test_labels, bucket, fm_prefix,
                         channel='test', splits=1)