In [None]:
# Install Python packages into the notebook environment (-q suppresses most pip output).
# NOTE(review): boto3 is the only package here without a pinned version — consider pinning it.
!pip install -q boto3
!pip install -q xgboost==0.90
!pip install -q scikit-learn==0.20.3
!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# Shared handles used by later cells: the SageMaker session, the AWS region,
# the notebook's execution role, and the session's default S3 bucket.
sess = sagemaker.Session()
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

In [None]:
# Copy the gzipped Digital Software reviews TSV from the session's default bucket into ./data/.
# NOTE(review): assumes the file was staged under this bucket's amazon-reviews-pds/tsv/ prefix beforehand — confirm.
!aws s3 cp 's3://{bucket}/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

In [None]:
import csv

# Parse the gzipped, tab-separated reviews file. QUOTE_NONE tells the parser
# to treat quote characters as ordinary text instead of field delimiters.
df = pd.read_csv(
    './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
    sep='\t',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
)
df.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

# Clean commas from raw text
_Note:  This is not needed as the data does not currently contain commas._

In [None]:
# Replace every comma in the text/identifier columns with a space.
#
# A plain `Series.replace(',', ' ')` only matches cells whose ENTIRE value is
# ',' and therefore leaves commas embedded in longer strings untouched;
# `regex=True` makes the replacement apply to substrings.
#
# Work on a copy so `df` keeps the data exactly as loaded and this cell gives
# the same result every time it is re-run.
comma_scrub_columns = [
    'marketplace',
    'review_id',
    'product_id',
    'product_title',
    'product_category',
    'review_headline',
    'review_body',
    'review_date',
]

df_scrubbed_raw = df.copy()
df_scrubbed_raw[comma_scrub_columns] = df_scrubbed_raw[comma_scrub_columns].replace(',', ' ', regex=True)

df_scrubbed_raw.shape

In [None]:
# Preview the first five rows after the comma-scrubbing cell (head() defaults to n=5).
df_scrubbed_raw.head()

# Drop NaNs

In [None]:
# True if any cell in the frame is missing (isnull is an alias of isna).
df_scrubbed_raw.isnull().values.any()

In [None]:
# Discard every row containing a missing value, then renumber the index from 0.
df_scrubbed_raw = (
    df_scrubbed_raw
    .dropna()
    .reset_index(drop=True)
)
df_scrubbed_raw.shape

In [None]:
# Preview the first five rows once rows with missing values are gone.
df_scrubbed_raw.head()

### Enrich the data with `is_positive_sentiment` label
* 1 (positive):  `star_rating >= 4`
* 0 (negative):  `star_rating < 4`

In [None]:
# Binary label: 1 when star_rating >= 4, otherwise 0.
df_is_positive_sentiment = (df_scrubbed_raw['star_rating'] >= 4).astype(int)

# DataFrame.insert raises ValueError when the column already exists, so drop a
# label column left behind by an earlier run to keep this cell re-runnable.
if 'is_positive_sentiment' in df_scrubbed_raw.columns:
    df_scrubbed_raw = df_scrubbed_raw.drop(columns=['is_positive_sentiment'])

# Put the label in the first column position.
df_scrubbed_raw.insert(0, 'is_positive_sentiment', df_is_positive_sentiment)
df_scrubbed_raw.shape

In [None]:
# Preview the first five rows, now including the is_positive_sentiment column.
df_scrubbed_raw.head()

# Split the data into `train` and `test` datasets

In [None]:
from sklearn.model_selection import train_test_split

df_unbalanced_raw = df_scrubbed_raw

print('df_unbalanced_raw.shape={}'.format(df_unbalanced_raw.shape))

# Split all data into 90% train and 10% test.
# - stratify keeps the positive/negative ratio the same in both splits.
# - random_state fixes the shuffle so re-running the notebook reproduces the
#   same train/test membership (and therefore the same files written below).
df_unbalanced_raw_train, df_unbalanced_raw_test = train_test_split(
    df_unbalanced_raw,
    test_size=0.1,
    stratify=df_unbalanced_raw['is_positive_sentiment'],
    random_state=42,
)

# Renumber each split from 0; the shuffled original row labels are discarded.
df_unbalanced_raw_train = df_unbalanced_raw_train.reset_index(drop=True)
df_unbalanced_raw_test = df_unbalanced_raw_test.reset_index(drop=True)

print('df_unbalanced_raw_train.shape={}'.format(df_unbalanced_raw_train.shape))
print('df_unbalanced_raw_test.shape={}'.format(df_unbalanced_raw_test.shape))

## Write the data files locally

In [None]:
import os

# Each prefix serves both as the local directory (relative to the working
# directory) and, in the upload cell, as the S3 key prefix.
prefix_unbalanced_raw_train = 'feature-store/amazon-reviews-autopilot/raw-labeled-split-unbalanced-header-train-csv'
unbalanced_raw_with_header_train_path = './' + prefix_unbalanced_raw_train + '/data.csv'
os.makedirs(prefix_unbalanced_raw_train, exist_ok=True)

prefix_unbalanced_raw_test = 'feature-store/amazon-reviews-autopilot/raw-labeled-split-unbalanced-header-test-csv'
unbalanced_raw_with_header_test_path = './' + prefix_unbalanced_raw_test + '/data.csv'
os.makedirs(prefix_unbalanced_raw_test, exist_ok=True)

# Write only the label and the review text, with a header row and no index column.
df_unbalanced_raw_train.to_csv(
    unbalanced_raw_with_header_train_path,
    columns=['is_positive_sentiment', 'review_body'],
    header=True,
    index=False,
)
df_unbalanced_raw_test.to_csv(
    unbalanced_raw_with_header_test_path,
    columns=['is_positive_sentiment', 'review_body'],
    header=True,
    index=False,
)

# Alternative (disabled): write every column instead of just label + review_body.
# df_unbalanced_raw_train.to_csv(unbalanced_raw_with_header_train_path, index=False, header=True)
# df_unbalanced_raw_test.to_csv(unbalanced_raw_with_header_test_path, index=False, header=True)


In [None]:
# Upload both unbalanced CSVs through the SageMaker session and show the returned S3 URIs.
unbalanced_raw_with_header_train_s3_uri = sess.upload_data(
    key_prefix=prefix_unbalanced_raw_train,
    path=unbalanced_raw_with_header_train_path,
)
unbalanced_raw_with_header_test_s3_uri = sess.upload_data(
    key_prefix=prefix_unbalanced_raw_test,
    path=unbalanced_raw_with_header_test_path,
)

print(unbalanced_raw_with_header_train_s3_uri, unbalanced_raw_with_header_test_s3_uri, sep='\n')


In [None]:
# List the uploaded unbalanced train and test objects to confirm they exist in S3.
!aws s3 ls $unbalanced_raw_with_header_train_s3_uri
!aws s3 ls $unbalanced_raw_with_header_test_s3_uri

# View the Class Imbalance (Train)

### Note:  You may need to run this next cell twice


In [None]:
import seaborn as sns

# Bar chart of row counts per label in the unbalanced training split.
sns.countplot(data=df_unbalanced_raw_train, x='is_positive_sentiment')


In [None]:
# Count training rows per label using boolean masks.
is_positive_sentiment_count_unbalanced_raw_train = len(
    df_unbalanced_raw_train[df_unbalanced_raw_train['is_positive_sentiment'] == 1]
)
is_negative_sentiment_count_unbalanced_raw_train = len(
    df_unbalanced_raw_train[df_unbalanced_raw_train['is_positive_sentiment'] == 0]
)

# Report both counts and how many positive rows there are per negative row.
print('Majority (positive) count: {}'.format(is_positive_sentiment_count_unbalanced_raw_train))
print('Minority (negative) count: {}'.format(is_negative_sentiment_count_unbalanced_raw_train))
print('Ratio of Majority to Minority: {}'.format(
    is_positive_sentiment_count_unbalanced_raw_train / is_negative_sentiment_count_unbalanced_raw_train
))

# Balance the Classes (Train-Split Only)

In [None]:
from sklearn.utils import resample

# Partition the training split by label.
is_negative_sentiment_unbalanced_raw_train_df = df_unbalanced_raw_train.query('is_positive_sentiment == 0')
is_positive_sentiment_unbalanced_raw_train_df = df_unbalanced_raw_train.query('is_positive_sentiment == 1')

# TODO:  Check which sentiment has the least number of samples
#        We know we have more positive than negative sentiment samples in our case.
#        If negatives ever outnumber positives, the call below asks for more rows
#        than exist and resample(replace=False) raises ValueError.

# Downsample the positive rows, without replacement, to the number of negative
# rows. random_state pins which rows are kept so the balanced set (and the
# file written from it) is reproducible across runs.
is_positive_downsampled_raw_train_df = resample(is_positive_sentiment_unbalanced_raw_train_df,
                                                replace = False,
                                                n_samples = len(is_negative_sentiment_unbalanced_raw_train_df),
                                                random_state = 42)

# Stack negatives followed by the sampled positives and renumber the index from 0.
df_balanced_train_raw = pd.concat([is_negative_sentiment_unbalanced_raw_train_df, is_positive_downsampled_raw_train_df])
df_balanced_train_raw = df_balanced_train_raw.reset_index(drop=True)

In [None]:
import seaborn as sns

# Bar chart of row counts per label in the balanced training set.
sns.countplot(data=df_balanced_train_raw, x='is_positive_sentiment')


## Write the data files locally (Balanced, Train)


In [None]:
import os

# The prefix is both the local directory and, in the upload cell, the S3 key prefix.
prefix_balanced_raw_train = 'feature-store/amazon-reviews-autopilot/raw-labeled-split-balanced-header-train-csv'
balanced_raw_with_header_train_path = './' + prefix_balanced_raw_train + '/data.csv'

os.makedirs(prefix_balanced_raw_train, exist_ok=True)

# Write only the label and the review text, with a header row and no index column.
df_balanced_train_raw.to_csv(
    balanced_raw_with_header_train_path,
    columns=['is_positive_sentiment', 'review_body'],
    header=True,
    index=False,
)

# Alternative (disabled): write every column instead of just label + review_body.
#df_balanced_train_raw.to_csv(balanced_raw_with_header_train_path, index=False, header=True)


## Upload to S3 (Balanced, Train)
We will use the test set later to verify.

In [None]:
# Upload the balanced training CSV through the SageMaker session and show the returned S3 URI.
balanced_raw_with_header_train_s3_uri = sess.upload_data(
    key_prefix=prefix_balanced_raw_train,
    path=balanced_raw_with_header_train_path,
)

print(balanced_raw_with_header_train_s3_uri)


In [None]:
# List the uploaded balanced training object to confirm it exists in S3.
!aws s3 ls $balanced_raw_with_header_train_s3_uri


# Balance the Classes (Test-Split Only)

In [None]:
from sklearn.utils import resample

# Partition the test split by label.
is_negative_sentiment_unbalanced_raw_test_df = df_unbalanced_raw_test.query('is_positive_sentiment == 0')
is_positive_sentiment_unbalanced_raw_test_df = df_unbalanced_raw_test.query('is_positive_sentiment == 1')

# TODO:  Check which sentiment has the least number of samples
#        We know we have more positive than negative sentiment samples in our case.
#        If negatives ever outnumber positives, the call below asks for more rows
#        than exist and resample(replace=False) raises ValueError.

# Downsample the positive rows, without replacement, to the number of negative
# rows. random_state pins which rows are kept so the balanced test set (and the
# file written from it) is reproducible across runs.
is_positive_downsampled_raw_test_df = resample(is_positive_sentiment_unbalanced_raw_test_df,
                                               replace = False,
                                               n_samples = len(is_negative_sentiment_unbalanced_raw_test_df),
                                               random_state = 42)

# Stack negatives followed by the sampled positives and renumber the index from 0.
df_balanced_test_raw = pd.concat([is_negative_sentiment_unbalanced_raw_test_df, is_positive_downsampled_raw_test_df])
df_balanced_test_raw = df_balanced_test_raw.reset_index(drop=True)

In [None]:
# Preview the first five rows of the balanced test set (head() defaults to n=5).
df_balanced_test_raw.head()

In [None]:
import seaborn as sns

# Bar chart of row counts per label in the balanced test set.
sns.countplot(data=df_balanced_test_raw, x='is_positive_sentiment')


## Write the data files locally (Balanced, Test)


In [None]:
import os

# The prefix names the local directory that holds the balanced test CSV.
prefix_balanced_test_raw = 'feature-store/amazon-reviews-autopilot/raw-labeled-split-balanced-header-test-csv'
balanced_raw_with_header_test_path = './' + prefix_balanced_test_raw + '/data.csv'

os.makedirs(prefix_balanced_test_raw, exist_ok=True)

# Write only the label and the review text, with a header row and no index column.
df_balanced_test_raw.to_csv(
    balanced_raw_with_header_test_path,
    columns=['is_positive_sentiment', 'review_body'],
    header=True,
    index=False,
)

# Alternative (disabled): write every column instead of just label + review_body.
#df_balanced_test_raw.to_csv(balanced_raw_with_header_test_path, index=False, header=True)


## Upload to S3 (Balanced, Test)
We will use the test set later to verify.

In [None]:
# Upload the balanced test CSV through the SageMaker session and show the returned S3 URI.
# The key prefix must be `prefix_balanced_test_raw` — the name assigned together
# with the local path; `prefix_balanced_raw_test` is not assigned in the visible
# cells and raised NameError here.
balanced_raw_with_header_test_s3_uri = sess.upload_data(path=balanced_raw_with_header_test_path, key_prefix=prefix_balanced_test_raw)

print(balanced_raw_with_header_test_s3_uri)


In [None]:
# List the uploaded balanced test object to confirm it exists in S3.
!aws s3 ls $balanced_raw_with_header_test_s3_uri


In [None]:
# Recursively list the local feature-store directory to show every file written by this notebook.
!ls -R ./feature-store