In [None]:
!pip install -q boto3
!pip install -q xgboost==0.90
!pip install -q scikit-learn==0.20.3
!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session object; later cells in this notebook call sess.upload_data(...) on it.
sess   = sagemaker.Session()
# Default S3 bucket of the session; the raw reviews file is copied from this bucket below.
bucket = sess.default_bucket()
# Execution role of this notebook instance (not referenced again in the visible cells).
role = sagemaker.get_execution_role()
# Region name of the default boto3 session.
region = boto3.Session().region_name

In [None]:
!aws s3 cp 's3://{bucket}/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

In [None]:
import csv

# Load the gzip-compressed, tab-separated reviews file into a DataFrame.
# QUOTE_NONE makes the parser treat quote characters as ordinary text rather
# than as field delimiters.
df = pd.read_csv(
    './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
    sep='\t',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
)
df.shape

In [None]:
df.head(5)

# Clean commas from raw text
_Note:  This is not needed as the data does not currently contain commas._

In [None]:
# Work on a copy so the raw `df` loaded above is never mutated and this cell
# can be re-run safely.
df_scrubbed_raw = df.copy()

# Columns that hold free text / identifiers and may therefore contain commas.
text_columns = [
    'marketplace',
    'review_id',
    'product_id',
    'product_title',
    'product_category',
    'review_headline',
    'review_body',
    'review_date',
]

# A plain `Series.replace(',', ' ')` only matches cells whose *entire* value is
# ','; with regex=True every comma inside each string is substituted instead.
# Non-string values (including NaN) are left untouched.
df_scrubbed_raw[text_columns] = df_scrubbed_raw[text_columns].replace(',', ' ', regex=True)

df_scrubbed_raw.shape

In [None]:
df_scrubbed_raw.head(5)

# Drop NaNs

In [None]:
df_scrubbed_raw.isna().values.any()

In [None]:
# Remove every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 so later positional/index-based joins line up.
df_scrubbed_raw = (
    df_scrubbed_raw
    .dropna()
    .reset_index(drop=True)
)
df_scrubbed_raw.shape

In [None]:
df_scrubbed_raw.head(5)

### Enrich the data with `is_positive_sentiment` label
* `1` (positive):  `star_rating >= 4`
* `0` (negative):  `star_rating < 4`

In [None]:
# Binary label: 1 when the review has 4 or 5 stars, otherwise 0.
df_is_positive_sentiment = (df_scrubbed_raw['star_rating'] >= 4).astype(int)

# `DataFrame.insert` raises ValueError if the column already exists, which made
# this cell fail on a second run. Drop any stale label column first so the cell
# is idempotent; on a first run the drop is a no-op.
df_scrubbed_raw = df_scrubbed_raw.drop(columns=['is_positive_sentiment'], errors='ignore')

# The label goes into the first column (position 0).
df_scrubbed_raw.insert(0, 'is_positive_sentiment', df_is_positive_sentiment)
df_scrubbed_raw.shape

In [None]:
df_scrubbed_raw.head(5)

In [None]:
# Count the rows of each class with boolean masks; int(...) keeps the counts as
# plain Python ints so the ratio below is ordinary Python division.
sentiment_labels = df_scrubbed_raw['is_positive_sentiment']
is_positive_sentiment_count = int((sentiment_labels == 1).sum())
is_negative_sentiment_count = int((sentiment_labels == 0).sum())

majority_to_minority_ratio = is_positive_sentiment_count / is_negative_sentiment_count

print('Majority (positive) count: {}'.format(is_positive_sentiment_count))
print('Minority (negative) count: {}'.format(is_negative_sentiment_count))
print('Ratio of Majority to Minority: {}'.format(majority_to_minority_ratio))

### Note:  You may need to run this next cell twice


In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_scrubbed_raw)


# Split the data into `train`, `validation`, and `test` datasets

In [None]:
from sklearn.model_selection import train_test_split

# Both splits are stratified on the label so every subset keeps the class ratio
# of the full dataset. A fixed random_state (the same seed this notebook uses
# for `resample`) makes the splits reproducible across kernel restarts;
# without it every run wrote different train/validation/test files.

# Split all data into 90% train and 10% holdout
df_scrubbed_raw_train, df_scrubbed_raw_holdout = train_test_split(
    df_scrubbed_raw,
    test_size=0.1,
    stratify=df_scrubbed_raw['is_positive_sentiment'],
    random_state=27,
)
df_scrubbed_raw_train = df_scrubbed_raw_train.reset_index(drop=True)
df_scrubbed_raw_holdout = df_scrubbed_raw_holdout.reset_index(drop=True)

# Split the holdout into 50% validation and 50% test
df_scrubbed_raw_validation, df_scrubbed_raw_test = train_test_split(
    df_scrubbed_raw_holdout,
    test_size=0.5,
    stratify=df_scrubbed_raw_holdout['is_positive_sentiment'],
    random_state=27,
)
# Indexes are reset so each split is numbered 0..n-1; later cells rely on that
# when they re-attach the label column to freshly built feature frames.
df_scrubbed_raw_validation = df_scrubbed_raw_validation.reset_index(drop=True)
df_scrubbed_raw_test = df_scrubbed_raw_test.reset_index(drop=True)

print('df_scrubbed_raw.shape={}'.format(df_scrubbed_raw.shape))
print('df_scrubbed_raw_train.shape={}'.format(df_scrubbed_raw_train.shape))
print('df_scrubbed_raw_validation.shape={}'.format(df_scrubbed_raw_validation.shape))
print('df_scrubbed_raw_test.shape={}'.format(df_scrubbed_raw_test.shape))

## Write the data files locally

In [None]:
import os

# Each prefix doubles as the local output directory and (in the upload cell) the S3 key prefix.
prefix_unbalanced_raw_train = 'feature-store/amazon-reviews-notebook/raw-labeled-split-unbalanced-header-train-csv'
prefix_unbalanced_raw_validation = 'feature-store/amazon-reviews-notebook/raw-labeled-split-unbalanced-header-validation-csv'
prefix_unbalanced_raw_test = 'feature-store/amazon-reviews-notebook/raw-labeled-split-unbalanced-header-test-csv'

scrubbed_raw_with_header_train_path = './{}/part-00000.csv'.format(prefix_unbalanced_raw_train)
scrubbed_raw_with_header_validation_path = './{}/part-00000.csv'.format(prefix_unbalanced_raw_validation)
scrubbed_raw_with_header_test_path = './{}/part-00000.csv'.format(prefix_unbalanced_raw_test)

# For every split: make sure the directory exists, then write only the label
# and the review text as a CSV with a header row and without the index.
label_and_text_columns = ['is_positive_sentiment', 'review_body']
for split_frame, split_prefix, split_path in [
    (df_scrubbed_raw_train, prefix_unbalanced_raw_train, scrubbed_raw_with_header_train_path),
    (df_scrubbed_raw_validation, prefix_unbalanced_raw_validation, scrubbed_raw_with_header_validation_path),
    (df_scrubbed_raw_test, prefix_unbalanced_raw_test, scrubbed_raw_with_header_test_path),
]:
    os.makedirs(split_prefix, exist_ok=True)
    split_frame[label_and_text_columns].to_csv(split_path, index=False, header=True)



In [None]:
pd.read_csv(scrubbed_raw_with_header_train_path)

# Upload to S3

In [None]:
# Upload each locally written split file through the SageMaker session, using
# the same prefix that names the local directory as the S3 key prefix.
# upload_data's return value is kept as the "s3_uri" and passed to `aws s3 ls`
# in the next cell.
scrubbed_raw_with_header_train_s3_uri = sess.upload_data(path=scrubbed_raw_with_header_train_path, key_prefix=prefix_unbalanced_raw_train)
scrubbed_raw_with_header_validation_s3_uri = sess.upload_data(path=scrubbed_raw_with_header_validation_path, key_prefix=prefix_unbalanced_raw_validation)
scrubbed_raw_with_header_test_s3_uri = sess.upload_data(path=scrubbed_raw_with_header_test_path, key_prefix=prefix_unbalanced_raw_test)

# Echo the returned locations so they can be checked or copied.
print(scrubbed_raw_with_header_train_s3_uri)
print(scrubbed_raw_with_header_validation_s3_uri)
print(scrubbed_raw_with_header_test_s3_uri)


In [None]:
!aws s3 ls $scrubbed_raw_with_header_train_s3_uri
!aws s3 ls $scrubbed_raw_with_header_validation_s3_uri
!aws s3 ls $scrubbed_raw_with_header_test_s3_uri

# Balance the Dataset between Classes

In [None]:
from sklearn.utils import resample

is_negative_sentiment_df = df_scrubbed_raw.query('is_positive_sentiment == 0')
is_positive_sentiment_df = df_scrubbed_raw.query('is_positive_sentiment == 1')

# Balance by downsampling whichever class is larger to the size of the smaller
# one. The previous version always downsampled the positive class, which
# raises in `resample(replace=False)` whenever positives are the minority.
n_samples_per_class = min(len(is_negative_sentiment_df), len(is_positive_sentiment_df))

# A class that already has exactly `n_samples_per_class` rows is kept as-is
# (same rows, same order); only the larger class is sampled without replacement.
is_negative_downsampled_df = (
    is_negative_sentiment_df
    if len(is_negative_sentiment_df) == n_samples_per_class
    else resample(is_negative_sentiment_df,
                  replace = False,
                  n_samples = n_samples_per_class,
                  random_state = 27)
)
is_positive_downsampled_df = (
    is_positive_sentiment_df
    if len(is_positive_sentiment_df) == n_samples_per_class
    else resample(is_positive_sentiment_df,
                  replace = False,
                  n_samples = n_samples_per_class,
                  random_state = 27)
)

# Negatives first, then positives; renumber the rows 0..n-1.
df_balanced_raw = pd.concat([is_negative_downsampled_df, is_positive_downsampled_df])
df_balanced_raw = df_balanced_raw.reset_index(drop=True)

In [None]:
df_balanced_raw.head(5)

In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_balanced_raw)


## Split the data into `train`, `validation`, and `test` datasets

In [None]:
from sklearn.model_selection import train_test_split

# Both splits are stratified on the label so every subset stays balanced.
# A fixed random_state (the same seed used for `resample`) makes the splits
# reproducible across kernel restarts; without it every run produced
# different train/validation/test files.

# Split all data into 90% train and 10% holdout
df_balanced_raw_train, df_balanced_raw_holdout = train_test_split(
    df_balanced_raw,
    test_size=0.1,
    stratify=df_balanced_raw['is_positive_sentiment'],
    random_state=27,
)
df_balanced_raw_train = df_balanced_raw_train.reset_index(drop=True)
df_balanced_raw_holdout = df_balanced_raw_holdout.reset_index(drop=True)

# Split the holdout into 50% validation and 50% test
df_balanced_raw_validation, df_balanced_raw_test = train_test_split(
    df_balanced_raw_holdout,
    test_size=0.5,
    stratify=df_balanced_raw_holdout['is_positive_sentiment'],
    random_state=27,
)
# Indexes are reset so each split is numbered 0..n-1; later cells rely on that
# when they re-attach the label column to freshly built feature frames.
df_balanced_raw_validation = df_balanced_raw_validation.reset_index(drop=True)
df_balanced_raw_test = df_balanced_raw_test.reset_index(drop=True)

print('df_balanced_raw.shape={}'.format(df_balanced_raw.shape))
print('df_balanced_raw_train.shape={}'.format(df_balanced_raw_train.shape))
print('df_balanced_raw_validation.shape={}'.format(df_balanced_raw_validation.shape))
print('df_balanced_raw_test.shape={}'.format(df_balanced_raw_test.shape))

In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_balanced_raw_train)


In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_balanced_raw_validation)


In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_balanced_raw_test)


## Write the data files locally

In [None]:
import os

# Each prefix doubles as the local output directory and (in the upload cell) the S3 key prefix.
prefix_balanced_raw_train = 'feature-store/amazon-reviews-notebook/raw-labeled-split-balanced-header-train-csv'
prefix_balanced_raw_validation = 'feature-store/amazon-reviews-notebook/raw-labeled-split-balanced-header-validation-csv'
prefix_balanced_raw_test = 'feature-store/amazon-reviews-notebook/raw-labeled-split-balanced-header-test-csv'

balanced_raw_with_header_train_path = './{}/part-00000.csv'.format(prefix_balanced_raw_train)
balanced_raw_with_header_validation_path = './{}/part-00000.csv'.format(prefix_balanced_raw_validation)
balanced_raw_with_header_test_path = './{}/part-00000.csv'.format(prefix_balanced_raw_test)

# For every split: make sure the directory exists, then write only the label
# and the review text as a CSV with a header row and without the index.
label_and_text_columns = ['is_positive_sentiment', 'review_body']
for split_frame, split_prefix, split_path in [
    (df_balanced_raw_train, prefix_balanced_raw_train, balanced_raw_with_header_train_path),
    (df_balanced_raw_validation, prefix_balanced_raw_validation, balanced_raw_with_header_validation_path),
    (df_balanced_raw_test, prefix_balanced_raw_test, balanced_raw_with_header_test_path),
]:
    os.makedirs(split_prefix, exist_ok=True)
    split_frame[label_and_text_columns].to_csv(split_path, index=False, header=True)



In [None]:
pd.read_csv(balanced_raw_with_header_train_path)

# Upload to S3

In [None]:
# Upload each locally written balanced split file through the SageMaker
# session, using the same prefix that names the local directory as the S3 key
# prefix. upload_data's return value is kept as the "s3_uri" and passed to
# `aws s3 ls` in the next cell.
balanced_raw_with_header_train_s3_uri = sess.upload_data(path=balanced_raw_with_header_train_path, key_prefix=prefix_balanced_raw_train)
balanced_raw_with_header_validation_s3_uri = sess.upload_data(path=balanced_raw_with_header_validation_path, key_prefix=prefix_balanced_raw_validation)
balanced_raw_with_header_test_s3_uri = sess.upload_data(path=balanced_raw_with_header_test_path, key_prefix=prefix_balanced_raw_test)

# Echo the returned locations so they can be checked or copied.
print(balanced_raw_with_header_train_s3_uri)
print(balanced_raw_with_header_validation_s3_uri)
print(balanced_raw_with_header_test_s3_uri)


In [None]:
!aws s3 ls $balanced_raw_with_header_train_s3_uri
!aws s3 ls $balanced_raw_with_header_validation_s3_uri
!aws s3 ls $balanced_raw_with_header_test_s3_uri

# Transform the raw text into TF/IDF features

In [None]:
# Use TruncatedSVD vs. PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

def feature_transform_fn(df_text, column_name, num_components, random_state=None):
    """Build an (unfitted) TF/IDF -> TruncatedSVD -> StandardScaler pipeline.

    Args:
        df_text: DataFrame containing the text column. It is only used to look
            up the positional index of `column_name`; no data is read here.
        column_name: Name of the text column in `df_text` to vectorize.
        num_components: Number of output dimensions kept by TruncatedSVD.
        random_state: Optional seed forwarded to TruncatedSVD. The default
            (None) keeps the previous, unseeded behaviour; pass an int to get
            repeatable features from run to run.

    Returns:
        An unfitted sklearn `Pipeline` with the steps 'column_transformer',
        'dimension_reducer' and 'standard_scaler'. Call `fit_transform` on the
        training frame and `transform` on any other frame.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.25,        # float: proportion-of-documents upper cut-off for a term
        min_df=.0025,       # float: proportion-of-documents lower cut-off for a term
        analyzer='word',
        max_features=10000
    )

    text_processors = Pipeline(
        steps=[
            ('tfidfvectorizer', tfidf_vectorizer)
        ]
    )

    # The column is selected with a scalar position (not a list) so that the
    # vectorizer receives a 1-D column of strings rather than a 2-D frame.
    text_column_position = df_text.columns.get_loc(column_name)
    column_transformer = ColumnTransformer(
        transformers=[('text_processing', text_processors, text_column_position)]
    )

    pipeline = Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('dimension_reducer', TruncatedSVD(n_components=num_components, random_state=random_state)),
            ('standard_scaler', StandardScaler())
        ]
    )

    return pipeline

# [ALL DATA for AutoPilot] Unbalanced Raw with TF/IDF

# Train - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# Features (X): one-column DataFrame with the review text.
# Labels (y): Series with the 0/1 sentiment label.
feature_columns = ['review_body']
df_X_scrubbed_raw = df_scrubbed_raw[feature_columns]
df_y_scrubbed_raw = df_scrubbed_raw['is_positive_sentiment']

df_X_scrubbed_raw.shape

In [None]:
# Build the TF/IDF pipeline for the full dataset, fit it, and wrap the
# resulting feature matrix (300 components per review) in a DataFrame.
pipeline_scrubbed = feature_transform_fn(
    df_text=df_X_scrubbed_raw,
    column_name='review_body',
    num_components=300,
)
np_tfidf_scrubbed = pipeline_scrubbed.fit_transform(df_X_scrubbed_raw)
df_tfidf_scrubbed = pd.DataFrame(data=np_tfidf_scrubbed)
df_tfidf_scrubbed.shape

## Show the learned TF/IDF features for each review

In [None]:
# Walk down the pipeline definition to the TfidfVectorizer:
# pipeline -> 'column_transformer' -> first transformer tuple -> its estimator
# (the inner text pipeline) -> 'tfidfvectorizer'.
column_transformer_step = pipeline_scrubbed.named_steps['column_transformer']
text_processing_pipeline = column_transformer_step.transformers[0][1]
vectorizer_tfidf = text_processing_pipeline.named_steps['tfidfvectorizer']

# Fit that vectorizer directly on the review text to get the raw TF/IDF matrix
# (before dimensionality reduction and scaling).
vectorizer_tfidf_fitted = vectorizer_tfidf.fit_transform(df_scrubbed_raw['review_body'])

In [None]:
# Densify the TF/IDF matrix and label each column with the term it represents.
df_vectorizer_tfidf = pd.DataFrame(
    vectorizer_tfidf_fitted.toarray(),
    columns=vectorizer_tfidf.get_feature_names(),
)
df_vectorizer_tfidf.shape

In [None]:
df_vectorizer_tfidf.head(5)

In [None]:
# Put each review's text next to its TF/IDF term weights by joining the two
# frames on their row index.
review_text = df_scrubbed_raw['review_body']
df_scrubbed_raw_with_tfidf = pd.merge(left=review_text,
                                      right=df_vectorizer_tfidf,
                                      right_index=True,
                                      left_index=True)
df_scrubbed_raw_with_tfidf.head(5)

In [None]:
#vectorizer = pipeline.named_steps['column_transformer'].transformers[0][1].named_steps['tfidfvectorizer']
#vectorizer.fit_transform(df_X_balanced_raw)
#vectorizer.get_feature_names()

In [None]:
#vectorizer = TfidfVectorizer(stop_words='english')
#df_tfidf = vectorizer.fit_transform(text)

#feature_names = vectorizer.get_feature_names()
#print(feature_names)

#df_tfidf = pd.DataFrame(vectorizer.idf_, 
#                        index=vectorizer.get_feature_names(),
#                        columns=['idf'])

#df_tfidf = pd.DataFrame(X.todense(),columns=vectorizer.get_feature_names())
#df_tfidf.head(5)

### Use t-SNE to Visualize the Data

_Note:  This takes a while.  Please be patient._

In [None]:
# from sklearn.manifold import TSNE

# tsne_model_train = TSNE(perplexity=10, n_components=2, init='pca', n_iter=250, random_state=10)
# np_tsne_train = tsne_model_train.fit_transform(df_vectorizer_tfidf)


In [None]:
# import matplotlib.pyplot as plt

# # column 0-299
# labels = df_tdf_vectorizer_tfidf.columns.values
# x = []
# y = []
# for value in np_tsne:
#     x.append(value[0])
#     y.append(value[1])

# plt.figure(figsize=(16, 16)) 
# # TODO:  Check the -1
# for i in range(len(x) - 1):
#     plt.scatter(x[i],y[i])
#     plt.annotate(labels[i],
#                  xy=(x[i], y[i]),
#                  xytext=(5, 2),
#                  textcoords='offset points',
#                  ha='right',
#                  va='bottom')
# plt.show()

## Combine the features with the labels 

In [None]:
df_tfidf_scrubbed.head(5)

In [None]:
df_y_scrubbed_raw.head(5)

In [None]:
# Put the label into the first column of the feature frame (rows are matched
# on the index).
df_tfidf_scrubbed.insert(loc=0, column='is_positive_sentiment', value=df_y_scrubbed_raw)
df_tfidf_scrubbed.shape

In [None]:
df_tfidf_scrubbed.head(5)

## Write the data locally

In [None]:
import os

# Local directory / S3 key prefix for the TF/IDF features of the full
# (unsplit, unbalanced) dataset. This cell and the upload cell after it both
# read `prefix_scrubbed`, but it was never assigned (a copy-pasted line
# re-assigned `prefix_unbalanced_raw_train` instead), so the cell failed with
# a NameError.
# NOTE(review): the prefix value follows the naming pattern of the other
# prefixes in this notebook - confirm it matches what downstream consumers expect.
prefix_scrubbed = 'feature-store/amazon-reviews-notebook/tfidf-labeled-unbalanced-noheader-csv'

scrubbed_tfidf_without_header_path = './{}/part-00000.csv'.format(prefix_scrubbed)

os.makedirs(prefix_scrubbed, exist_ok=True)

# Label is in the first column; written without header row and without index.
df_tfidf_scrubbed.to_csv(scrubbed_tfidf_without_header_path, index=False, header=None)


# Upload to S3

In [None]:
# Upload the header-less TF/IDF file written by the previous cell. Both
# `scrubbed_tfidf_without_header_path` and `prefix_scrubbed` must have been
# defined by that cell before this one is run.
df_scrubbed_tfidf_without_header_s3_uri = sess.upload_data(path=scrubbed_tfidf_without_header_path, key_prefix=prefix_scrubbed)

# Echo the returned location so it can be checked or copied.
print(df_scrubbed_tfidf_without_header_s3_uri)


In [None]:
!aws s3 ls $df_scrubbed_tfidf_without_header_s3_uri

# [Separate Train, Validation, Test] Convert Scrubbed Raw Train, Validation, Test into TF/IDF

In [None]:
# Features (X), # Labels (y)
df_X_scrubbed_raw_train = df_scrubbed_raw_train[['review_body']]
df_y_scrubbed_raw_train = df_scrubbed_raw_train['is_positive_sentiment']

df_X_scrubbed_raw_validation = df_scrubbed_raw_validation[['review_body']]
df_y_scrubbed_raw_validation = df_scrubbed_raw_validation['is_positive_sentiment']

df_X_scrubbed_raw_test = df_scrubbed_raw_test[['review_body']]
df_y_scrubbed_raw_test = df_scrubbed_raw_test['is_positive_sentiment']

print('df_scrubbed_X_raw_train.shape:  {}'.format(df_X_scrubbed_raw_train.shape))
print('df_scrubbed_y_raw_train.shape:  {}'.format(df_y_scrubbed_raw_train.shape))
print('df_scrubbed_X_raw_validation.shape:  {}'.format(df_X_scrubbed_raw_validation.shape))
print('df_scrubbed_y_raw_validation.shape:  {}'.format(df_y_scrubbed_raw_validation.shape))
print('df_scrubbed_X_raw_test.shape:  {}'.format(df_X_scrubbed_raw_test.shape))
print('df_scrubbed_y_raw_test.shape:  {}'.format(df_y_scrubbed_raw_test.shape))

In [None]:
df_X_scrubbed_raw_train.head(5)

In [None]:
df_y_scrubbed_raw_train.head(5)

# Train - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# Build the TF/IDF pipeline for the unbalanced training split, fit it on that
# split, and wrap the resulting feature matrix in a DataFrame.
pipeline_scrubbed_train = feature_transform_fn(
    df_text=df_X_scrubbed_raw_train,
    column_name='review_body',
    num_components=300,
)
np_tfidf_scrubbed_train = pipeline_scrubbed_train.fit_transform(df_X_scrubbed_raw_train)
df_tfidf_scrubbed_train = pd.DataFrame(data=np_tfidf_scrubbed_train)
df_tfidf_scrubbed_train.shape

In [None]:
df_tfidf_scrubbed_train.head(5)

In [None]:
df_y_scrubbed_raw_train.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the training feature frame (rows are
# matched on the index).
df_tfidf_scrubbed_train.insert(loc=0, column='is_positive_sentiment', value=df_y_scrubbed_raw_train)
df_tfidf_scrubbed_train.shape

In [None]:
df_tfidf_scrubbed_train.head(5)

# Validation - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# The validation split must be projected with the pipeline that was fitted on
# the TRAINING split. Fitting a fresh pipeline here (as before) learns a
# different vocabulary, different SVD components and different scaling, so the
# validation columns would not mean the same thing as the training columns.
# The name is kept as an alias of the train-fitted pipeline for any code that
# still refers to it.
pipeline_scrubbed_validation = pipeline_scrubbed_train
np_tfidf_scrubbed_validation = pipeline_scrubbed_validation.transform(df_X_scrubbed_raw_validation)
df_tfidf_scrubbed_validation = pd.DataFrame(np_tfidf_scrubbed_validation)
df_tfidf_scrubbed_validation.shape

In [None]:
df_tfidf_scrubbed_validation.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the validation feature frame (rows
# are matched on the index).
df_tfidf_scrubbed_validation.insert(loc=0, column='is_positive_sentiment', value=df_y_scrubbed_raw_validation)
df_tfidf_scrubbed_validation.shape

In [None]:
df_tfidf_scrubbed_validation.head(5)

# Test - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# The test split must be projected with the pipeline that was fitted on the
# TRAINING split. Fitting a fresh pipeline here (as before) learns a different
# vocabulary, different SVD components and different scaling, so the test
# columns would not mean the same thing as the training columns.
# The name is kept as an alias of the train-fitted pipeline for any code that
# still refers to it.
pipeline_scrubbed_test = pipeline_scrubbed_train
np_tfidf_scrubbed_test = pipeline_scrubbed_test.transform(df_X_scrubbed_raw_test)
df_tfidf_scrubbed_test = pd.DataFrame(np_tfidf_scrubbed_test)
df_tfidf_scrubbed_test.shape

In [None]:
df_tfidf_scrubbed_test.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the test feature frame (rows are
# matched on the index).
df_tfidf_scrubbed_test.insert(loc=0, column='is_positive_sentiment', value=df_y_scrubbed_raw_test)
df_tfidf_scrubbed_test.shape

In [None]:
df_tfidf_scrubbed_test.head(5)

# Write the datasets locally
_Note: `header=None`_

In [None]:
import os

# Each prefix doubles as the local output directory and (in the upload cell) the S3 key prefix.
# NOTE(review): these prefixes say "header" although the files below are
# written with header=None (the balanced TF/IDF prefixes say "noheader").
# Left unchanged because other code may already read from these locations - confirm.
prefix_scrubbed_train = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-unbalanced-header-train-csv'
prefix_scrubbed_validation = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-unbalanced-header-validation-csv'
prefix_scrubbed_test = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-unbalanced-header-test-csv'

scrubbed_tfidf_without_header_train_path = './{}/part-00000.csv'.format(prefix_scrubbed_train)
scrubbed_tfidf_without_header_validation_path = './{}/part-00000.csv'.format(prefix_scrubbed_validation)
scrubbed_tfidf_without_header_test_path = './{}/part-00000.csv'.format(prefix_scrubbed_test)

# For every split: make sure the directory exists, then write the labelled
# TF/IDF features without a header row and without the index.
for tfidf_frame, split_prefix, split_path in [
    (df_tfidf_scrubbed_train, prefix_scrubbed_train, scrubbed_tfidf_without_header_train_path),
    (df_tfidf_scrubbed_validation, prefix_scrubbed_validation, scrubbed_tfidf_without_header_validation_path),
    (df_tfidf_scrubbed_test, prefix_scrubbed_test, scrubbed_tfidf_without_header_test_path),
]:
    os.makedirs(split_prefix, exist_ok=True)
    tfidf_frame.to_csv(split_path, index=False, header=None)



# Upload to S3

In [None]:
# Upload each header-less TF/IDF split file through the SageMaker session,
# using the same prefix that names the local directory as the S3 key prefix.
# upload_data's return value is kept as the "s3_uri" and passed to `aws s3 ls`
# in the next cell.
df_scrubbed_tfidf_without_header_train_s3_uri = sess.upload_data(path=scrubbed_tfidf_without_header_train_path, key_prefix=prefix_scrubbed_train)
df_scrubbed_tfidf_without_header_validation_s3_uri = sess.upload_data(path=scrubbed_tfidf_without_header_validation_path, key_prefix=prefix_scrubbed_validation)
df_scrubbed_tfidf_without_header_test_s3_uri = sess.upload_data(path=scrubbed_tfidf_without_header_test_path, key_prefix=prefix_scrubbed_test)

# Echo the returned locations so they can be checked or copied.
print(df_scrubbed_tfidf_without_header_train_s3_uri)
print(df_scrubbed_tfidf_without_header_validation_s3_uri)
print(df_scrubbed_tfidf_without_header_test_s3_uri)


In [None]:
!aws s3 ls $df_scrubbed_tfidf_without_header_train_s3_uri
!aws s3 ls $df_scrubbed_tfidf_without_header_validation_s3_uri
!aws s3 ls $df_scrubbed_tfidf_without_header_test_s3_uri

# Balanced Raw

In [None]:
# Features (X), # Labels (y)
# For each balanced split: features (X) are a one-column DataFrame holding the
# review text, labels (y) are the 0/1 sentiment Series.
feature_columns = ['review_body']
label_column = 'is_positive_sentiment'

df_X_balanced_raw_train = df_balanced_raw_train[feature_columns]
df_y_balanced_raw_train = df_balanced_raw_train[label_column]

df_X_balanced_raw_validation = df_balanced_raw_validation[feature_columns]
df_y_balanced_raw_validation = df_balanced_raw_validation[label_column]

df_X_balanced_raw_test = df_balanced_raw_test[feature_columns]
df_y_balanced_raw_test = df_balanced_raw_test[label_column]

# Report the shape of every piece, in train / validation / test order.
for shape_message, split_part in [
    ('df_balanced_X_raw_train.shape:  {}', df_X_balanced_raw_train),
    ('df_balanced_y_raw_train.shape:  {}', df_y_balanced_raw_train),
    ('df_balanced_X_raw_validation.shape:  {}', df_X_balanced_raw_validation),
    ('df_balanced_y_raw_validation.shape:  {}', df_y_balanced_raw_validation),
    ('df_balanced_X_raw_test.shape:  {}', df_X_balanced_raw_test),
    ('df_balanced_y_raw_test.shape:  {}', df_y_balanced_raw_test),
]:
    print(shape_message.format(split_part.shape))

In [None]:
df_X_balanced_raw_train.head(5)

In [None]:
df_y_balanced_raw_train.head(5)

# Train - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# Build the TF/IDF pipeline for the balanced training split, fit it on that
# split, and wrap the resulting feature matrix in a DataFrame.
pipeline_train = feature_transform_fn(
    df_text=df_X_balanced_raw_train,
    column_name='review_body',
    num_components=300,
)
np_tfidf_train = pipeline_train.fit_transform(df_X_balanced_raw_train)
df_tfidf_train = pd.DataFrame(data=np_tfidf_train)
df_tfidf_train.shape

In [None]:
df_tfidf_train.head(5)

In [None]:
df_y_balanced_raw_train.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the balanced training feature frame
# (rows are matched on the index).
df_tfidf_train.insert(loc=0, column='is_positive_sentiment', value=df_y_balanced_raw_train)
df_tfidf_train.shape

In [None]:
df_tfidf_train.head(5)

# Validation - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# The validation split must be projected with the pipeline that was fitted on
# the balanced TRAINING split. Fitting a fresh pipeline here (as before)
# learns a different vocabulary, different SVD components and different
# scaling, so the validation columns would not mean the same thing as the
# training columns.
# The name is kept as an alias of the train-fitted pipeline for any code that
# still refers to it.
pipeline_validation = pipeline_train
np_tfidf_validation = pipeline_validation.transform(df_X_balanced_raw_validation)
df_tfidf_validation = pd.DataFrame(np_tfidf_validation)
df_tfidf_validation.shape

In [None]:
df_tfidf_validation.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the balanced validation feature frame
# (rows are matched on the index).
df_tfidf_validation.insert(loc=0, column='is_positive_sentiment', value=df_y_balanced_raw_validation)
df_tfidf_validation.shape

In [None]:
df_tfidf_validation.head(5)

# Test - Transform just the features (X)
_This will run for a minute or two.  Please be patient._

In [None]:
# The test split must be projected with the pipeline that was fitted on the
# balanced TRAINING split. Fitting a fresh pipeline here (as before) learns a
# different vocabulary, different SVD components and different scaling, so the
# test columns would not mean the same thing as the training columns.
# The name is kept as an alias of the train-fitted pipeline for any code that
# still refers to it.
pipeline_test = pipeline_train
np_tfidf_test = pipeline_test.transform(df_X_balanced_raw_test)
df_tfidf_test = pd.DataFrame(np_tfidf_test)
df_tfidf_test.shape

In [None]:
df_tfidf_test.head(5)

# Add back the label (y) into the first column
The label needs to be in the 1st column for some of our models.

In [None]:
# Put the label into the first column of the balanced test feature frame
# (rows are matched on the index).
df_tfidf_test.insert(loc=0, column='is_positive_sentiment', value=df_y_balanced_raw_test)
df_tfidf_test.shape

In [None]:
df_tfidf_test.head(5)

# Write the datasets locally
_Note: `header=None`_

In [None]:
# $S3_BUCKET/feature-store/csv/amazon-reviews/balanced-tfidf-without-header/train/data.csv
# $S3_BUCKET/feature-store/csv/amazon-reviews/balanced-tfidf-without-header/validation/data.csv
# $S3_BUCKET/feature-store/csv/amazon-reviews/balanced-tfidf-without-header/test/data.csv

prefix_train = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-balanced-noheader-train-csv'
prefix_validation = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-balanced-noheader-validation-csv'
prefix_test = 'feature-store/amazon-reviews-notebook/tfidf-labeled-split-balanced-noheader-test-csv'

balanced_tfidf_without_header_train_path = './{}/part-00000.csv'.format(prefix_train)
balanced_tfidf_without_header_validation_path = './{}/part-00000.csv'.format(prefix_validation)
balanced_tfidf_without_header_test_path = './{}/part-00000.csv'.format(prefix_test)

import os
os.makedirs(prefix_train, exist_ok=True)
os.makedirs(prefix_validation, exist_ok=True)
os.makedirs(prefix_test, exist_ok=True)

df_tfidf_train.to_csv(balanced_tfidf_without_header_train_path, index=False, header=None)
df_tfidf_validation.to_csv(balanced_tfidf_without_header_validation_path, index=False, header=None)
df_tfidf_test.to_csv(balanced_tfidf_without_header_test_path, index=False, header=None)


# Upload to S3

In [None]:
# Upload each header-less balanced TF/IDF split file through the SageMaker
# session, using the same prefix that names the local directory as the S3 key
# prefix. upload_data's return value is kept as the "s3_uri" and passed to
# `aws s3 ls` in the next cell.
df_balanced_tfidf_without_header_train_s3_uri = sess.upload_data(path=balanced_tfidf_without_header_train_path, key_prefix=prefix_train)
df_balanced_tfidf_without_header_validation_s3_uri = sess.upload_data(path=balanced_tfidf_without_header_validation_path, key_prefix=prefix_validation)
df_balanced_tfidf_without_header_test_s3_uri = sess.upload_data(path=balanced_tfidf_without_header_test_path, key_prefix=prefix_test)

# Echo the returned locations so they can be checked or copied.
print(df_balanced_tfidf_without_header_train_s3_uri)
print(df_balanced_tfidf_without_header_validation_s3_uri)
print(df_balanced_tfidf_without_header_test_s3_uri)


In [None]:
!aws s3 ls $df_balanced_tfidf_without_header_train_s3_uri
!aws s3 ls $df_balanced_tfidf_without_header_validation_s3_uri
!aws s3 ls $df_balanced_tfidf_without_header_test_s3_uri