In [1]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [2]:
#Install additional libraries
!pip install nltk
!pip install jsonlines
!pip install pandarallel

Collecting jsonlines
  Downloading jsonlines-2.0.0-py3-none-any.whl (6.3 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-2.0.0
Collecting pandarallel
  Downloading pandarallel-1.5.2.tar.gz (16 kB)
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25ldone
[?25h  Created wheel for pandarallel: filename=pandarallel-1.5.2-py3-none-any.whl size=18384 sha256=76f976dde1ffc8507adf55755d4891e1df44d2b929ec33691ea61e278974fbe9
  Stored in directory: /home/ec2-user/.cache/pip/wheels/2f/2c/3d/02269317e8eb74af3d9d7ed857a6d49acd37fd7b9346f38164
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.5.2


In [3]:
#Import libraries and functions
import re
import pandas as pd
import sagemaker
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from search_utils import helpers

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
#Define common variables

#Creating a sagemaker session
sagemaker_session = sagemaker.Session()

#We'll be using the sagemaker default bucket
#Feel free to change this to another bucket name and make sure it's the same across all four notebooks
bucket_name = sagemaker_session.default_bucket()

# 1. Download data

We're using the Amazon reviews dataset (https://s3.amazonaws.com/amazon-reviews-pds/readme.html) which is provided under the following licence https://amazon-reviews-pds.s3.amazonaws.com/LICENSE.txt

We load 4 datasets from 4 different categories (Electronics, Shoes, Furniture and Toys), we then sample 100k examples from each category, merge the 4 samples and generate a smaller dataset of 100K containing all categories.

In [5]:
!mkdir ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz ../data/

download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz to ../data/amazon_reviews_us_Electronics_v1_00.tsv.gz
download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz to ../data/amazon_reviews_us_Shoes_v1_00.tsv.gz
download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz to ../data/amazon_reviews_us_Furniture_v1_00.tsv.gz
download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz to ../data/amazon_reviews_us_Toys_v1_00.tsv.gz


In [6]:
!gunzip ../data/amazon_reviews_us_Electronics_v1_00.tsv.gz
!gunzip ../data/amazon_reviews_us_Shoes_v1_00.tsv.gz 
!gunzip ../data/amazon_reviews_us_Furniture_v1_00.tsv.gz
!gunzip ../data/amazon_reviews_us_Toys_v1_00.tsv.gz

## 1.1 Electronics dataset

In [7]:
data_electronics = pd.read_csv("../data/amazon_reviews_us_Electronics_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [8]:
print(data_electronics.shape)

(3091024, 15)


In [9]:
sub_set_electronics = data_electronics[:100000]

## 1.2 Shoes dataset

In [10]:
data_shoes = pd.read_csv("../data/amazon_reviews_us_Shoes_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [11]:
print(data_shoes.shape)

(4358820, 15)


In [12]:
sub_set_shoes = data_shoes[:100000]

## 1.3 Furniture dataset

In [13]:
data_furniture = pd.read_csv("../data/amazon_reviews_us_Furniture_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [14]:
print(data_furniture.shape)

(791673, 15)


In [15]:
sub_set_furniture = data_furniture[:100000]

## 1.4 Toys dataset

In [16]:
data_toys = pd.read_csv("../data/amazon_reviews_us_Toys_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [17]:
print(data_toys.shape)

(4859607, 15)


In [18]:
sub_set_toys = data_toys[:100000]

# 2. Merge and process datasets

In [19]:
dataset = pd.concat([sub_set_electronics, sub_set_shoes, sub_set_furniture, sub_set_toys])

In [20]:
dataset=dataset.rename(columns={"product_id":"id"})

In [21]:
dataset.head()

Unnamed: 0,marketplace,customer_id,review_id,id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,41409413,R2MTG1GCZLR2DK,B00428R89M,112201306,yoomall 5M Antenna WIFI RP-SMA Female to Male ...,Electronics,5.0,0.0,0.0,N,Y,Five Stars,As described.,2015-08-31
1,US,49668221,R2HBOEM8LE9928,B000068O48,734576678,"Hosa GPM-103 3.5mm TRS to 1/4"" TRS Adaptor",Electronics,5.0,0.0,0.0,N,Y,It works as advertising.,It works as advertising.,2015-08-31
2,US,12338275,R1P4RW1R9FDPEE,B000GGKOG8,614448099,Channel Master Titan 2 Antenna Preamplifier,Electronics,5.0,1.0,1.0,N,Y,Five Stars,Works pissa,2015-08-31
3,US,38487968,R1EBPM82ENI67M,B000NU4OTA,72265257,LIMTECH Wall charger + USB Hotsync & Charging ...,Electronics,1.0,0.0,0.0,N,Y,One Star,Did not work at all.,2015-08-31
4,US,23732619,R372S58V6D11AT,B00JOQIO6S,308169188,Skullcandy Air Raid Portable Bluetooth Speaker,Electronics,5.0,1.0,1.0,N,Y,Overall pleased with the item,Works well. Bass is somewhat lacking but is pr...,2015-08-31


In [22]:
print("Distribution of categories:")
print(dataset["product_category"].value_counts())

Distribution of categories:
Shoes          100000
Furniture      100000
Toys           100000
Electronics    100000
Name: product_category, dtype: int64


In [26]:
bucket_name="sagemaker-knn-benfelip"
helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name="search_knn_blog/data/raw_data/data.csv", index=False, header=True)
print(bucket_name)

sagemaker-knn-benfelip


# 2. Processing data 

In [28]:
#Changing column names and making sure there is no null values in the text
dataset = dataset.rename(columns={"product_id":"id"})
dataset = dataset[~dataset["product_title"].isnull()]

In [29]:
def clean_data(document):
    
    lemmatizer = WordNetLemmatizer()
    
    tokens = [t.lower() for t in word_tokenize(document)]
    
    clean_tokens = []
    for t in tokens:
        if len(t) >= 3 and re.match("^[a-zA-Z]*$", t):
            clean_tokens.append(t)

    clean_document = " ".join(clean_tokens)
    
    return clean_document

In [30]:
#This will take 1-2mins to run
dataset["processed_title"] = dataset["product_title"].apply(lambda x : clean_data(x))

In [31]:
#Making sure there is no null values after processing..
dataset = dataset[~dataset["processed_title"].isnull()]
dataset = dataset[dataset["processed_title"]!=""]

In [32]:
dataset.head()

Unnamed: 0,marketplace,customer_id,review_id,id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,processed_title
0,US,41409413,R2MTG1GCZLR2DK,B00428R89M,112201306,yoomall 5M Antenna WIFI RP-SMA Female to Male ...,Electronics,5.0,0.0,0.0,N,Y,Five Stars,As described.,2015-08-31,yoomall antenna wifi female male extensionl cable
1,US,49668221,R2HBOEM8LE9928,B000068O48,734576678,"Hosa GPM-103 3.5mm TRS to 1/4"" TRS Adaptor",Electronics,5.0,0.0,0.0,N,Y,It works as advertising.,It works as advertising.,2015-08-31,hosa trs trs adaptor
2,US,12338275,R1P4RW1R9FDPEE,B000GGKOG8,614448099,Channel Master Titan 2 Antenna Preamplifier,Electronics,5.0,1.0,1.0,N,Y,Five Stars,Works pissa,2015-08-31,channel master titan antenna preamplifier
3,US,38487968,R1EBPM82ENI67M,B000NU4OTA,72265257,LIMTECH Wall charger + USB Hotsync & Charging ...,Electronics,1.0,0.0,0.0,N,Y,One Star,Did not work at all.,2015-08-31,limtech wall charger usb hotsync charging dock...
4,US,23732619,R372S58V6D11AT,B00JOQIO6S,308169188,Skullcandy Air Raid Portable Bluetooth Speaker,Electronics,5.0,1.0,1.0,N,Y,Overall pleased with the item,Works well. Bass is somewhat lacking but is pr...,2015-08-31,skullcandy air raid portable bluetooth speaker


In [33]:
print(bucket_name)
helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name="search_knn_blog/data/processed_data/data.csv", index=False, header=True)

sagemaker-knn-benfelip
