In [65]:
import pandas as pd
import re

# Read the data

In [81]:
data = pd.read_csv('Amazon.csv')
data.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

# Remove the columns that are not needed

In [83]:
data = data[['product_name', 'about_product', 'review_title', 'review_content', 'img_link']]
data.columns

Index(['product_name', 'about_product', 'review_title', 'review_content',
       'img_link'],
      dtype='object')

In [84]:
data.head()

Unnamed: 0,product_name,about_product,review_title,review_content,img_link
0,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/I/31+NwZ8gb1...
1,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/W/WEBP_40237...
2,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,150 Mbps Wi-Fi —— Exceptional wireless speed u...,Works on linux for me. Get the model with ante...,I use this to connect an old PC to internet. I...,https://m.media-amazon.com/images/I/31Wb+A3VVd...
3,Duracell Plus AAA Rechargeable Batteries (750 ...,Duracell Rechargeable AAA 750mAh batteries sta...,"Works Good,Perfect replacement cell for trimme...","Works good,Bought it to replace my Phillips QT...",https://m.media-amazon.com/images/I/418YrbHVLC...
4,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...","A comfortable, ambidextrous shape feels good i...","Handy Mouse,Good quality mouse,Good one.,Good,...","Liked this Product,https://m.media-amazon.com/...",https://m.media-amazon.com/images/I/31iFF1Kbkp...


# Remove the text in brackets in product_name

In [86]:
data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")

  data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")


In [87]:
from cleantext import clean
def clean_text(text):

    # Remove non-English words but keep '.', ',', '!'
    text = re.sub('[^a-zA-Z0-9 \n.,!-]', '', text)

    # Remove comma if nothing follows
    text = ','.join([word for word in text.split(',') if word.strip()])

    # Add space to number and unit, ex: 150ft -> 150 ft
    text = re.sub(r'(\d+)([A-Za-z]+)', r'\1 \2', text)

    # Remove quote 
    text = text.replace('"', '').replace("'", '')

    text = clean(text,
              fix_unicode=True,               # fix various unicode errors
              to_ascii=True,                  # transliterate to closest ASCII representation
              lower=True,                     # lowercase text
              no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
              no_urls=True,                   # replace all URLs with a special token
              no_emails=False,                # replace all email addresses with a special token
              no_phone_numbers=False,         # replace all phone numbers with a special token
              no_numbers=True,               # replace all numbers with a special token
              no_digits=False,                # replace all digits with a special token
              no_currency_symbols=False,      # replace all currency symbols with a special token
              no_punct=False,                 # remove punctuations
              no_emoji=True,                  # remove emojis
              replace_with_punct="",          # instead of removing punctuations you may replace them
              replace_with_url="",
              replace_with_email="",
              replace_with_phone_number="",
              replace_with_number="<NUMBER>",
              replace_with_currency_symbol="",
              lang="en"                       # set to 'de' for German special handling
              )
    return text

# Remove URLs and non-English words

In [88]:
data['product_name'] = data['product_name'].apply(clean_text)
data['about_product'] = data['about_product'].apply(clean_text)
data['review_title'] = data['review_title'].apply(clean_text)
data['review_content'] = data['review_content'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['product_name'] = data['product_name'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['about_product'] = data['about_product'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review_title'] = data['review_title'].apply(clean_text)
A value is trying

# Download the images

In [71]:
!mkdir image

In [89]:
data['img_link'].head()

0    https://m.media-amazon.com/images/I/31+NwZ8gb1...
1    https://m.media-amazon.com/images/W/WEBP_40237...
2    https://m.media-amazon.com/images/I/31Wb+A3VVd...
3    https://m.media-amazon.com/images/I/418YrbHVLC...
4    https://m.media-amazon.com/images/I/31iFF1Kbkp...
Name: img_link, dtype: object

In [90]:
import requests

In [94]:
def download_image(url, file_name, index):
    r = requests.get(url, stream=True)
    try:
        if r.status_code == 200:
            with open(file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
        else:
            print('Image couldn\'t be retrieved for ' + file_name, url)
            skipped_index.add(index)
    except Exception as e:
        print(e)



In [104]:
from PIL import Image
skipped_index = set()
for i, row in data.iterrows():
    # if image exists, skip
    try:
        img = Image.open('image/' + str(i) + '.jpg')
        continue
    except:
        pass
        skipped_index.add(i)
        download_image(row['img_link'], 'image/' + str(i) + '.jpg', i)

In [105]:
# Drop the rows that don't have images
data = data.drop(skipped_index)

# Save the data

In [106]:
data.to_csv('Amazon_cleaned.csv', index_label='index')

In [45]:
from PIL import Image

# check if every image is downloaded

for i, row in data.iterrows():
    try:
        img = Image.open('image/' + str(i) + '.jpg')
    except:
        print(i)

In [16]:
!zip -r image.zip image

updating: image/ (stored 0%)
updating: image/1409.jpg (deflated 1%)
updating: image/77.jpg (stored 0%)
updating: image/1435.jpg (stored 0%)
updating: image/1347.jpg (stored 0%)
updating: image/1390.jpg (deflated 2%)
updating: image/162.jpg (stored 0%)
updating: image/176.jpg (deflated 3%)
updating: image/88.jpg (deflated 0%)
updating: image/610.jpg (stored 0%)
updating: image/1384.jpg (stored 0%)
updating: image/1179.jpg (stored 0%)
updating: image/1151.jpg (deflated 2%)
updating: image/348.jpg (stored 0%)
updating: image/360.jpg (stored 0%)
updating: image/1186.jpg (deflated 3%)
updating: image/412.jpg (deflated 12%)
updating: image/374.jpg (stored 0%)
updating: image/1019.jpg (stored 0%)
updating: image/599.jpg (stored 0%)
updating: image/1025.jpg (stored 0%)
updating: image/1031.jpg (deflated 5%)
updating: image/228.jpg (stored 0%)
updating: image/214.jpg (stored 0%)
updating: image/572.jpg (stored 0%)
updating: image/957.jpg (deflated 5%)
updating: image/9

In [17]:
!aws s3 ls --profile iamadmin-general

2022-02-12 05:33:10 cf-templates-7x4o2u7ssxah-us-east-1
2023-11-16 13:07:55 comp576-image-data


In [18]:
!aws s3 cp image.zip s3://comp576-image-data/ --acl public-read --profile iamadmin-general

upload: ./image.zip to s3://comp576-image-data/image.zip          


In [19]:
!aws s3 ls s3://comp576-image-data/ --profile iamadmin-general

2023-11-20 18:18:00    7369597 image.zip


# Object url: https://comp576-image-data.s3.us-east-2.amazonaws.com/image.zip