In [1]:
import pandas as pd
import re

# Read the data

In [2]:
# Load the raw Amazon export into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Amazon.csv')
# Show the column labels that are available.
data.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

# Remove the columns that are not needed

In [3]:
# Keep only the four text fields and the image URL; every other column is dropped.
data = data.loc[
    :,
    ['product_name', 'about_product', 'review_title', 'review_content', 'img_link'],
]
data.columns

Index(['product_name', 'about_product', 'review_title', 'review_content',
       'img_link'],
      dtype='object')

In [4]:
# Preview the first five rows of the trimmed frame.
data.head(n=5)

Unnamed: 0,product_name,about_product,review_title,review_content,img_link
0,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/I/31+NwZ8gb1...
1,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/W/WEBP_40237...
2,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,150 Mbps Wi-Fi —— Exceptional wireless speed u...,Works on linux for me. Get the model with ante...,I use this to connect an old PC to internet. I...,https://m.media-amazon.com/images/I/31Wb+A3VVd...
3,Duracell Plus AAA Rechargeable Batteries (750 ...,Duracell Rechargeable AAA 750mAh batteries sta...,"Works Good,Perfect replacement cell for trimme...","Works good,Bought it to replace my Phillips QT...",https://m.media-amazon.com/images/I/418YrbHVLC...
4,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...","A comfortable, ambidextrous shape feels good i...","Handy Mouse,Good quality mouse,Good one.,Good,...","Liked this Product,https://m.media-amazon.com/...",https://m.media-amazon.com/images/I/31iFF1Kbkp...


# Remove the text in parentheses in product_name

In [5]:
# Drop every "(...)" group from the product name.
# regex=True is required: without it pandas 2.x treats the pattern as a literal
# string and removes nothing (older versions only emitted a FutureWarning).
data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "", regex=True)

  data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")


In [6]:
def clean_text(text):
    """Strip URLs and disallowed characters from a piece of text.

    Args:
        text: The string to clean. Non-string values (for example the NaN that
            pandas uses for an empty cell, or None) are returned unchanged
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string, or the original object when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Remove URLs (tokens starting with "http" or "www." up to the next whitespace)
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)

    # Remove every character except ASCII letters, digits, spaces, newlines
    # and the punctuation '.', ',', '!'. A raw string avoids the invalid
    # escape-sequence warning that '\.' triggers in a normal literal.
    text = re.sub(r'[^a-zA-Z0-9 \n.,!]', '', text)

    return text

# Remove URLs and all characters other than ASCII letters, digits, spaces, newlines, '.', ',' and '!'

In [7]:
# Run clean_text over each free-text column in turn; img_link is left as-is.
for text_column in ('product_name', 'about_product', 'review_title', 'review_content'):
    data[text_column] = data[text_column].apply(clean_text)

# Download the images

In [8]:
!mkdir image

mkdir: image: File exists


In [9]:
# Peek at the first five image URLs.
data['img_link'].head(n=5)

0    https://m.media-amazon.com/images/I/31+NwZ8gb1...
1    https://m.media-amazon.com/images/W/WEBP_40237...
2    https://m.media-amazon.com/images/I/31Wb+A3VVd...
3    https://m.media-amazon.com/images/I/418YrbHVLC...
4    https://m.media-amazon.com/images/I/31iFF1Kbkp...
Name: img_link, dtype: object

In [10]:
import requests

In [11]:
def download_image(url, file_name, index, timeout=30):
    """Download one image to disk, recording the row as skipped on any failure.

    Args:
        url: Address of the image to fetch.
        file_name: Path of the file the response body is written to.
        index: Row label appended to the module-level ``skipped_index`` list
            when the image cannot be saved.
        timeout: Seconds passed to ``requests.get``. Requests has no default
            timeout, so without it a stalled connection would block forever.

    Returns:
        None. Failures are printed and recorded in ``skipped_index`` (which
        must already exist when this function is called) rather than raised.
    """
    try:
        # The request is made inside the try block so that connection, DNS and
        # timeout errors are handled here instead of aborting the caller's loop.
        # The context manager releases the connection when we are done.
        with requests.get(url, stream=True, timeout=timeout) as r:
            if r.status_code != 200:
                print('Image couldn\'t be retrieved for ' + file_name, url)
                skipped_index.append(index)
                return
            with open(file_name, 'wb') as f:
                # Larger chunks than the 128-byte default used when iterating
                # the response object directly.
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(e)
        # Record the failure so the row is dropped later; previously a row
        # whose download raised kept its place in the data without an image.
        # A partially written file may remain on disk in this case.
        skipped_index.append(index)



In [12]:
from PIL import Image

# Row labels whose image could not be downloaded; filled in by download_image.
skipped_index = []
for i, img_url in data['img_link'].items():
    image_path = 'image/' + str(i) + '.jpg'
    try:
        # If the image already exists and Pillow can open it, skip the download.
        # The with-block closes the file handle that Image.open keeps open.
        with Image.open(image_path):
            pass
    except OSError:
        # Missing file (FileNotFoundError) or unreadable image
        # (PIL.UnidentifiedImageError) -- both are OSError subclasses.
        # Unlike a bare except, this lets KeyboardInterrupt stop the loop.
        download_image(img_url, image_path, i)

Image couldn't be retrieved for image/10.jpg https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/313jBpnrJVL._SX300_SY300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/12.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31c6zDmtEnL._SY300_SX300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/13.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31l0oxTSJuL._SX300_SY300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/15.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/11ICusapw3L._SY300_SX300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/17.jpg https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/31e6ElWRymL._SX300_SY300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/18.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/41sKyiPWzAL._SX300_SY300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/19.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31M+TYWPdQL._

In [13]:
# Drop the rows that don't have images.
# errors='ignore' keeps the cell re-runnable: labels that were already dropped
# by an earlier execution no longer raise KeyError.
data = data.drop(skipped_index, errors='ignore')

# Save the data

In [14]:
# Write the cleaned frame to disk; the row labels go into a column named "index".
data.to_csv(
    path_or_buf='Amazon_cleaned.csv',
    index_label='index',
)

In [15]:
from PIL import Image

# Check that every remaining row has an image file Pillow can open, and
# print the row label of each one that is missing or unreadable.
for i in data.index:
    try:
        # The with-block closes the file handle that Image.open keeps open.
        with Image.open('image/' + str(i) + '.jpg'):
            pass
    except OSError:
        # FileNotFoundError and PIL.UnidentifiedImageError are OSError subclasses;
        # catching only these lets KeyboardInterrupt stop the loop.
        print(i)

In [16]:
!zip -r image.zip image

  adding: image/ (stored 0%)
  adding: image/1409.jpg (deflated 1%)
  adding: image/77.jpg (stored 0%)
  adding: image/1435.jpg (stored 0%)
  adding: image/1347.jpg (stored 0%)
  adding: image/1390.jpg (deflated 2%)
  adding: image/162.jpg (stored 0%)
  adding: image/176.jpg (deflated 3%)
  adding: image/88.jpg (deflated 0%)
  adding: image/610.jpg (stored 0%)
  adding: image/1384.jpg (stored 0%)
  adding: image/1179.jpg (stored 0%)
  adding: image/1151.jpg (deflated 2%)
  adding: image/348.jpg (stored 0%)
  adding: image/360.jpg (stored 0%)
  adding: image/1186.jpg (deflated 3%)
  adding: image/412.jpg (deflated 12%)
  adding: image/374.jpg (stored 0%)
  adding: image/1019.jpg (stored 0%)
  adding: image/599.jpg (stored 0%)
  adding: image/1025.jpg (stored 0%)
  adding: image/1031.jpg (deflated 5%)
  adding: image/228.jpg (stored 0%)
  adding: image/214.jpg (stored 0%)
  adding: image/572.jpg (stored 0%)
  adding: image/957.jpg (deflated 5%)
  adding: image/9

In [22]:
# List the S3 buckets visible to the named AWS CLI profile.
!aws s3 ls --profile iamadmin-general

2022-02-12 05:33:10 cf-templates-7x4o2u7ssxah-us-east-1
2023-11-16 13:07:55 comp576-image-data


In [25]:
# Upload image.zip to the comp576-image-data bucket with a public-read ACL.
# NOTE(review): public-read makes the object readable by anyone -- confirm the
# images may be published. The profile name suggests admin credentials; verify
# whether a narrower profile would do for this upload.
!aws s3 cp image.zip s3://comp576-image-data/ --acl public-read --profile iamadmin-general

upload: ./image.zip to s3://comp576-image-data/image.zip          


In [26]:
# List the contents of the bucket to confirm the upload.
!aws s3 ls s3://comp576-image-data/ --profile iamadmin-general

2023-11-16 13:26:41    7367087 image.zip


# Object url: https://comp576-image-data.s3.us-east-2.amazonaws.com/image.zip