# Amazon scraper
- Based on this: https://github.com/vijeshs/Web-Scraping-/blob/master/Web%20Scraping-JBL%20speaker.ipynb

In [9]:
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

import pandas as pd
import requests
import pickle

In [10]:
header={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}



## This saves all the product name and the asin no
- The Amazon Standard Identification Number (asin no) is the number used for specific product search
- Given a search query, we will loop through multiple pages of amazon and get the coressponding products
- Note that we are using amazon.in, not amazon.com because amazon.com parsing may fail


In [11]:
from collections import OrderedDict

In [12]:
def get_product_asins(url_base, search_query):
    """
    ## Function to scrape product name and asin no (as the file path is similar)
    """
    url="%s%s" %(url_base, search_query)
    print(url)
    
    # Get the data
    page=requests.get(url,headers=header)  
    if page.status_code==200:
        return page                                #returns the page if there is no error
    else:
        return "Error"
    
sites = {
    'amazon.com':{
        'url_base': "https://www.amazon.com/s?k=",
        'asin_div': {'data-asin':True},
        'product_name_span': {'class':'a-size-base-plus a-color-base a-text-normal'}
    },
    'amazon.in':{
        'url_base': "https://www.amazon.in/s?k=",
        #'asin_div': {'class':['sg-col-4-of-24 sg-col-4-of-12 sg-col-4-of-36 s-result-item s-asin sg-col-4-of-28 sg-col-4-of-16 AdHolder sg-col sg-col-4-of-20 sg-col-4-of-32']},
        'asin_div': {'data-asin':True},
        'product_name_span': {'class':'a-size-base-plus a-color-base a-text-normal'}
    },
}    


# -------------------- custom parameters
site = sites['amazon.com']
npages = 10
query = 'oily+skin+lotion'
skin_type = 'dry'


# ------------------------- start
products = OrderedDict()
for i in tqdm(range(1, npages+1)):
    try:
        # Get the respone and create bf4
        response=get_product_asins(site['url_base'], '%s&page=%s' %(query,str(i)))     #iterates through multiple pages of the search products
        soup=BeautifulSoup(response.content, 'lxml') # somehow need to use lxml parser to make amazon.com work.
                                                     # pip install lxml
                                                     # html.parser won't work

        # parse the asins (each product will have a unique asin)
        for p in soup.findAll('div', attrs=site['asin_div']):
            asin = p['data-asin']

            # parse the product name. If there is no product name
            pn = p.find('span', attrs=site['product_name_span'])
            if pn:
                pn = pn.text
                products[asin] = pn
    except: 
        print('Fail to parse this page:', i)


            
# print and check
print('Total number of products identified for %s pages:%s' %(npages, len(products)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(1, npages+1)):


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

https://www.amazon.com/s?k=oily+skin+lotion&page=1
https://www.amazon.com/s?k=oily+skin+lotion&page=2
https://www.amazon.com/s?k=oily+skin+lotion&page=3
https://www.amazon.com/s?k=oily+skin+lotion&page=4
https://www.amazon.com/s?k=oily+skin+lotion&page=5
https://www.amazon.com/s?k=oily+skin+lotion&page=6
https://www.amazon.com/s?k=oily+skin+lotion&page=7
https://www.amazon.com/s?k=oily+skin+lotion&page=8
https://www.amazon.com/s?k=oily+skin+lotion&page=9
https://www.amazon.com/s?k=oily+skin+lotion&page=10

Total number of products identified for 10 pages:0


In [11]:
# with open('a.txt', 'wb') as f:
#     f.write(response.content)


### save
save the information in case something break and I lost all the webscraping information

In [29]:
# save
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
fn = '%s_product_asins' %dt
with open('../data/%s.pkl' %fn, 'wb') as f:
    pickle.dump({'asin':products}, f)
    

In [77]:
# # hack the asin_no
# Asin_no=['B00EH99VY6',
#  'B07STDDDGF',
#  'B07LCQW2RC',]

## Identify each product (ASIN) review link
- One example of review link is (The overaall review link): 
https://www.amazon.in//Lacto-Calamine-Daily-Lotion-Balance/product-reviews/B00EH99VY6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews&pageNumber=1

### Load the saved ASINs

In [54]:
# check the data/ folder and identify the saved file
fn = '10-06-2020_22-51-32_product_asins'
with open('../data/%s.pkl' %fn, 'rb') as f:
    data = pickle.load(f)
    
# Get all the ASINS
asins = list(data['asin'].keys())
asins = [x for x in asins if x]
print(asins)

['B00352MHE2', 'B00BPF3LN2', 'B00AYSUKTE', 'B000XY9LT4', 'B001459IEE', 'B0034F9K36', 'B004D2DPNK', 'B00TTD9BRC', 'B00NR1YQK4', 'B000YJ2SLG', 'B005EZSUJY', 'B002HR3AGU', 'B0065I0UMO', 'B003BMJGKE', 'B001E6OCO4', 'B075G3RJDZ', 'B00DG8F04Y', 'B07L9RTWRS', 'B00DG8EZKO', 'B07RTPLBD2', 'B01M047N2Q', 'B0067H6G26', 'B01MCTTDJS', 'B000NKL3D0', 'B01KJGUUHM', 'B07YLJ2X55', 'B000UBN5S8', 'B018233T04', 'B0046OFO3Y', 'B00119XXPI', 'B0013OJUY4', 'B0063I3M5I', 'B00FAUS1HY', 'B07GC88JBC', 'B07GVPCGJK', 'B001G7PLZ0', 'B00DEX61A8', 'B00DG8F02Q', 'B008B9L6WS', 'B07SJVNJRL', 'B0009F3O8Q', 'B07QYWKLKP', 'B01GPWPJOS', 'B001PBOKRM', 'B004XZHR4K', 'B01B12KCGC', 'B0107QOYIK', 'B00143UYJE', 'B07SS6BN5X', 'B074MFDJRG', 'B083JM36DX', 'B07H14726J', 'B07F2GTD63', 'B00171ETM6', 'B008QXDD9I', 'B00NAF79MO', 'B00HNSSHQ6', 'B000NWGCZ2', 'B005C2NB3O', 'B07FMPPP5B', 'B00008MNZH', 'B075RH96ZL', 'B00NFR14PK', 'B00WB8843Y', 'B01HWIEGAW', 'B0859SPVR6', 'B00U97TO6U', 'B01K4Q1R2E', 'B07DPDNWKJ', 'B07NJPSHMD', 'B073HDJL96', 'B07P

### Identify the review link 

In [56]:
def get_review_url(base_url, query):
    """
    Function to scrape link of the All customer reviews to acess all the reviews
    """
    # query
    url="%s%s" %(base_url, query)
    print(url)
    page=requests.get(url, headers=header)
    
    # check status
    if page.status_code==200:
        return page                           
    else:
        return "Error"

sites = {
    'amazon.in':{
        'url_base': "https://www.amazon.in/dp/",
        'tag_a': {'data-hook':'see-all-reviews-link-foot'}
    },
    'amazon.com':{
        'url_base': "https://www.amazon.com/dp/",
        'tag_a': {'data-hook':'see-all-reviews-link-foot'}
    },
}  

# ------------------------- custom parameters
site = sites['amazon.com']
nproducts = 50 # THis can adjust based on how many products you want to check
                    # FYI: this cell take a lot of time to run, you may want to
                    # start small, e.g., nproducts = 5, and see if everything work,
                    # then go to all the available ASINS
                    # len(asins)


# ------------------------- start
review_links = {}
for i in tqdm(range(0, nproducts)):
    try:
        r=get_review_url(site['url_base'], asins[i])

        # bf4
        soup=BeautifulSoup(r.content, 'lxml')
        tag = soup.find('a',attrs=site['tag_a'])
        if tag:
            review_links[asins[i]] = tag['href']
    except:
        print('Failed to get the url for this product:', i)

# check
print('Number of review links:', len(review_links))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

https://www.amazon.com/dp/B00352MHE2
https://www.amazon.com/dp/B00BPF3LN2
https://www.amazon.com/dp/B00AYSUKTE
https://www.amazon.com/dp/B000XY9LT4
https://www.amazon.com/dp/B001459IEE
https://www.amazon.com/dp/B0034F9K36
https://www.amazon.com/dp/B004D2DPNK
https://www.amazon.com/dp/B00TTD9BRC
https://www.amazon.com/dp/B00NR1YQK4
https://www.amazon.com/dp/B000YJ2SLG
https://www.amazon.com/dp/B005EZSUJY
https://www.amazon.com/dp/B002HR3AGU
https://www.amazon.com/dp/B0065I0UMO
https://www.amazon.com/dp/B003BMJGKE
https://www.amazon.com/dp/B001E6OCO4
https://www.amazon.com/dp/B075G3RJDZ
https://www.amazon.com/dp/B00DG8F04Y
https://www.amazon.com/dp/B07L9RTWRS
https://www.amazon.com/dp/B00DG8EZKO
https://www.amazon.com/dp/B07RTPLBD2
https://www.amazon.com/dp/B01M047N2Q
https://www.amazon.com/dp/B0067H6G26
https://www.amazon.com/dp/B01MCTTDJS
https://www.amazon.com/dp/B000NKL3D0
https://www.amazon.com/dp/B01KJGUUHM
https://www.amazon.com/dp/B07YLJ2X55
https://www.amazon.com/dp/B000UBN5S8
h

In [12]:
# with open('a.txt', 'wb') as f:
#     f.write(r.content)


### save
save the information in case something break and I lost all the webscraping information

In [59]:
# save
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
fn = '%s_product_review_link' %dt
with open('../data/%s.pkl' %fn, 'wb') as f:
    pickle.dump({'review_link':review_links}, f)
    

## For each product, scrape the review, rating, and images
- install tqdm
- install https://ipywidgets.readthedocs.io/en/stable/user_install.html
- Restart your notebook after installation

### Load the review link of every product

In [61]:
fn = '10-06-2020_23-43-11_product_review_link'
with open('../data/%s.pkl' %fn, 'rb') as f:
    data = pickle.load(f)
    
# Get all the ASINS
links = data['review_link']
links

{'B00352MHE2': '/Cetaphil-Advance-Hydrating-Lotion-Sensitive/product-reviews/B00352MHE2/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B00BPF3LN2': '/Eucerin-Original-Healing-Lotion-packaging/product-reviews/B00BPF3LN2/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B00AYSUKTE': '/Eucerin-Hydration-Spectrum-Lotion-Packaging/product-reviews/B00AYSUKTE/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B000XY9LT4': '/Eucerin-Intensive-Repair-Lotion-Bottle/product-reviews/B000XY9LT4/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B001459IEE': '/Aveeno-Moisturizing-Soothing-Emollients-Fragrance-Free/product-reviews/B001459IEE/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B0034F9K36': '/Jergens-Ultra-Healing-Moisturizer-Ounces/product-reviews/B0034F9K36/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
 'B004D2DPNK': '/Lubriderm-Enriched-Non-Greasy-Moisturizer-Fragrance/product-reviews/B004D2DP

In [None]:
# # hack the link
# # Find a particlar review page link and put it here
# links=['/Lacto-Calamine-Daily-Lotion-Balance/product-reviews/B00EH99VY6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews']

### Extract all the reivew and rating for each products

In [43]:
def get_review_img(url_base, query):
    url="%s%s" %(url_base, query)
    page=requests.get(url,headers=header)
    if page.status_code==200:
        return page, url
    else:
        return "Error" 

sites = {
    'amazon.in':{
        'url_base': "https://www.amazon.in/",
    },
    'amazon.com':{
        'url_base': "https://www.amazon.com/",
    },
}
    
    
# ------------------------- custom parameters
npages = 10 # number of page of reviews to visit
site = sites['amazon.com']


# ------------------------- Start
# lists to store different information
asins = []
products = [] # product names
reviews = [] # reviews of the products
ratings = [] # rating of a product of a review
img_links = [] # all the image links related to a review

# for k in range(len(links)): 
for k, asin in tqdm(enumerate(links.keys())):
    for i in range(1, npages):   
        try:
            # Construct the bf4
            response, url=get_review_img(site['url_base'], links[asin]+'&pageNumber='+str(i))
            soup=BeautifulSoup(response.content, 'lxml')
            
            # check
            if i==1:
                print(url)

            # Get the product name
            pn = soup.find("a", attrs={'data-hook':'product-link'}).text

            # Get each review for this particular product
            for review in soup.findAll("div", attrs={'data-hook':'review'}):
                # identify if a review contains any image. If so, then we will
                # save the image(s) and assign ids to images
                imgs = review.findAll("img", attrs={'class':'review-image-tile'})
                if len(imgs) > 0:
                    # get the review text
                    text = review.find("span", attrs={'data-hook':'review-body'}).text.replace('\n',"")

                    # get the rating
                    rating = review.find("i", attrs={'data-hook':"review-star-rating"}).text

                    # get the image link (I don't save the images for now since it will take time).
                    # As long as we get the image links, we can write another function to read the csv
                    # and save the images somewhere
                    ilinks = [img['src'].replace("._SY88", "") for img in imgs]
                    ilinks = ",".join(ilinks)

                    # append
                    asins.append(asin)
                    products.append(pn)
                    reviews.append(text)
                    ratings.append(rating)
                    img_links.append(ilinks)
        except:
            print("Failed to get the review and images for this ASIN (%s) in this page (%s)" %(asin, i))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

https://www.amazon.com//Cetaphil-Advance-Hydrating-Lotion-Sensitive/product-reviews/B00352MHE2/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=1
https://www.amazon.com//Eucerin-Original-Healing-Lotion-packaging/product-reviews/B00BPF3LN2/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=1
https://www.amazon.com//Eucerin-Hydration-Spectrum-Lotion-Packaging/product-reviews/B00AYSUKTE/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=1
https://www.amazon.com//Eucerin-Intensive-Repair-Lotion-Bottle/product-reviews/B000XY9LT4/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=1
https://www.amazon.com//Aveeno-Moisturizing-Soothing-Emollients-Fragrance-Free/product-reviews/B001459IEE/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=1



### save the information into pandas and then into csv
- you should see a new file created in the data/ folder

In [44]:
from datetime import datetime

In [57]:
# Constructe the data frame
df = pd.DataFrame({'pid': list(range(0, len(asins))),
                    'ASIN': asins,
                    'product_name':products,
                   'review': reviews,
                   'rating': ratings,
                   'img_link': img_links
                  })
df['skin_type'] = skin_type
df.head()

ValueError: arrays must all be same length

In [40]:
# save
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
df.to_csv('../data/%s_amazon_review_with_image.csv' %dt)


In [41]:
len(img_links)

13