# Amazon scraper
- Reference: https://github.com/vijeshs/Web-Scraping-/blob/master/Web%20Scraping-JBL%20speaker.ipynb

In [3]:
# Use the %pip magic so the package is installed into the running kernel's environment,
# and pin the version that this notebook was originally run with.
# NOTE(review): fake_useragent is not imported by any cell visible here; confirm it is still needed.
%pip install fake_useragent==0.1.11

Collecting fake_useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Building wheels for collected packages: fake-useragent
  Building wheel for fake-useragent (setup.py) ... [?25ldone
[?25h  Created wheel for fake-useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13487 sha256=e270201b5a34ac2a1d12165f1a68cf1817597aeb9d30cdd4240c689473fa1405
  Stored in directory: /Users/ChristieFung/Library/Caches/pip/wheels/a0/b8/b7/8c942b2c5be5158b874a88195116b05ad124bac795f6665e65
Successfully built fake-useragent
Installing collected packages: fake-useragent
Successfully installed fake-useragent-0.1.11
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

import pandas as pd
import requests
import pickle

In [2]:
# Request headers passed to every requests.get call in this notebook:
# a single desktop-Chrome User-Agent string.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    ),
}

## Save all the product names and ASINs
- The Amazon Standard Identification Number (ASIN) is the identifier used to look up a specific product
- Given a search query, loop through multiple pages of Amazon search results and collect the corresponding products
- Using amazon.in rather than amazon.com, because parsing amazon.com pages may fail


In [3]:
from collections import OrderedDict

In [7]:
def get_product_asins(url_base, search_query, timeout=30):
    """
    Download one search-result page.

    Despite the name, this function only fetches the page; extracting the
    product names and ASINs from the returned HTML is left to the caller.

    Parameters
    ----------
    url_base : str
        URL prefix, e.g. "https://www.amazon.in/s?k=".
    search_query : str
        Text appended verbatim to ``url_base`` (no URL-encoding is applied here).
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Defaults to 30.

    Returns
    -------
    requests.Response or str
        The response object when the status code is 200, otherwise the
        string "Error".

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    url = "%s%s" % (url_base, search_query)
    print(url)

    # Get the data. A timeout is set because requests otherwise waits
    # indefinitely on a stalled connection, which would freeze the scraping loop.
    page = requests.get(url, headers=header, timeout=timeout)
    if page.status_code == 200:
        return page                                # returns the page if there is no error
    return "Error"
    
# Per-site settings:
#   url_base          - search URL prefix the query is appended to
#   asin_div          - attribute filter for result <div>s (any div that has a data-asin attribute)
#   product_name_span - attribute filter for the <span> holding the product name
sites = {
    'amazon.com':{
        'url_base': "https://www.amazon.com/s?k=",
        'asin_div': {'data-asin':True},
        'product_name_span': {'class':'a-size-base-plus a-color-base a-text-normal'}
    },
    'amazon.in':{
        'url_base': "https://www.amazon.in/s?k=",
        #'asin_div': {'class':['sg-col-4-of-24 sg-col-4-of-12 sg-col-4-of-36 s-result-item s-asin sg-col-4-of-28 sg-col-4-of-16 AdHolder sg-col sg-col-4-of-20 sg-col-4-of-32']},
        'asin_div': {'data-asin':True},
        'product_name_span': {'class':'a-size-base-plus a-color-base a-text-normal'}
    },
}


# -------------------- custom parameters
site = sites['amazon.in']
npages = 15
query = 'normal+skin+lotion'


# ------------------------- start
products = OrderedDict()   # asin -> product name, in order of first appearance
for i in tqdm(range(1, npages+1)):
    try:
        # Fetch one page of the search results
        response = get_product_asins(site['url_base'], '%s&page=%s' % (query, i))
        if isinstance(response, str):
            # get_product_asins returns the string "Error" for any non-200 status;
            # report it instead of failing later on response.content
            print('Request did not return status 200 for page:', i)
            continue

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed
        soup = BeautifulSoup(response.content, 'html.parser')

        # parse the asins (each product will have a unique asin)
        for p in soup.find_all('div', attrs=site['asin_div']):
            asin = p['data-asin']
            if not asin:
                # the filter also matches divs whose data-asin is empty; those are not products
                continue

            # parse the product name; entries without a product name are skipped
            pn = p.find('span', attrs=site['product_name_span'])
            if pn:
                products[asin] = pn.text
    except Exception as exc:
        # Keep going with the next page, but show what actually went wrong
        print('Failed to parse in this page:', i, '-', repr(exc))

# print and check
print('Total number of products identified for %s pages:%s' %(npages, len(products)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(1, npages+1)):


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

https://www.amazon.in/s?k=normal+skin+lotion&page=1
https://www.amazon.in/s?k=normal+skin+lotion&page=2
https://www.amazon.in/s?k=normal+skin+lotion&page=3
https://www.amazon.in/s?k=normal+skin+lotion&page=4
https://www.amazon.in/s?k=normal+skin+lotion&page=5
https://www.amazon.in/s?k=normal+skin+lotion&page=6
https://www.amazon.in/s?k=normal+skin+lotion&page=7
https://www.amazon.in/s?k=normal+skin+lotion&page=8
https://www.amazon.in/s?k=normal+skin+lotion&page=9
https://www.amazon.in/s?k=normal+skin+lotion&page=10
https://www.amazon.in/s?k=normal+skin+lotion&page=11
https://www.amazon.in/s?k=normal+skin+lotion&page=12
https://www.amazon.in/s?k=normal+skin+lotion&page=13
https://www.amazon.in/s?k=normal+skin+lotion&page=14
https://www.amazon.in/s?k=normal+skin+lotion&page=15

Total number of products identified for 15 pages:0


### Save
- Save the information so that the web-scraping results are not lost if something breaks later

In [None]:
# Persist the scraped products to a timestamped pickle in ../data/
# (the pickle holds a one-key dictionary: {'asin': products})
payload = {'asin': products}
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
fn = '%s_product_asins' % dt
with open('../data/%s.pkl' % fn, 'wb') as pkl_file:
    pickle.dump(payload, pkl_file)

In [None]:
# # hack the asin_no
# Asin_no=['B00EH99VY6',
#  'B07STDDDGF',
#  'B07LCQW2RC',]

## Identify each product (ASIN) review link
- One example of a review link (the overall review link) is:
https://www.amazon.in//Lacto-Calamine-Daily-Lotion-Balance/product-reviews/B00EH99VY6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews&pageNumber=1

### Load the saved ASINs

In [None]:
# Check the data/ folder and pick the saved file to load. Earlier runs:
#   03-06-2020_23-45-41_product_asins      (oily)
#   04-06-2020_01-01-51_product_asins.pkl  (dry)
fn = '04-06-2020_02-06-03_product_asins'
pkl_path = '../data/%s.pkl' % fn
# NOTE: only unpickle files produced by this notebook; pickle.load can execute arbitrary code.
with open(pkl_path, 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Keep only the ASINs, i.e. the keys of the mapping stored under 'asin'
asins = list(data['asin'])
print(asins)


### Identify the review link 

In [None]:
def get_review_url(base_url, query, timeout=30):
    """
    Download one page, used by the caller to look up the
    "all customer reviews" link.

    This function only fetches the page; locating the review link in the
    returned HTML is done by the caller.

    Parameters
    ----------
    base_url : str
        URL prefix, e.g. "https://www.amazon.in/dp/".
    query : str
        Text appended verbatim to ``base_url`` (no URL-encoding is applied here).
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Defaults to 30.

    Returns
    -------
    requests.Response or str
        The response object when the status code is 200, otherwise the
        string "Error".

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    # query
    url = "%s%s" % (base_url, query)
    print(url)
    # A timeout is set because requests otherwise waits indefinitely on a
    # stalled connection, which would freeze the scraping loop.
    page = requests.get(url, headers=header, timeout=timeout)

    # check status
    if page.status_code == 200:
        return page
    return "Error"

# Per-site settings:
#   url_base - product-page URL prefix; the ASIN is appended to it
#   tag_a    - attribute filter for the <a> tag whose href is stored as the review link
sites = {
    'amazon.in':{
        'url_base': "https://www.amazon.in/dp/",
        'tag_a': {'data-hook':'see-all-reviews-link-foot'}
    },
}

# ------------------------- custom parameters
site = sites['amazon.in']
nproducts = len(asins) # This can be adjusted based on how many products you want to check.
                    # FYI: this cell takes a long time to run, so you may want to
                    # start small, e.g., nproducts = 5, and see if everything works,
                    # then go to all the available ASINs:
                    # len(asins)


# ------------------------- start
review_links = {}   # asin -> href of the matched review link
# Slicing (rather than indexing up to nproducts) stays in range even if
# nproducts is set larger than the number of available ASINs.
for asin in asins[:nproducts]:
    try:
        r = get_review_url(site['url_base'], asin)
        if isinstance(r, str):
            # get_review_url returns the string "Error" for any non-200 status;
            # report it instead of failing later on r.content
            print('Request did not return status 200 for this product:', asin)
            continue

        # bs4: name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed
        soup = BeautifulSoup(r.content, 'html.parser')
        tag = soup.find('a', attrs=site['tag_a'])
        if tag:
            review_links[asin] = tag['href']
    except Exception as exc:
        # Keep going with the next product, but show what actually went wrong
        print('Failed to get the url for this product:', asin, '-', repr(exc))

# check
print('Number of review links:', len(review_links))

### Save
Save the information so that the web-scraping results are not lost if something breaks later

In [None]:
# Persist the collected review links to a timestamped pickle in ../data/
# (the pickle holds a one-key dictionary: {'review_link': review_links})
payload = {'review_link': review_links}
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
fn = '%s_product_review_link' % dt
with open('../data/%s.pkl' % fn, 'wb') as pkl_file:
    pickle.dump(payload, pkl_file)
    

## For each product, scrape the review, rating, and images
- install tqdm
- install https://ipywidgets.readthedocs.io/en/stable/user_install.html
- Restart your notebook after installation

### Load the review link of every product

In [None]:
# Saved review-link file to load. Earlier run:
#   04-06-2020_00-03-58_product_review_link  (oily)
fn = '04-06-2020_01-18-20_product_review_link'
pkl_path = '../data/%s.pkl' % fn
# NOTE: only unpickle files produced by this notebook; pickle.load can execute arbitrary code.
with open(pkl_path, 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Get the review links stored under the 'review_link' key and display them
links = data['review_link']
links


In [None]:
# # hack the link
# # Find a particlar review page link and put it here
# links=['/Lacto-Calamine-Daily-Lotion-Balance/product-reviews/B00EH99VY6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews']

### Extract all the reviews and ratings for each product

In [None]:
def get_review_img(url_base, query, timeout=30):
    """
    Download one review page.

    This function only fetches the page; extracting reviews, ratings and
    image links from the returned HTML is done by the caller.

    Parameters
    ----------
    url_base : str
        URL prefix, e.g. "https://www.amazon.in/".
    query : str
        Text appended verbatim to ``url_base`` (no URL-encoding is applied here).
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Defaults to 30.

    Returns
    -------
    requests.Response or str
        The response object when the status code is 200, otherwise the
        string "Error".

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    url = "%s%s" % (url_base, query)
    print(url)
    # A timeout is set because requests otherwise waits indefinitely on a
    # stalled connection, which would freeze the scraping loop.
    page = requests.get(url, headers=header, timeout=timeout)
    if page.status_code == 200:
        return page
    return "Error"

# Per-site settings: url_base is the site root that each stored review link is appended to
sites = {
    'amazon.in':{
        'url_base': "https://www.amazon.in/",
    },
}


# ------------------------- custom parameters
npages = 10 # number of pages of reviews to visit per product
site = sites['amazon.in']


# ------------------------- Start
# Parallel lists: one entry per review that contains at least one image.
# NOTE: `asins` and `products` are rebound here and no longer hold the
# values assigned to those names by earlier cells.
asins = []
products = [] # product names
reviews = [] # reviews of the products
ratings = [] # rating of a product of a review
img_links = [] # all the image links related to a review (comma-separated)

for asin, link in links.items():
    # npages + 1 so that pages 1..npages are visited (range excludes its upper bound)
    for i in tqdm(range(1, npages + 1)):
        try:
            # Fetch one page of reviews
            response = get_review_img(site['url_base'], link + '&pageNumber=' + str(i))
            if isinstance(response, str):
                # get_review_img returns the string "Error" for any non-200 status;
                # report it instead of failing later on response.content
                print("Request did not return status 200 for this ASIN (%s) in this page (%s)" % (asin, i))
                continue

            # Name the parser explicitly so results do not depend on which
            # optional parser libraries happen to be installed
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get the product name
            name_tag = soup.find("a", attrs={'data-hook':'product-link'})
            if name_tag is None:
                print("No product link found for this ASIN (%s) in this page (%s)" % (asin, i))
                continue
            pn = name_tag.text

            # Get each review for this particular product
            for review in soup.find_all("div", attrs={'data-hook':'review'}):
                # Only reviews that contain at least one image are kept
                imgs = review.find_all("img", attrs={'class':'review-image-tile'})
                if not imgs:
                    continue

                body_tag = review.find("span", attrs={'data-hook':'review-body'})
                rating_tag = review.find("i", attrs={'data-hook':"review-star-rating"})
                if body_tag is None or rating_tag is None:
                    # Skip just this review rather than losing the rest of the page
                    continue

                # get the review text
                text = body_tag.text.replace('\n', "")

                # get the rating
                rating = rating_tag.text

                # get the image links (the images themselves are not downloaded here;
                # the saved links can be used later to fetch them).
                # "._SY88" is removed from each src before storing it.
                ilinks = ",".join(img['src'].replace("._SY88", "") for img in imgs)

                # append (all five lists together, so they stay the same length)
                asins.append(asin)
                products.append(pn)
                reviews.append(text)
                ratings.append(rating)
                img_links.append(ilinks)
        except Exception as exc:
            # Keep going with the next page, but show what actually went wrong
            print("Failed to get the review and images for this ASIN (%s) in this page (%s): %r" % (asin, i, exc))


### save the information into pandas and then into csv
- you should see a new file created in the data/ folder

In [None]:
from datetime import datetime

In [None]:
# Assemble the collected lists into a data frame, one row per collected review;
# 'pid' is a running row number starting at 0
columns = {
    'pid': list(range(len(asins))),
    'ASIN': asins,
    'product_name': products,
    'review': reviews,
    'rating': ratings,
    'img_link': img_links,
}
df = pd.DataFrame(columns)
df.head()

In [None]:
# Write the data frame to a timestamped CSV file in ../data/
dt = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
csv_path = '../data/%s_amazon_review_with_image.csv' % dt
df.to_csv(csv_path)


In [None]:
# Sanity check: number of collected entries (img_links gets one entry per review that contains images)
len(img_links)