## Introduction

## Jupyter Notebook Setup

In [83]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import os
from tqdm import tqdm

## Building the Data Scraper

product page

https://www.amazon.in/Apple-iPhone-XR-64GB-White/dp/B07JGXM9WN/ref=cm_cr_arp_d_bdcrb_top?ie=UTF8

product review: page 1

https://www.amazon.in/Apple-iPhone-XR-64GB-White/product-reviews/B07JGXM9WN/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=1

product review: page 2

https://www.amazon.in/Apple-iPhone-XR-64GB-White/product-reviews/B07JGXM9WN/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2

product review: page 3

https://www.amazon.in/Apple-iPhone-XR-64GB-White/product-reviews/B07JGXM9WN/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=3

In [16]:
# Base URL of the review listing. It ends with "pageNumber=" so that a page
# number can be appended to it as a string.
url = "https://www.amazon.in/Apple-iPhone-XR-64GB-White/product-reviews/B07JGXM9WN/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber="

# Download review page 1 and parse the response body with the built-in HTML parser.
page = requests.get(url+'1')
html = BeautifulSoup(page.text, "html.parser")
# Print the whole parsed document, indented, so the markup can be inspected.
print(html.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|012LjolmrML.css,41-crZfIjzL.css,11cMnOipjJL.css,017DsKjNQJL.css,01Vctty9pOL.css,01HEsUOLYvL.css,41EWOOlBJ9L.css,11PIM2x8KnL.css,01ElnPiDxWL.css,11QxHU4QYaL.css,01Sp8sB1HiL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,01evdoiemkL.css,01K+Ps1DeEL.css,314djKvMsUL.css,01ZTetsDh7L.css,01pbA9Lg3yL.css,21LK7jaicML.css,11L58Qpo0GL.css,21kyTi1FabL.css,01ruG+gDPFL.css,01YhS3Cs-hL.css,21GwE3cR-y

## What are we looking for?

In [17]:
# sample of data that will be scraped

# **Review Rating**

# <i class="a-icon a-icon-star a-star-1 review-rating" data-hook="review-star-rating">
# <span class="a-icon-alt">
# 1.0 out of 5 stars
# </span>
# </i>

# **Review Title**

# <a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title" href="/gp/customer-reviews/RZQ9NHUADHI5W?ASIN=B07JGXM9WN">
# <span>
# Def a bad experience
# </span>
# </a>

# **Review Date and Location**

# <span class="a-size-base a-color-secondary review-date" data-hook="review-date">
# Reviewed in India on 21 April 2019
# </span>

# **Review Text**

# <span class="a-size-base review-text review-text-content" data-hook="review-body">
# <span>
# Went with the iPhone XR after over a month of consideration.Amazon was offering some really good discounts and felt the time was right to upgrade having used the iPhone 6 for four seamless years.
# <br/>
# The delivery and the product was perfect
# <br/>
# 3 days in and I started to experience issues
# <br/>
# A.The Phone microphone stopped working as a result I can not use any functionality of the phone no calls/no video recording/no voice recording or voice messages.
# <br/>
# B. The camera in jus the photo mode has also slowed down it takes over 15 secs to click and store a pic to gallery
# <br/>
# I have initiated a replacement option with Amazon within just 3 days of the purchase.in hindsight I now am starting to see why people shop in stores.
# </span>
# </span>

## One Page Data Scraper

In [52]:
def grab_review_rating(text):
    """Convert a star-rating label such as '1.0 out of 5 stars' to a float.

    The literal ' out of 5 stars' is removed, surrounding whitespace is
    trimmed, and whatever remains is parsed with ``float``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid float literal.
    """
    numeric_part = text.replace(' out of 5 stars', '')
    return float(numeric_part.strip())

def grab_review_location_and_date(text):
    """Split a review-date label into its location and date parts.

    Parameters
    ----------
    text : str
        Label such as 'Reviewed in India on 21 April 2019'.

    Returns
    -------
    tuple of (str, str)
        ``(location, date)``, e.g. ``('India', '21 April 2019')``. The date
        is returned as the matched text, not as a datetime.

    Raises
    ------
    IndexError
        If ``text`` contains no '<day> <month> <year>' date.
    """
    # Raw strings: in a plain literal '\d' and '\w' are unrecognised escape
    # sequences, which are deprecated and trigger warnings on newer Pythons.
    date_pattern = r'\d{1,2} \w+ \d{4}'
    # Strip the 'Reviewed in ' prefix and the ' on <date>' suffix; what is
    # left over is the location.
    location = re.sub(r'Reviewed in | on ' + date_pattern, '', text).strip()
    date = re.findall(date_pattern, text)[0]
    return location, date

def scrape(url, timeout=30):
    """Scrape one review-listing page into a DataFrame.

    Parameters
    ----------
    url : str
        Full URL of a single review page.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``title``, ``date``, ``location``,
        ``rating`` and ``text``; ``date`` is converted with
        ``pd.to_datetime``. The frame is empty when the page contains no
        matching review elements.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Previously such a
        response was parsed like a normal page and produced an empty frame.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of silently returning no rows.
    page.raise_for_status()
    html = BeautifulSoup(page.text, "html.parser")

    review_titles = html.find_all('a', class_='review-title', attrs={'data-hook':'review-title'})
    review_dates_and_locations = html.find_all('span', class_='review-date', attrs={'data-hook':'review-date'})
    review_texts = html.find_all('span', class_='review-text', attrs={'data-hook':'review-body'})
    review_ratings = html.find_all('i', class_='review-rating', attrs={'data-hook':'review-star-rating'})

    data = []
    # The four lists are paired by position; zip stops at the shortest one.
    for title_tag, date_tag, text_tag, rating_tag in zip(
        review_titles,
        review_dates_and_locations,
        review_texts,
        review_ratings
    ):
        # split() with no argument already discards all surrounding
        # whitespace, so joining the tokens collapses every whitespace run
        # (including newlines) into a single space.
        title = ' '.join(title_tag.get_text().split())
        location, date = grab_review_location_and_date(date_tag.get_text())
        rating = grab_review_rating(rating_tag.get_text())
        text = ' '.join(text_tag.get_text().split())
        data.append([title, date, location, rating, text])

    df = pd.DataFrame(data, columns=['title', 'date', 'location', 'rating', 'text'])
    df['date'] = pd.to_datetime(df['date'])
    return df

**Sanity Check**

In [57]:
df = scrape(url + '1')
df

Unnamed: 0,title,date,location,rating,text
0,"Which iPhone you should Purchase ? iPhone 8, X...",2018-12-12,India,3.0,NOTE:@ This is detailed comparison between iPh...
1,Don't buy iPhone xr from Amazon.,2018-11-17,India,1.0,Very bad experience with this iPhone xr phone....
2,Happy with the purchase,2019-01-27,India,5.0,Amazing phone with amazing camera coming from ...
3,Amazon is not an apple authorised reseller. Pl...,2019-05-02,India,1.0,So I got the iPhone XR just today. The product...
4,Excellent Battery life and buttery smooth UI,2019-05-24,India,5.0,I've been an android user all my life until I ...
5,Never purchase a phone online.,2019-04-22,India,1.0,I was delivered a phone that did not work imme...
6,Awesome iPhone,2018-12-07,India,5.0,It has been a month since I started using my i...
7,Exchange of I phone xr with xs.,2018-11-25,India,1.0,The phone is hanging. Video quality is not ver...
8,Worth upgrading from the 6S? I'll know in a fe...,2019-04-27,India,4.0,I'll use this review to mostly say what I'm no...
9,Def a bad experience,2019-04-21,India,1.0,Went with the iPhone XR after over a month of ...


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     10 non-null     object        
 1   date      10 non-null     datetime64[ns]
 2   location  10 non-null     object        
 3   rating    10 non-null     float64       
 4   text      10 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 528.0+ bytes


## Building A Multi Page Data Scraper

In [68]:
# Scrape review pages 1-9 and stack the per-page frames into one DataFrame.
# DataFrame.append was removed in pandas 2.0, so the frames are collected in
# a list and concatenated once instead of growing `df` inside the loop.
page_frames = []
for i in tqdm(range(1, 10)):
    page_frames.append(scrape(url + str(i)))

# No ignore_index: each page keeps its own 0-based index, exactly as the
# previous append-based loop produced.
df = pd.concat(page_frames)

df

100%|██████████| 9/9 [00:10<00:00,  1.14s/it]


Unnamed: 0,title,date,location,rating,text
0,"Which iPhone you should Purchase ? iPhone 8, X...",2018-12-12,India,3.0,NOTE:@ This is detailed comparison between iPh...
1,Don't buy iPhone xr from Amazon.,2018-11-17,India,1.0,Very bad experience with this iPhone xr phone....
2,Happy with the purchase,2019-01-27,India,5.0,Amazing phone with amazing camera coming from ...
3,Amazon is not an apple authorised reseller. Pl...,2019-05-02,India,1.0,So I got the iPhone XR just today. The product...
4,Excellent Battery life and buttery smooth UI,2019-05-24,India,5.0,I've been an android user all my life until I ...
...,...,...,...,...,...
5,Review after using for 1 year,2020-06-09,India,5.0,Phone is really amazing. It’s been nearly one ...
6,ok,2019-08-10,India,4.0,You will always feel cheated after buying an i...
7,Good phone with lot of cons but still flagship,2020-02-05,India,4.0,There are many pros about this phone which you...
8,"In this price of 45000, its the best choice fo...",2019-09-29,India,5.0,"Camera superbBattery amazing, came with 58%, a..."


In [69]:
df.shape

(90, 5)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 0 to 9
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     90 non-null     object        
 1   date      90 non-null     datetime64[ns]
 2   location  90 non-null     object        
 3   rating    90 non-null     float64       
 4   text      90 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 4.2+ KB


## Saving The Scraped Data

In [73]:
path = "../scraped_data"
# exist_ok=True replaces the separate exists() check: no check-then-create
# race, and the cell can be re-run safely.
os.makedirs(path, exist_ok=True)

# The file name ends in ".gzip", so request gzip explicitly; to_parquet
# would otherwise use its default codec (snappy) and the suffix would be
# misleading. The index is not written to the file.
df.to_parquet(os.path.join(path, 'sample.parquet.gzip'), compression='gzip', index=False)

In [75]:
# Reload the sample file from `path` so its contents can be inspected.
z = pd.read_parquet(os.path.join(path, 'sample.parquet.gzip'))
z.head()

Unnamed: 0,title,date,location,rating,text
0,"Which iPhone you should Purchase ? iPhone 8, X...",2018-12-12,India,3.0,NOTE:@ This is detailed comparison between iPh...
1,Don't buy iPhone xr from Amazon.,2018-11-17,India,1.0,Very bad experience with this iPhone xr phone....
2,Happy with the purchase,2019-01-27,India,5.0,Amazing phone with amazing camera coming from ...
3,Amazon is not an apple authorised reseller. Pl...,2019-05-02,India,1.0,So I got the iPhone XR just today. The product...
4,Excellent Battery life and buttery smooth UI,2019-05-24,India,5.0,I've been an android user all my life until I ...


In [76]:
z.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     90 non-null     object        
 1   date      90 non-null     datetime64[ns]
 2   location  90 non-null     object        
 3   rating    90 non-null     float64       
 4   text      90 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 3.6+ KB


## Putting It All Together

In [92]:
!pygmentize helper_scripts/scrape.py

import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import os
from tqdm import tqdm


def grab_review_rating(text):
    return float(text.replace(" out of 5 stars", "").strip())


def grab_review_location_and_date(text):
    location = re.sub("Reviewed in | on \d{1,2} \w+ \d{4}", ""

In [79]:
!python helper_scripts/scrape.py

scraping: 100%|███████████████████████████████| 500/500 [07:32<00:00,  1.11it/s]


In [89]:
# Load the full review file from `path` (rebinds `df`, replacing the 9-page sample).
# NOTE(review): presumably written by helper_scripts/scrape.py in the cell above — confirm.
df = pd.read_parquet(os.path.join(path, "reviews.parquet.gzip"))
df.head()

Unnamed: 0,title,date,location,rating,text
0,"Which iPhone you should Purchase ? iPhone 8, X...",2018-12-12,India,3.0,NOTE:@ This is detailed comparison between iPh...
1,Don't buy iPhone xr from Amazon.,2018-11-17,India,1.0,Very bad experience with this iPhone xr phone....
2,Happy with the purchase,2019-01-27,India,5.0,Amazing phone with amazing camera coming from ...
3,Amazon is not an apple authorised reseller. Pl...,2019-05-02,India,1.0,So I got the iPhone XR just today. The product...
4,Excellent Battery life and buttery smooth UI,2019-05-24,India,5.0,I've been an android user all my life until I ...


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     3870 non-null   object        
 1   date      3870 non-null   datetime64[ns]
 2   location  3870 non-null   object        
 3   rating    3870 non-null   float32       
 4   text      3870 non-null   object        
dtypes: datetime64[ns](1), float32(1), object(3)
memory usage: 136.2+ KB


## Closing Remarks