##### Web Scrapping of Amazon Best Selling Books

In [24]:
#libraries required
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [26]:
page_no=2
def get_data(pageNo):
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
    r = requests.get('https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_'+str(pageNo)+'?ie=UTF8&pg='+str(pageNo), headers=headers)#, proxies=proxies)
    content = r.content
    soup = BeautifulSoup(content)
    #print(soup)
    alls = []
    for d in soup.findAll('div', attrs={'class':'a-section a-spacing-none aok-relative'}):
        #print(d)
        name = d.find('span', attrs={'class':'zg-text-center-align'})
        n = name.find_all('img', alt=True)
        #print(n[0]['alt'])
        author = d.find('a', attrs={'class':'a-size-small a-link-child'})
        rating = d.find('span', attrs={'class':'a-icon-alt'})
        users_rated = d.find('a', attrs={'class':'a-size-small a-link-normal'})
        price = d.find('span', attrs={'class':'p13n-sc-price'})

        all1=[]

        if name is not None:
            #print(n[0]['alt'])
            all1.append(n[0]['alt'])
        else:
            all1.append("unknown-product")

        if author is not None:
            #print(author.text)
            all1.append(author.text)
        elif author is None:
            author = d.find('span', attrs={'class':'a-size-small a-color-base'})
            if author is not None:
                all1.append(author.text)
            else:    
                all1.append('0')

        if rating is not None:
            #print(rating.text)
            all1.append(rating.text)
        else:
            all1.append('-1')

        if users_rated is not None:
            #print(price.text)
            all1.append(users_rated.text)
        else:
            all1.append('0')     

        if price is not None:
            #print(price.text)
            all1.append(price.text)
        else:
            all1.append('0')
        alls.append(all1)    
    return alls

In [27]:
results = []
for i in range(1, page_no+1):
    results.append(get_data(i))
flatten = lambda l: [item for sublist in l for item in sublist]
df = pd.DataFrame(flatten(results),columns=['Book Name','Author','Rating','Customers_Rated', 'Price'])
df.to_csv('amazon_products.csv', index=False, encoding='utf-8')

In [28]:
df = pd.read_csv("amazon_products.csv")

In [29]:
df.shape

(100, 5)

In [30]:
df

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
0,Karma: A Yogi's Guide to Crafting Your Destiny,Sadhguru,-1,0,₹ 280.00
1,Unfinished: A Memoir,Priyanka Chopra Jonas,4.3 out of 5 stars,235,₹ 511.00
2,Spoken English Course (Telugu),Vashista 360,4.7 out of 5 stars,69,₹ 399.00
3,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6 out of 5 stars,8537,₹ 300.00
4,My First Library: Boxset of 10 Board Books for...,Wonder House Books,4.5 out of 5 stars,14215,₹ 399.00
...,...,...,...,...,...
95,Fast Track Objective Arithmetic,Rajesh Verma,4.4 out of 5 stars,2404,₹ 322.00
96,My Activity- ABC Colouring Book,Dreamland Publications,4.5 out of 5 stars,691,₹ 80.00
97,101 Panchatantra Stories,Dreamland Publications,4.3 out of 5 stars,2601,₹ 130.00
98,My Activity- Phonics Activity Book,Dreamland Publications,4.4 out of 5 stars,528,₹ 80.00


In [31]:
df['Rating'] = df['Rating'].apply(lambda x: x.split()[0])
df['Rating'] = pd.to_numeric(df['Rating'])
df["Price"] = df["Price"].str.replace('₹', '')
df["Price"] = df["Price"].str.replace(',', '')
df['Price'] = df['Price'].apply(lambda x: x.split('.')[0])
df['Price'] = df['Price'].astype(int)
df["Customers_Rated"] = df["Customers_Rated"].str.replace(',', '')
df['Customers_Rated'] = pd.to_numeric(df['Customers_Rated'], errors='ignore')

In [32]:
df

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
0,Karma: A Yogi's Guide to Crafting Your Destiny,Sadhguru,-1.0,0,280
1,Unfinished: A Memoir,Priyanka Chopra Jonas,4.3,235,511
2,Spoken English Course (Telugu),Vashista 360,4.7,69,399
3,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6,8537,300
4,My First Library: Boxset of 10 Board Books for...,Wonder House Books,4.5,14215,399
...,...,...,...,...,...
95,Fast Track Objective Arithmetic,Rajesh Verma,4.4,2404,322
96,My Activity- ABC Colouring Book,Dreamland Publications,4.5,691,80
97,101 Panchatantra Stories,Dreamland Publications,4.3,2601,130
98,My Activity- Phonics Activity Book,Dreamland Publications,4.4,528,80


In [33]:
df.dtypes

Book Name           object
Author              object
Rating             float64
Customers_Rated      int64
Price                int32
dtype: object

In [34]:
df.replace(str(0), np.nan, inplace=True)
df.replace(0, np.nan, inplace=True)

In [35]:
count_nan = len(df) - df.count()
count_nan

Book Name          0
Author             1
Rating             0
Customers_Rated    2
Price              0
dtype: int64

In [36]:
df = df.dropna()

In [37]:
df

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
1,Unfinished: A Memoir,Priyanka Chopra Jonas,4.3,235.0,511
2,Spoken English Course (Telugu),Vashista 360,4.7,69.0,399
3,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6,8537.0,300
4,My First Library: Boxset of 10 Board Books for...,Wonder House Books,4.5,14215.0,399
5,The Alchemist,Paulo Coelho,4.6,42815.0,189
...,...,...,...,...,...
95,Fast Track Objective Arithmetic,Rajesh Verma,4.4,2404.0,322
96,My Activity- ABC Colouring Book,Dreamland Publications,4.5,691.0,80
97,101 Panchatantra Stories,Dreamland Publications,4.3,2601.0,130
98,My Activity- Phonics Activity Book,Dreamland Publications,4.4,528.0,80


In [38]:
data = df.sort_values(["Price"], axis=0, ascending=False)[:10]
data

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
37,My First Complete Learning Library: Boxset of ...,Wonder House Books,4.5,2624.0,799
23,How to Avoid a Climate Disaster: The Solutions...,Bill Gates,4.4,22.0,729
61,10 Years Solved Papers: ICSE Class 10 for 2021...,Gurukul Books,4.4,220.0,672
74,Oswaal CBSE SAMPLE QUESTION PAPERS CLASS 10 (S...,Oswaal Editorial Board,4.1,43.0,669
89,Indian Art and Culture for Civil Services and ...,Nitin Singhania,4.6,1424.0,637
18,Indian Polity - For Civil Services and Other S...,M. Laxmikanth,4.6,4838.0,617
31,A Modern Approach to Verbal & Non-Verbal Reaso...,R.S. Aggarwal,4.4,3223.0,562
58,How to Prepare for Quantitative Aptitude for t...,Arun Sharma,4.5,1662.0,557
25,Objective NCERT at your FINGERTIPS for NEET-AI...,MTG Editorial Board,4.7,383.0,538
34,The Intelligent Investor (English) Paperback –...,Benjamin Graham,4.4,18026.0,516


In [39]:
data1=df.sort_values(["Rating"], axis=0, ascending=False)[:5]
data1

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
26,"‘Kaun Hain Bharat Mata?’ : Itihas, Sanskriti a...",Purushottam Agrawal,5.0,1.0,350
35,Bhagavad Gita: Yatharoop (Hindi),A.C. Bhaktivendanta Swami Prabhupada,4.8,4234.0,185
44,Death; An Inside Story: A book for all those w...,Sadhguru,4.7,3066.0,211
82,The Magic of the Lost Temple,Sudha Murty,4.7,1607.0,140
79,Harry Potter and the Philosopher's Stone,J.K. Rowling,4.7,22659.0,287
