### web-scrapping to get a data on best selling books and different books genres using Requests and beautifulSoup


step 1: install modules required to to scrape the data and save it into csv files.

note: modules required are listed in requirements.text

In [354]:
!pip install -r requirements.txt  --quiet

In [355]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [356]:
url="https://www.amazon.in/gp/bestsellers/books/"

### using this url to retrieve data to get genre details and different genre urls 

In [357]:
def copy_page(url):
    """Copy the specified page using specified url"""
    response=requests.get(url)
    if response.status_code>=200 and response.status_code<=299:
        page_content=response.text
        doc=BeautifulSoup(page_content, "html.parser")
    return doc
        

In [358]:
def genre_details(doc):
    """
    This function is used to get the genre details.
    """
    genre_tags=doc.find_all('a')
    genre=[]
    genre_url=[]
    for i,link in enumerate(genre_tags):
        href = link.get("href")
        if href and href.startswith("/gp/bestsellers/books/1"):
            genre.append(link.text.strip())
            genre_url.append("https://www.amazon.in"+href)
    return genre,genre_url

In [359]:
def to_csv(x,y):
    dict={"genres":x,"genres_url":y}
    df=pd.DataFrame(dict, index=pd.RangeIndex(start=1,stop=len(dict['genres'])+1))
    df.to_csv('book_genre.csv')

In [361]:
doc=copy_page(url)
x,y=genre_details(doc)
to_csv(x,y)

### using the same url to find data of best selling books and its details.

## extracting data from webpage to find books titles

In [362]:
def book_title(doc):
    book_tags=doc.find_all('div', {'class':"zg-grid-general-faceout"})
    book_name=[]
    for book_tag in book_tags:
        try:
            book_name.append(book_tag.find('span').text.strip())
        except:
            book_name.append("none")
    return book_name



In [363]:
book_name=book_title(doc)
len(book_name)

50

## extracting data from webpage to find author names

In [364]:
def author_name(doc):
    author_tags=doc.find_all('div', {'class':"zg-grid-general-faceout"})
    author_name=[]
    for tag in author_tags:
        try:
            author_name.append(tag.find('div', {'class':"a-row a-size-small"}).text.strip())
        except:
            author_name.append("None")
    return author_name

In [365]:
author=author_name(doc)
len(author)

50

## extracting data from webpage to find stars provided by customers

In [366]:
def stars(doc):
    star_tags=doc.find_all('div',{'class':"zg-grid-general-faceout"})
    stars=[]
    for tag in star_tags:
        try:
            stars.append(tag.find('span',{'class':'a-icon-alt'}).text[:3])
        except:
            stars.append("None")
    return stars

In [407]:
star=stars(doc)
len(star)


50

## extracting data from webpage to find number of reviews can be found for respective books

In [368]:
def no_reviews(doc):
    review_tags=doc.find_all('div',{'class':"a-icon-row"})
    reviews=[]
    for tag in review_tags:
        try:
            reviews.append(tag.find('span',{'class':"a-size-small"}).text.strip())
        except:
            reviews.append('None')
    return reviews

In [406]:
reviews=no_reviews(doc)
print(len(reviews))


49


## extracting data from webpage to find book's editions like paperback, hardcopy, kindle...etc...

In [370]:
def editions(doc):
    edition_tags=doc.find_all('span',{'class':"a-size-small a-color-secondary a-text-normal"})
    edition=[]
    for tag in edition_tags:
        try:
            edition.append(tag.text.strip())
        except:
            edition.append("None")
    return edition

In [371]:
edition=editions(doc)
len(edition)

50

## extracting data from webpage to find book's Price

In [372]:
def price(doc):
    price_tags=doc.find_all('div',{"class": "zg-grid-general-faceout"})
    prices=[]
    for tag in price_tags:
        try:
            prices.append(tag.find('span',{'class':'p13n-sc-price'}).text.strip())
        except:
            prices.append("None")
    return prices

In [373]:
prices=price(doc)
len(prices)

50

## extracting data from webpage to find book's address 

In [374]:
def book_url(doc):
    book_tags=doc.find_all('div',{"class": "zg-grid-general-faceout"})
    books_url=[]
    for tag in book_tags:
        try:
            books_url.append("https://www.amazon.in/"+tag.find('a',{'class':'a-link-normal'})['href'])
        except:
            books_url.append("None")
    return books_url


In [375]:
books_url=book_url(doc)
len(books_url)

50

## extracting data from just 1st page which shows top 50 bestseller books from amazon

In [386]:
def data_scrape(url):
    doc=copy_page(url)
    book_name=book_title(doc)
    author=author_name(doc)
    star=stars(doc)
    review=no_reviews(doc)
    edition=editions(doc)
    books_url=book_url(doc)
    prices=price(doc)
    dict={'Name':book_name,'Author':author,'Ratings (out of 5)':star,'Reviews':review,'Edition':edition,'Price':prices,'Books_url':books_url}
    books_df=pd.DataFrame.from_dict(dict, orient='index').transpose()
    books_df.index=pd.RangeIndex(start=1, stop=len(df)+1)   
    books_df.to_csv("Bestsellers.csv")
    print('Extraction done and saved in Csv file')
                                                   

In [387]:
data_scrape(url)

Extraction done and saved in Csv file


In [388]:
df=pd.read_csv("Bestsellers.csv")

In [389]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Author,Ratings (out of 5),Reviews,Edition,Price,Books_url
0,1,Atomic Habits: The life-changing million copy ...,James Clear,4.6,73374,Paperback,₹349.00,https://www.amazon.in//Atomic-Habits-James-Cle...
1,2,The Psychology of Money,Morgan Housel,4.6,47856,Paperback,₹210.00,https://www.amazon.in//Psychology-Money-Morgan...
2,3,NTA NEET (UG) BIOLOGY | RAKSHITA SINGH,RAKSHITA SINGH,,18570,Paperback,₹383.00,https://www.amazon.in//NTA-NEET-BIOLOGY-RAKSHI...
3,4,Grandma's Bag of Stories: Collection of 20+ Il...,Sudha Murty,4.6,46829,Paperback,₹167.00,https://www.amazon.in//Grandmas-Bag-Stories-Su...
4,5,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6,68280,Hardcover,₹333.00,https://www.amazon.in//Ikigai-H%C3%A9ctor-Garc...


In [390]:
df.columns

Index(['Unnamed: 0', 'Name', 'Author', 'Ratings (out of 5)', 'Reviews',
       'Edition', 'Price', 'Books_url'],
      dtype='object')

In [391]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          50 non-null     int64  
 1   Name                50 non-null     object 
 2   Author              50 non-null     object 
 3   Ratings (out of 5)  49 non-null     float64
 4   Reviews             49 non-null     object 
 5   Edition             50 non-null     object 
 6   Price               50 non-null     object 
 7   Books_url           50 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 3.2+ KB


## Extract data from multiple pages and return all data together.

In [395]:
def get_all_books(n):
    all_books={'Name':[],'Author':[],'Ratings (out of 5)':[],'Reviews':[],'Edition':[],'Price':[],'Books_url':[]}
    for i in range(1, n+1):
        url=f"https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_{str(i)}?ie=UTF8&pg={str(i)}"
        doc=copy_page(url)
        all_books['Name'] += book_title(doc)
        all_books['Author']+=(author_name(doc))
        all_books['Ratings (out of 5)']+=stars(doc)
        all_books['Reviews']+=no_reviews(doc)
        all_books['Edition']+=editions(doc)
        all_books['Price']+=price(doc)
        all_books['Books_url']+=book_url(doc)
        df=pd.DataFrame.from_dict(all_books,orient='index').transpose()
        df.index=pd.RangeIndex(start=1, stop=len(df)+1)
        df.to_csv("books.csv")
    print(f"data extracted from all the pages, saved in csv file format and length of the data: {len(df)} !") 

In [397]:
get_all_books(5)

data extracted from all the pages, saved in csv file format and length of the data: 250 !


In [404]:
df=pd.read_csv("books.csv", index_col=0)
df.head()

Unnamed: 0,Name,Author,Ratings (out of 5),Reviews,Edition,Price,Books_url
1,Atomic Habits: The life-changing million copy ...,James Clear,4.6,73374,Paperback,₹349.00,https://www.amazon.in//Atomic-Habits-James-Cle...
2,The Psychology of Money,Morgan Housel,4.6,47856,Paperback,₹210.00,https://www.amazon.in//Psychology-Money-Morgan...
3,NTA NEET (UG) BIOLOGY | RAKSHITA SINGH,RAKSHITA SINGH,,18570,Paperback,₹383.00,https://www.amazon.in//NTA-NEET-BIOLOGY-RAKSHI...
4,Grandma's Bag of Stories: Collection of 20+ Il...,Sudha Murty,4.6,46833,Paperback,₹167.00,https://www.amazon.in//Grandmas-Bag-Stories-Su...
5,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6,68280,Hardcover,₹333.00,https://www.amazon.in//Ikigai-H%C3%A9ctor-Garc...


In [405]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 1 to 250
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                250 non-null    object 
 1   Author              250 non-null    object 
 2   Ratings (out of 5)  245 non-null    float64
 3   Reviews             245 non-null    object 
 4   Edition             250 non-null    object 
 5   Price               249 non-null    object 
 6   Books_url           250 non-null    object 
dtypes: float64(1), object(6)
memory usage: 15.6+ KB
