# URLs of Pages, Books

The website spreads its books over more than 900 listing pages, each of which shows a number of books. The script below first builds the URL of every listing page and then collects the URLs of all books shown on those pages.

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [None]:
# URL prefix of a listing page; the page number is appended to it
base_url = "https://zangakbookstore.am/en/grqer?page="

# One URL per listing page, for page numbers 1 through 952 inclusive
page_urls = [f"{base_url}{number}" for number in range(1, 953)]



In [None]:
# Collect the 'href' of every book link found on the listing pages
books_urls = []

# Iterate through page URLs
for url in page_urls:
    try:
        # The timeout (in seconds) keeps one stalled connection from
        # hanging the whole crawl; requests waits indefinitely without it.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        # A network failure on one page must not abort the remaining pages
        print(f"Failed to retrieve the web page: {url} ({e})")
        continue

    # Skip pages the server did not return successfully
    if response.status_code != 200:
        print(f"Failed to retrieve the web page: {url}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Find all the 'a' tags with the specified class
    a_tags = soup.find_all("a", class_="d-inline-block position-relative")

    # Keep the 'href' of each tag, ignoring tags without a non-empty 'href'
    books_urls.extend(a["href"] for a in a_tags if a.get("href"))


# Scraping Book Information

The script below scrapes the information about each book by visiting its URL and extracting the data from that page. The information about all books is written to a CSV file, which is later loaded as a data frame.

In [None]:
def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag was not found."""
    return tag.text.strip() if tag is not None else "N/A"


def _detail_value(details_section, label):
    """Return the text of the "div" that follows the "div" whose string is *label*.

    Returns "N/A" when *details_section* contains no "div" with that label.
    Raises AttributeError when *details_section* is None or when no "div"
    follows the label; the caller treats that as a failed page.
    """
    label_div = details_section.find("div", string=label)
    if label_div is None:
        return "N/A"
    return label_div.find_next("div").text.strip()


# Labels looked up in the "tab_details" section, in the same order as the
# CSV columns they fill (Publisher ... Pages Number)
detail_labels = ["Publishing house", "EAN", "Year", "Language", "Age", "Coating", "Pages"]

# Create a CSV file to store the data.
# The encoding is set explicitly so that non-ASCII text is written the same
# way on every platform and matches the UTF-8 default of pandas.read_csv.
with open("Books.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Title", "Author", "Price", "Publisher", "ISBN", "Publishing Year", "Language", "Age License", "Cover Type", "Pages Number"])

    # Iterate through book URLs
    for url in books_urls:
        try:
            # The timeout (in seconds) keeps one stalled connection from
            # hanging the whole scrape; requests waits indefinitely without it.
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            # A network failure on one book must not abort the remaining books
            print(f"Failed to retrieve data from {url} ({e})")
            continue

        # Skip books whose page the server did not return successfully
        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        try:
            title = _text_or_na(soup.find("h1", class_="product-name mb-4 d-none d-md-block"))
            author = _text_or_na(soup.find("a", class_="color-main text-decoration-none author-btn"))
            price = _text_or_na(soup.find("div", class_="product-price-view mb-2"))

            # Extract data from the "tab_details" section; a page without
            # that section raises here and is skipped by the handler below
            details_section = soup.find("div", id="tab_details")
            details = [_detail_value(details_section, label) for label in detail_labels]
        except Exception as e:
            # Deliberately broad: the scrape is best-effort, so a page that
            # cannot be parsed is reported and skipped instead of stopping the run
            print(f"Error while scraping data from {url}: {e}")
            continue

        # Write the data to the CSV file
        writer.writerow([title, author, price, *details])

print("Data has been scraped and saved to 'Books.csv'.")

In [None]:
# Load the scraped CSV for a quick look; as the last expression of the cell,
# the resulting DataFrame is what the notebook displays
pd.read_csv(filepath_or_buffer="Books.csv")

# Data Manipulation

Removing NAs, duplicates and inconvenient formatting of data will make things easier later.

In [None]:
# Option 1
# Fields the scraper could not find were written as "N/A", which
# pandas.read_csv treats as a missing value by default.
df = pd.read_csv("Books.csv")

# Drop every row with a missing value in any column, then renumber the rows
df = df.dropna(axis=0).reset_index(drop=True)

# Request "int64" explicitly: plain `int` can resolve to a 32-bit type on some
# platforms (e.g. Windows with NumPy < 2), which is too narrow for long
# EAN/ISBN numbers (presumably 13 digits - confirm against the data).
df["ISBN"] = df["ISBN"].astype("int64")

# index=False keeps the row index out of the file; otherwise every re-read of
# Books.csv would gain an extra unnamed index column.
df.to_csv("Books.csv", index=False)

In [None]:
# Option 2 (use if Option 1 raises type errors): strip every non-digit character from ISBN before converting it to int
# df = pd.read_csv('Books-13000-14000pp.csv')
# df['ISBN'] = df['ISBN'].str.replace(r'\D', '', regex=True)
# df = df.dropna(axis=0).reset_index(drop=True)
# df.ISBN = df.ISBN.astype(int)