<a href="https://colab.research.google.com/github/dhanshrii2006/R-Language/blob/scraping-with-R/scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Scrape the book listing at `url` into a data frame (title, detail-page URL,
# star rating, price, cover image URL, availability) and save it as a CSV.

# Setup ----
# Install any missing packages. requireNamespace() only checks availability
# (it does not attach), so the library() calls below are the single place
# where packages get attached, and they fail loudly if installation failed.
required_pkgs <- c("rvest", "stringr")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# Load required libraries
library(rvest) # For web scraping
library(stringr) # For string manipulation

# Define the URL to scrape
url <- "http://books.toscrape.com/catalogue/category/books_1/index.html"

# Download and parse the page once; every field below is extracted from this
# same document, so all columns are guaranteed to come from one response.
page <- read_html(url)

# 1. Scraping Titles ----
# The link text may be shortened, so read the 'title' attribute instead.
titles <- page %>%
  html_nodes("h3 a") %>%
  html_attr("title")

# 2. Scraping URLs ----
# hrefs are relative to the page; resolve them against `url` rather than
# pattern-matching on a fixed number of "../" segments (an unescaped "." in
# a regex matches any character, and the depth differs between links).
urls <- page %>%
  html_nodes(".image_container a") %>%
  html_attr("href") %>%
  url_absolute(base = url)

# 3. Scraping Ratings ----
# The class attribute looks like "star-rating Three"; drop the prefix so only
# the rating word remains.
ratings <- page %>%
  html_nodes(".star-rating") %>%
  html_attr("class") %>%
  str_replace_all("star-rating ", "")

# 4. Scraping Prices ----
# Prices are kept as text, including the currency symbol.
price <- page %>%
  html_nodes(".price_color") %>%
  html_text() %>%
  str_trim() # Remove any leading/trailing whitespace

# 5. Scraping Images ----
# Image sources are relative too; resolve them the same way as the links.
images <- page %>%
  html_nodes(".image_container img") %>%
  html_attr("src") %>%
  url_absolute(base = url)

# 6. Scraping Availability ----
# Extract stock availability and clean up the text
availability <- page %>%
  html_nodes(".instock.availability") %>%
  html_text() %>%
  str_replace_all("\\n", "") %>% # Remove all newline characters
  str_trim() # Trim any extra whitespace

# 7. Creating a Data Frame ----
# data.frame() silently recycles shorter vectors when lengths divide evenly,
# which would misalign rows; insist that every field has one value per book.
field_lengths <- lengths(list(
  Title = titles,
  URL = urls,
  Rating = ratings,
  Price = price,
  Image = images,
  Availability = availability
))
if (length(unique(field_lengths)) != 1L) {
  stop(
    "Scraped fields have differing lengths: ",
    paste0(names(field_lengths), "=", field_lengths, collapse = ", "),
    call. = FALSE
  )
}

# Combine all the scraped data into a single data frame
scraped <- data.frame(
  Title = titles,
  URL = urls,
  Rating = ratings,
  Price = price,
  Image = images,
  Availability = availability,
  stringsAsFactors = FALSE # Avoid converting text columns to factors
)

# Check the scraped data (optional)
print(scraped)

# 8. Exporting Data to a CSV File ----
# Save the scraped data to a CSV file
write.csv(scraped, "bookscrapping.csv", row.names = FALSE)

# Print a success message
cat("Data scraping complete! Saved to bookscrapping.csv\n")


Loading required package: rvest

Loading required package: stringr



                                                                                            Title
1                                                                            A Light in the Attic
2                                                                              Tipping the Velvet
3                                                                                      Soumission
4                                                                                   Sharp Objects
5                                                           Sapiens: A Brief History of Humankind
6                                                                                 The Requiem Red
7                                              The Dirty Little Secrets of Getting Your Dream Job
8         The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
9  The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
10                  