<a href="https://colab.research.google.com/github/dhanshrii2006/R-Language/blob/scraping-with-R/rvest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load required library
library(rvest)

# Wikipedia article that every section below scrapes
url <- "https://en.wikipedia.org/wiki/Gross_domestic_product"

# Download the article and parse it into a document object; all later
# selectors (CSS and XPath) are evaluated against `page`
page <- read_html(x = url)

# -------------------------------------------------------------------------
# 1. Extract the Title of the Page
# -------------------------------------------------------------------------
# The page title is taken from the first <h1> element in the document.
title <- page %>%
  html_node("h1") %>%
  html_text()

# Show the title, followed by a blank line
cat("Page Title:", title, "\n\n")

# -------------------------------------------------------------------------
# 2. Extract the First Paragraph
# -------------------------------------------------------------------------
# The first <p> in document order is not necessarily article prose: on this
# page it is a short sidebar label rather than the lead paragraph. Restrict
# the search to paragraphs that are direct children of the rendered article
# body and skip empty placeholder paragraphs.
# NOTE(review): "#mw-content-text .mw-parser-output" is the container used by
# Wikipedia's desktop HTML -- confirm if this script is pointed at other sites.
body_paragraphs <- html_nodes(page, "#mw-content-text .mw-parser-output > p")
body_text <- trimws(html_text(body_paragraphs))
body_text <- body_text[nzchar(body_text)]

if (length(body_text) > 0) {
  intro_paragraph <- body_text[1]
} else {
  # Fallback for pages without that container: first <p> anywhere on the page
  # (NA when the page has no <p> at all)
  intro_paragraph <- html_text(html_node(page, "p"))
}
cat("Introduction Paragraph:\n", intro_paragraph, "\n\n")  # Print the paragraph

# -------------------------------------------------------------------------
# 3. Extract the Table of Contents
# -------------------------------------------------------------------------
# First try the legacy inline TOC container (#toc). When that container is
# absent the selector matches nothing and the result is character(0), so fall
# back to rebuilding the outline from the article's own section headings.
toc_items <- html_text(html_nodes(page, "#toc li"))

if (length(toc_items) == 0) {
  heading_nodes <- html_nodes(
    page,
    "#mw-content-text h2, #mw-content-text h3"
  )
  toc_items <- html_text(heading_nodes)
  # Some heading markup embeds an "[edit]" link inside the heading text
  toc_items <- sub("\\[edit\\]\\s*$", "", toc_items)
}

# Normalise whitespace and drop entries that are empty after trimming
toc_items <- trimws(toc_items)
toc_items <- toc_items[nzchar(toc_items)]

cat("Table of Contents Items:\n")
print(toc_items)  # Print the TOC items
cat("\n")

# -------------------------------------------------------------------------
# 4. Extract Tables
# -------------------------------------------------------------------------
# `tables` holds every <table> on the page, parsed to a data frame, in
# document order.
table_nodes <- html_nodes(page, "table")
tables <- html_table(table_nodes, fill = TRUE)

cat("First Table Extracted:\n")
if (length(tables) == 0) {
  # Indexing tables[[1]] on an empty list would raise "subscript out of bounds"
  cat("No tables found on the page.\n")
} else {
  # The first <table> on a page can be a layout/legend table rather than
  # data, so preview the first table carrying the "wikitable" class when one
  # exists; otherwise fall back to the first table of any kind.
  is_data_table <- grepl(
    "(^|\\s)wikitable(\\s|$)",
    html_attr(table_nodes, "class")
  )
  preview_index <- if (any(is_data_table)) which(is_data_table)[1] else 1L
  print(head(tables[[preview_index]]))  # First few rows of the chosen table
}
cat("\n")

# -------------------------------------------------------------------------
# 5. Extract All Links
# -------------------------------------------------------------------------
# Every <a> element on the page
links <- page %>% html_nodes("a")

# Visible label and target of each anchor; the href is NA when an anchor has
# no href attribute
link_text <- links %>% html_text()
link_href <- links %>% html_attr("href")

# One row per anchor: its label next to its target
link_data <- data.frame(
  Text = link_text,
  URL = link_href,
  stringsAsFactors = FALSE
)

cat("Sample Links Extracted:\n")
print(head(link_data))  # Preview the first rows only
cat("\n")

# -------------------------------------------------------------------------
# 6. Extract All Images
# -------------------------------------------------------------------------
# Every <img> element on the page
images <- page %>% html_nodes("img")

# The 'src' attribute of each image, exactly as written in the HTML
image_src <- images %>% html_attr("src")

cat("Sample Image URLs:\n")
print(head(image_src))  # Preview the first entries only
cat("\n")

# -------------------------------------------------------------------------
# 7. Extract Metadata
# -------------------------------------------------------------------------
# Look for a page description in the standard description meta tag first and
# in the Open Graph description tag second. Not every page declares either
# one; in that case `description` stays NA and an explicit placeholder is
# printed instead of a bare "NA".
meta_selectors <- c(
  'meta[name="description"]',
  'meta[property="og:description"]'
)

description <- NA_character_
for (selector in meta_selectors) {
  candidate <- html_attr(html_node(page, selector), "content")
  if (!is.na(candidate) && nzchar(trimws(candidate))) {
    description <- candidate
    break
  }
}

description_label <- if (is.na(description)) {
  "(no meta description found)"
} else {
  description
}
cat("Meta Description:\n", description_label, "\n\n")  # Print the description

# -------------------------------------------------------------------------
# 8. Extract Specific Section (e.g., History Section)
# -------------------------------------------------------------------------
# Collect only the paragraphs that belong to the "History" section.
#
# Two problems are avoided here:
#   * Requiring <span id="History"> inside an <h2> matches only one heading
#     layout. The id may instead sit on the <h2> itself, which may in turn be
#     wrapped in a <div class="mw-heading ...">. The heading is therefore
#     located by id alone and then lifted to the element that is a sibling of
#     the section's paragraphs.
#   * A bare following-sibling::p has no upper bound and returns every later
#     paragraph in the article, so collection stops at the next level-2
#     heading.
history_anchor <- html_nodes(page, xpath = "//*[@id='History']")
history_section <- history_anchor[0]  # Empty node set until paragraphs are found

if (length(history_anchor) > 0) {
  heading <- history_anchor[1]

  # If the anchor's parent is the <h2> or the heading wrapper <div>, that
  # parent is the element that sits next to the section's paragraphs
  heading_parent <- html_nodes(
    heading,
    xpath = paste0(
      "parent::*[self::h2 or ",
      "contains(concat(' ', normalize-space(@class), ' '), ' mw-heading ')]"
    )
  )
  if (length(heading_parent) > 0) {
    heading <- heading_parent[1]
  }

  # Everything after the heading, cut off at the next level-2 heading
  following <- html_nodes(heading, xpath = "following-sibling::*")
  starts_next_section <- html_name(following) == "h2" |
    grepl("(^|\\s)mw-heading2(\\s|$)", html_attr(following, "class"))
  next_section <- which(starts_next_section)
  if (length(next_section) > 0) {
    following <- following[seq_len(next_section[1] - 1L)]
  }

  history_section <- following[html_name(following) == "p"]
}

history_text <- html_text(history_section)  # character(0) when nothing matched
cat("History Section Text:\n")
print(history_text)  # Print the extracted text
cat("\n")

# -------------------------------------------------------------------------
# 9. Count Elements (e.g., Number of Paragraphs)
# -------------------------------------------------------------------------
# Total number of <p> elements anywhere in the document
num_paragraphs <- page %>%
  html_nodes("p") %>%
  length()

cat("Number of Paragraphs on the Page:", num_paragraphs, "\n\n")


Page Title: Gross domestic product 

Introduction Paragraph:
 Empirical methods
 

Table of Contents Items:
character(0)

First Table Extracted:
# A tibble: 1 × 3
  X1                                                                 X2    X3   
  <chr>                                                              <chr> <chr>
1 .mw-parser-output .legend{page-break-inside:avoid;break-inside:av… $750… $50–…

Sample Links Extracted:
             Text                         URL
1 Jump to content                #bodyContent
2       Main page             /wiki/Main_Page
3        Contents    /wiki/Wikipedia:Contents
4  Current events /wiki/Portal:Current_events
5  Random article        /wiki/Special:Random
6 About Wikipedia       /wiki/Wikipedia:About

Sample Image URLs:
[1] "/static/images/icons/wikipedia.png"                                                                                                 
[2] "/static