# Web Scraping BeautifulSoup Assignment

In [1]:
!pip install bs4
!pip install requests 



# 1) Write a python program to display all the header tags from wikipedia.org and make data frame.


In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_headers(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all header tags
    headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    # Extract header text and store in a list
    header_text = [header.get_text() for header in headers]
    
    # Create a DataFrame from the header text list
    df = pd.DataFrame(header_text, columns=['Header'])
    
    return df

In [3]:
url = 'https://en.wikipedia.org/wiki/Main_Page'
df = scrape_headers(url)
print(df)

                          Header
0                      Main Page
1           Welcome to Wikipedia
2  From today's featured article
3               Did you know ...
4                    In the news
5                    On this day
6       Today's featured picture
7       Other areas of Wikipedia
8    Wikipedia's sister projects
9            Wikipedia languages


# 2) Write s python program to display list of respected former presidents of India(i.e. Name , Term ofoffice) from https://presidentofindia.nic.in/former-presidents.htm and make data frame.

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_former_presidents(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the former presidents
    table = soup.find('table', class_='tablepress')
    
    # Create lists to store the extracted data
    names = []
    terms = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:]:
        # Extract president's name
        name = row.find('td', class_='column-1').text.strip()
        names.append(name)
        
        # Extract president's term of office
        term = row.find('td', class_='column-2').text.strip()
        terms.append(term)
    
    # Create a dataframe using the extracted data
    data = {
        'Name': names,
        'Term of Office': terms
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://presidentofindia.nic.in/former-presidents.htm"
df = scrape_former_presidents(url)
print(df)


# 3) Write a python program to scrape cricket rankings from icc-cricket.com. You have to scrape and make data frame-
a) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_mens_odi_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    teams = []
    matches = []
    points = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 teams
        # Extract team name
        team = row.find('span', class_='u-hide-phablet').text.strip()
        teams.append(team)
        
        # Extract matches, points, and rating
        cells = row.find_all('td')
        match = cells[1].text.strip()
        point = cells[2].text.strip()
        rating = cells[3].text.strip()
        matches.append(match)
        points.append(point)
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Team': teams,
        'Matches': matches,
        'Points': points,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [6]:
url = "https://www.icc-cricket.com/rankings/mens/team-rankings/odi"
df = scrape_mens_odi_rankings(url)
print(df)


           Team           Matches Points Rating
0     Australia    Australia\nAUS     23  2,714
1      Pakistan     Pakistan\nPAK     20  2,316
2         India        India\nIND     33  3,807
3   New Zealand   New Zealand\nNZ     27  2,806
4       England      England\nENG     24  2,426
5  South Africa  South Africa\nSA     19  1,910
6    Bangladesh   Bangladesh\nBAN     25  2,451
7   Afghanistan  Afghanistan\nAFG     10    878
8     Sri Lanka     Sri Lanka\nSL     21  1,682
9   West Indies   West Indies\nWI     25  1,797


# b) Top 10 ODI Batsmen along with the records of their team and rating.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_mens_odi_batsmen_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    players = []
    teams = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 players
        # Extract player name
        player = row.find('td', class_='table-body__cell name').text.strip()
        players.append(player)
        
        # Extract team name
        team = row.find('span', class_='table-body__logo-text').text.strip()
        teams.append(team)
        
        # Extract rating
        rating = row.find('td', class_='table-body__cell u-text-right rating').text.strip()
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Player': players,
        'Team': teams,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting"
df = scrape_mens_odi_batsmen_rankings(url)
print(df)


# c) Top 10 ODI bowlers along with the records of their team andrating.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_mens_odi_bowlers_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    players = []
    teams = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 players
        # Extract player name
        player = row.find('td', class_='table-body__cell name').text.strip()
        players.append(player)
        
        # Extract team name
        team = row.find('span', class_='table-body__logo-text').text.strip()
        teams.append(team)
        
        # Extract rating
        rating = row.find('td', class_='table-body__cell u-text-right rating').text.strip()
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Player': players,
        'Team': teams,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling"
df = scrape_mens_odi_bowlers_rankings(url)
print(df)


# 4) Write a python program to scrape cricket rankings from icc-cricket.com. You have to scrape and make data frame-
a) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_womens_odi_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    teams = []
    matches = []
    points = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 teams
        # Extract team name
        team = row.find('span', class_='u-hide-phablet').text.strip()
        teams.append(team)
        
        # Extract matches, points, and rating
        cells = row.find_all('td')
        match = cells[1].text.strip()
        point = cells[2].text.strip()
        rating = cells[3].text.strip()
        matches.append(match)
        points.append(point)
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Team': teams,
        'Matches': matches,
        'Points': points,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [8]:
url = "https://www.icc-cricket.com/rankings/womens/team-rankings/odi"
df = scrape_womens_odi_rankings(url)
print(df)


           Team           Matches Points Rating
0     Australia    Australia\nAUS     21  3,603
1       England      England\nENG     28  3,342
2  South Africa  South Africa\nSA     26  3,098
3         India        India\nIND     27  2,820
4   New Zealand   New Zealand\nNZ     25  2,553
5   West Indies   West Indies\nWI     27  2,535
6      Thailand     Thailand\nTHA     11    821
7    Bangladesh   Bangladesh\nBAN     14    977
8      Pakistan     Pakistan\nPAK     27  1,678
9     Sri Lanka     Sri Lanka\nSL      9    479


# b) Top 10 women’s ODI Batting players along with the records of their team and rating.

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_womens_odi_batting_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    players = []
    teams = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 players
        # Extract player name
        player = row.find('td', class_='table-body__cell name').text.strip()
        players.append(player)
        
        # Extract team name
        team = row.find('span', class_='table-body__logo-text').text.strip()
        teams.append(team)
        
        # Extract rating
        rating = row.find('td', class_='table-body__cell u-text-right rating').text.strip()
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Player': players,
        'Team': teams,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting"
df = scrape_womens_odi_batting_rankings(url)
print(df)


# c) Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_womens_odi_allrounder_rankings(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table containing the rankings
    table = soup.find('table', class_='table')
    
    # Create lists to store the extracted data
    players = []
    teams = []
    ratings = []
    
    # Extract the details from each row in the table
    rows = table.find_all('tr')
    for row in rows[1:11]:  # Extract only the top 10 players
        # Extract player name
        player = row.find('td', class_='table-body__cell name').text.strip()
        players.append(player)
        
        # Extract team name
        team = row.find('span', class_='table-body__logo-text').text.strip()
        teams.append(team)
        
        # Extract rating
        rating = row.find('td', class_='table-body__cell u-text-right rating').text.strip()
        ratings.append(rating)
    
    # Create a dataframe using the extracted data
    data = {
        'Player': players,
        'Team': teams,
        'Rating': ratings
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder"
df = scrape_womens_odi_allrounder_rankings(url)
print(df)


# 5) Write a python program to scrape mentioned news details from https://www.cnbc.com/world/?region=world and make data frame-
i) Headline
ii) Time
iii) News Link

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_news_details(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the section containing the news articles
    section = soup.find('div', class_='Latest-news-list-container')
    
    # Create lists to store the extracted data
    headlines = []
    times = []
    news_links = []
    
    # Extract the details from each news article
    articles = section.find_all('div', class_='Card-title')
    for article in articles:
        # Extract headline
        headline = article.find('a').text.strip()
        headlines.append(headline)
        
        # Extract time
        time = article.find('span', class_='Card-time').text.strip()
        times.append(time)
        
        # Extract news link
        news_link = article.find('a')['href']
        news_links.append(news_link)
    
    # Create a dataframe using the extracted data
    data = {
        'Headline': headlines,
        'Time': times,
        'News Link': news_links
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.cnbc.com/world/?region=world"
df = scrape_news_details(url)
print(df)


# 6) Write a python program to scrape the details of most downloaded articles from AI in last 90 days.https://www.journals.elsevier.com/artificial-intelligence/most-downloaded-articles Scrape below mentioned details and make data frame-
i) Paper Title
ii) Authors
iii) Published Date
iv) Paper URL

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_most_downloaded_articles(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the section containing the most downloaded articles
    section = soup.find('section', id='body')
    
    # Create lists to store the extracted data
    titles = []
    authors = []
    dates = []
    paper_urls = []
    
    # Extract the details from each article card
    article_cards = section.find_all('li', class_='result-list-item')
    for card in article_cards:
        # Extract paper title
        title = card.find('a', class_='result-list-title-link').text.strip()
        titles.append(title)
        
        # Extract authors
        author_tags = card.find_all('span', class_='author-list')
        author_list = [author.text.strip() for author in author_tags]
        authors.append(', '.join(author_list))
        
        # Extract published date
        date = card.find('span', class_='published-online').text.strip()
        dates.append(date)
        
        # Extract paper URL
        paper_url = card.find('a', class_='result-list-title-link')['href']
        paper_urls.append(paper_url)
    
    # Create a dataframe using the extracted data
    data = {
        'Paper Title': titles,
        'Authors': authors,
        'Published Date': dates,
        'Paper URL': paper_urls
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.journals.elsevier.com/artificial-intelligence/most-downloaded-articles"
df = scrape_most_downloaded_articles(url)
print(df)
 

# 7) Write a python program to scrape mentioned details from dineout.co.in and make data frame-
i) Restaurant name
ii) Cuisine
iii) Location
iv) Ratings
v) Image URL

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_restaurant_details(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all the restaurant cards
    restaurant_cards = soup.find_all('div', class_='restnt-card')
    
    # Create lists to store the extracted data
    names = []
    cuisines = []
    locations = []
    ratings = []
    image_urls = []
    
    # Extract the details from each restaurant card
    for card in restaurant_cards:
        # Extract restaurant name
        name = card.find('div', class_='restnt-name ellipsis').text.strip()
        names.append(name)
        
        # Extract cuisine
        cuisine = card.find('div', class_='restnt-cuisine').text.strip()
        cuisines.append(cuisine)
        
        # Extract location
        location = card.find('div', class_='restnt-loc ellipsis').text.strip()
        locations.append(location)
        
        # Extract ratings
        rating = card.find('div', class_='restnt-rating').text.strip()
        ratings.append(rating)
        
        # Extract image URL
        image_url = card.find('div', class_='restnt-thumbnail').find('img')['data-src']
        image_urls.append(image_url)
    
    # Create a dataframe using the extracted data
    data = {
        'Restaurant Name': names,
        'Cuisine': cuisines,
        'Location': locations,
        'Ratings': ratings,
        'Image URL': image_urls
    }
    df = pd.DataFrame(data)
    
    return df


In [None]:
url = "https://www.dineout.co.in/delhi-restaurants"
df = scrape_restaurant_details(url)
print(df)
