In [1]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

## Write a Python program to display all the header tags from wikipedia.org.

In [2]:
def getHeaders(url):
    """Download ``url`` and collect the text of every heading element on it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    dict[str, list[str]]
        Maps each heading tag name present in the parsed page (``'h1'``,
        ``'h2'``, ...) to the whitespace-stripped text of its elements, in
        document order. Keys appear in the order their first element occurs;
        levels with no elements are not included.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    # Match on the parsed tag names instead of scanning the raw markup for
    # "<h<digit>": Beautiful Soup's HTML parsers lowercase tag names, so
    # upper-case markup such as <H1> is found too, and a single pass over the
    # tree yields a deterministic key order.
    heading_name = re.compile(r'^h[0-9]$')

    headers = {}
    for tag in soup.find_all(heading_name):
        headers.setdefault(tag.name, []).append(tag.text.strip())
    return headers

# Page whose heading tags are listed; the bare call on the last line lets the
# notebook render the returned dict as the cell output.
url = "https://www.wikipedia.org/"
getHeaders(url)

{'h2': ['1,000,000+\n\n\narticles',
  '100,000+\n\n\narticles',
  '10,000+\n\n\narticles',
  '1,000+\n\n\narticles',
  '100+\n\n\narticles'],
 'h1': ['Wikipedia\n\nThe Free Encyclopedia']}

## Write a Python program to display data for IMDb’s 100 top-rated movies (i.e. name, rating, year of release) and make a data frame.

In [3]:
def getTop100IMDBMovies(url, http_headers, limit=100):
    """Scrape the IMDb chart page at ``url`` into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the chart page.
    http_headers : dict
        HTTP headers sent with the request.
    limit : int or None, optional
        Maximum number of chart entries to keep (default 100). ``None`` keeps
        every entry found on the page.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns ``Name``, ``Audiance_ratings``,
        ``Year_of_release``, ``Runtime`` and ``Rated``. A rating or metadata
        value that the page does not provide for a title is stored as a
        missing value instead of raising.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page yields different numbers of titles, ratings and metadata
        rows (raised by pandas when the frame is built), or if a year is not
        an integer.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(url, headers=http_headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # NOTE(review): the "sc-..." class names below look build-generated and
    # may change when the site is redeployed -- confirm against the live page.

    # Chart titles are rendered as "<rank>. <name>"; headings without the
    # rank prefix are skipped. One pattern both filters and extracts.
    ranked_title = re.compile(r'^\d+\.\s*(.+)')
    names = []
    for heading in soup.find_all('h3', class_='ipc-title__text'):
        match = ranked_title.search(heading.text)
        if match is not None:
            names.append(match.group(1).strip())
    names = names[:limit]

    rating_number = re.compile(r'\d+\.\d+')
    audiance_ratings = []
    for span in soup.find_all('span', class_='sc-b0691f29-1 grHDBY')[:limit]:
        match = rating_number.search(span.text)
        audiance_ratings.append(float(match.group()) if match is not None else None)

    years = []
    runtimes = []
    rated_labels = []
    metadata_rows = soup.find_all('div', class_='sc-b0691f29-7 hrgukm cli-title-metadata')[:limit]
    for metadata in metadata_rows:
        items = [
            span.text
            for span in metadata.find_all('span', class_='sc-b0691f29-8 ilsLEX cli-title-metadata-item')
        ]
        # Pad to three entries so a title without a runtime or certificate
        # does not raise IndexError.
        items += [None] * (3 - len(items))
        year, runtime, rated = items[:3]
        years.append(int(year) if year is not None else None)
        runtimes.append(runtime)
        rated_labels.append(rated)

    # 'Audiance_ratings' keeps its existing spelling: it is the column name
    # callers of this function already receive.
    return pd.DataFrame({
        'Name': names,
        'Audiance_ratings': audiance_ratings,
        'Year_of_release': years,
        'Runtime': runtimes,
        'Rated': rated_labels
    })

# Chart URL, including its sort query parameters.
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250&sort=user_rating%2Cdesc"

# Request headers carrying a desktop-browser User-Agent string.
http_headers = {}
http_headers['USER-AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'

# Bare call as the final expression so the notebook displays the DataFrame.
getTop100IMDBMovies(url, http_headers)

Unnamed: 0,Name,Audiance_ratings,Year_of_release,Runtime,Rated
0,The Shawshank Redemption,9.3,1994,2h 22m,MA15+
1,The Godfather,9.2,1972,2h 55m,R
2,The Dark Knight,9.0,2008,2h 32m,M
3,12th Fail,9.0,2023,2h 27m,PG
4,Schindler's List,9.0,1993,3h 15m,M
...,...,...,...,...,...
95,A Clockwork Orange,8.3,1971,2h 16m,R
96,Scarface,8.3,1983,2h 50m,R
97,Full Metal Jacket,8.3,1987,1h 56m,R
98,Braveheart,8.3,1995,2h 58m,MA15+


## Write a Python program to scrape the following details from dineout.co.in: i) Restaurant name, ii) Cuisine, iii) Location, iv) Ratings, v) Image URL.

In [4]:
def _joined_link_text(container):
    """Return the text of every <a> inside ``container`` joined by ', ' ('' if absent)."""
    if container is None:
        return ''
    return ', '.join(link.text.strip() for link in container.find_all('a'))


def _rating_or_none(card):
    """Return the rating shown on ``card`` as a float, or None if missing or non-numeric."""
    image_wrap = card.find('div', class_='img-wrap')
    inner_divs = image_wrap.find_all('div') if image_wrap is not None else []
    if not inner_divs:
        return None
    try:
        # The rating is read from the last <div> nested in the image wrapper.
        return float(inner_divs[-1].text)
    except ValueError:
        return None


def extract_resturant_details(url):
    """Scrape restaurant cards from the listing page at ``url``.

    Parameters
    ----------
    url : str
        Address of the listing page.

    Returns
    -------
    dict[str, list]
        Column-oriented data with the keys ``'name'``, ``'cuisine'``,
        ``'location'``, ``'rating'`` and ``'image_url'``. All lists have one
        entry per restaurant card, in page order. A name, rating or image URL
        that a card does not provide is stored as ``None`` (an empty string
        for cuisine/location) so the lists stay aligned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    resturant_details = {
        'name': [],
        'cuisine': [],
        'location': [],
        'rating': [],
        'image_url': []
    }

    for card in soup.find_all('div', class_='restnt-main-wrap clearfix'):
        # The image address is read from the 'data-src' attribute; .get()
        # avoids a KeyError when the attribute is absent.
        image = card.find('img', class_='no-img')
        image_url = image.get('data-src') if image is not None else None

        info = card.find('div', class_='restnt-info cursor')
        name_link = info.a if info is not None else None
        name = name_link.text if name_link is not None else None

        location = _joined_link_text(card.find('div', class_='restnt-loc ellipsis'))
        cuisine = _joined_link_text(card.find('span', class_='double-line-ellipsis'))

        # Append all five values together so a partially filled card can
        # never leave the lists with different lengths.
        resturant_details['image_url'].append(image_url)
        resturant_details['name'].append(name)
        resturant_details['location'].append(location)
        resturant_details['cuisine'].append(cuisine)
        resturant_details['rating'].append(_rating_or_none(card))
    return resturant_details

# Listing URL copied as-is; per its query string it targets Kolkata, sorts by
# rating (descending) and asks for up to 100 entries.
url = (
    'https://www.dineout.co.in/kolkata-restaurants/welcome-back?city_name=kolkata&limit=100&start=0&cityId=6&listing=1&showAvailableTicket=0&sortby=ratingDESC&tag%5B%5D=Welcome%20Back&tag%5B%5D=Welcome%20Back&p=30'
)
extract_resturant_details(url)

{'name': ['Blue Nile Lounge',
  'The Burger Factory',
  'Eatelicious',
  'Wow! Momo',
  'Loca Kitchen & Lounge',
  'Privy Ultra Lounge',
  'Subway',
  'Ego Retro Lounge',
  'Ocean Grill',
  'Den',
  'Ecstasea',
  'Buddha Bites',
  'Soul Kitchen',
  'Bawarchi Family Restaurant',
  'Jadu Kadai',
  'Subway',
  'Wow! Momo',
  'Wow! Momo',
  'Apna Dhaba',
  'The Hubb',
  'Bhooter Raja Dilo Bor',
  'Fries Before Guys',
  'Rooftop Top View',
  'Subway',
  'Ice Choco Cafe',
  'Since 2016 Cafe',
  'On My Way',
  'Azad Hind Dhaba',
  '7/12 Fried Ice Cream Parlour',
  'Club 21',
  'Eat Out',
  'Banana Leaf',
  'The Swig - Raajkutir, Swabhumi',
  'The Burger Factory',
  'So Southy',
  'Floriana',
  'Azad Hind Dhaba',
  "Sam's Pub",
  'KFC',
  'KFC',
  'KFC',
  'Aminia',
  'Hing',
  'Fuel Resto Bar',
  'Wow! Momo',
  'Urban Kitchen & Bar',
  'Wow! Momo',
  'Wow! Momo',
  'Wow! Momo',
  'Bawarchi Fast Food'],
 'cuisine': ['Finger Food, North Indian, Chinese',
  'Fast Food, North Indian, Chinese',
  

## Write a Python program to display the list of respected former finance ministers of India (i.e. name, term of office) from https://presidentofindia.nic.in/former-presidents.htm and make a data frame.

### the web-link doesn't have any content