In [None]:
from bs4 import BeautifulSoup # class for parsing HTML documents
import requests # library for making HTTP requests

In [None]:
# Download the NDTV "latest" listing page that the following cells parse.
url = 'https://www.ndtv.com/latest'
# requests applies no timeout by default, so a stalled server would hang this
# cell forever; give up after 30 seconds instead.
page = requests.get(url, timeout=30)
print(page.status_code) # 200 means everything went okay

In [None]:
# Preview only the beginning of the raw HTML: printing the whole document
# floods the notebook output and bloats the saved file.
print(page.text[:1000])

In [None]:
# A small hand-written HTML document to practise on before parsing a real page.
demo_html_code = '''
<html>
    <head>
        <title>My First Website</title>
    </head>
    <body>
        <h1>Hello World</h1>
        <p class="subtitle">This is a paragraph</p>
        <p>Here is another paragraph</p>
    </body>
</html>
'''
# Parse the markup using the lxml parser.
demo_soup = BeautifulSoup(demo_html_code, 'lxml')

# Three lookups: the first <h1>, every <p>, and the first <p> with class "subtitle".
first_heading = demo_soup.find('h1')
all_paragraphs = demo_soup.find_all('p')
subtitle_paragraph = demo_soup.find('p', class_='subtitle')
print(first_heading, all_paragraphs, subtitle_paragraph, sep='\n')

In [None]:
# Inspect what find() hands back for the <h1> lookup.
item = demo_soup.find('h1')
# Print the type explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed (only a cell's last expression is shown).
print(type(item))
print(item.text)
item2 = demo_soup.find_all('p')[0] # first paragraph
print(item2)        # the whole element, markup included
print(item2.text)   # just the text inside it
print(item2.attrs)  # its attributes as a dict

## parsing data from the webpage using the requests response object

In [None]:
# Build the parse tree for the page downloaded earlier (lxml parser).
soup = BeautifulSoup(page.text, 'lxml')
# find() yields only the first matching element for each tag name.
for tag_name in ('a', 'h2'):
    print(soup.find(tag_name))

In [None]:
# Collect every <h2> element on the page and print its text.
# find_all is the supported spelling; findAll is a legacy alias kept from
# Beautiful Soup 3 and the rest of this notebook already uses find_all.
nt = soup.find_all('h2')
for heading in nt:
    print(heading.text)

In [None]:
# Print the whitespace-trimmed text of every <span class="posted-by"> element.
posted_by_spans = soup.find_all('span', class_='posted-by')
for span in posted_by_spans:
    trimmed = span.text.strip()
    print(trimmed)

## extracting all news from the page into a list of dictionaries

In [None]:
# Gather every <div class="news_Itm"> element; the next cell treats each one
# as a single news card.
cards = soup.find_all(name='div', class_='news_Itm')
print(f"total news cards found: {len(cards)}")

In [None]:
import pandas as pd


def _text_or_none(tag):
    """Return the whitespace-stripped text of ``tag``, or None if ``tag`` is None.

    ``find()`` returns None when nothing matches, so a missing element becomes
    a None field instead of an AttributeError that has to be swallowed.
    """
    return tag.text.strip() if tag is not None else None


# Build one dictionary per news card, then show them as a table.
new_data = []
for item in cards:
    # Explicit None checks replace the former bare ``except:`` clauses, which
    # would also have hidden unrelated errors (typos, KeyboardInterrupt, ...).
    title = _text_or_none(item.find('h2'))
    posted_by = _text_or_none(item.find('span', class_='posted-by'))
    # Store the summary *text*; previously the raw Tag object was stored,
    # unlike the title and posted_by fields.
    summary = _text_or_none(item.find('p', class_='newsCont'))
    image = item.find('img')
    imgurl = image.attrs.get('src') if image is not None else None
    new_data.append({
        'title': title,
        'posted_by': posted_by,
        'summary': summary,
        'imgurl': imgurl
    })
pd.DataFrame(new_data)

## using dputils to extract data

In [None]:
from dputils.scrape import Scraper, Tag

In [None]:
# Same kind of extraction as the manual BeautifulSoup loop, expressed through
# dputils: each keyword argument below names an output field and the Tag to
# read it from.
url = 'https://www.ndtv.com/india'
scraper = Scraper(url)

# Per-item fields; these keys become the DataFrame columns.
news_fields = {
    'title': Tag('h2'),
    'posted_by': Tag('span', cls='posted-by'),
    'summary': Tag('p', cls='newsCont'),
    'imgurl': Tag('img', output='src'),
}
# NOTE(review): 'lisingNews' (sic) is presumably the class name used by the
# site's markup -- confirm before "correcting" the spelling.
result = scraper.get_multiple_page_data(
    target=Tag(cls='lisingNews'),
    items=Tag(cls='news_Itm'),
    **news_fields,
)
df = pd.DataFrame(result)
df

In [None]:
# Scrape a Flipkart search-results page for mobiles with dputils.
url = "https://www.flipkart.com/search?q=mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"
scraper = Scraper(url)

# NOTE(review): these class names look auto-generated by the site and may
# change without notice -- verify them against the live page.
product_fields = {
    'title': Tag('div', cls='_4rR01T'),
    'price': Tag('div', cls='_30jeq3 _1_WHN1'),
    'link': Tag('a', cls='_1fQZEK', output='href'),
}
out = scraper.get_multiple_page_data(
    target=Tag('div', cls='_1YokD2 _3Mn1Gg'),
    items=Tag('div', cls='_1AtVbE col-12-12'),
    **product_fields,
)
df = pd.DataFrame(out)
df

## extracting data from a single page with dputils

In [90]:
# Pull a few named fields from a single Flipkart product page.
url ='https://www.flipkart.com/apple-iphone-14-blue-128-gb/p/itmdb77f40da6b6d?pid=MOBGHWFHSV7GUFWA&lid=LSTMOBGHWFHSV7GUFWA3AV8J8&marketplace=FLIPKART&q=mobiles&store=tyy%2F4io&srno=s_1_1&otracker=search&otracker1=search&iid=9fa0dc8d-608b-4844-8f16-fca8797cfe2e.MOBGHWFHSV7GUFWA.SEARCH&ssid=nqp9vuwdkg0000001700216770709&qH=eb4af0bf07c16429'
scraper2 = Scraper(url)

# Each key names an output field; each Tag says which element to read it from.
page_fields = {
    'product': Tag('span', cls='B_NuCI'),
    'price': Tag(cls='_30jeq3 _16Jk6d'),
    'highlights': Tag(cls='_2cM9lP'),
}
# Left as the cell's last expression so the returned value is displayed.
scraper2.get_page_data(**page_fields)

{'product': 'APPLE iPhone 14 (Blue, 128 GB)',
 'price': '₹60,999',
 'highlights': 'Highlights128 GB ROM15.49 cm (6.1 inch) Super Retina XDR Display12MP + 12MP | 12MP Front CameraA15 Bionic Chip, 6 Core Processor Processor'}