This is an example of web scraping the site "https://www.example.com/" using BeautifulSoup.

In [2]:
import requests

In [3]:
# Fetch the page. requests has no default timeout, so set one to keep the cell
# from hanging forever, and fail loudly on an HTTP error status instead of
# silently parsing an error page further down.
res = requests.get("https://www.example.com/", timeout=10)
res.raise_for_status()

In [4]:
# Inspect what kind of object requests.get() handed back.
type(res)

requests.models.Response

In [8]:
# The response body decoded to a string: the page's raw HTML source.
res.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [10]:
import bs4

In [11]:
# Parse the HTML string into a navigable tree. "lxml" selects the lxml parser,
# which has to be installed separately from bs4.
soup = bs4.BeautifulSoup(res.text, "lxml")

In [14]:
# Display the parsed document.
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [16]:
# select() takes a CSS selector and returns a list of every matching tag.
soup.select('title')

[<title>Example Domain</title>]

In [18]:
# Note: despite the singular name, title_tag is the list returned by select().
title_tag = soup.select('title')

In [20]:
# Index into the list to get the tag itself.
title_tag[0]

<title>Example Domain</title>

In [22]:
# Each element of the list is a bs4 Tag object.
type(title_tag[0])

bs4.element.Tag

In [24]:
# getText() returns the text inside the tag, without the surrounding markup.
title_tag[0].getText()

'Example Domain'

In [26]:
# All <p> elements on the page, again as a list.
ptag = soup.select('p')

In [28]:
# Text of the first paragraph.
ptag[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

# Scraping Content off of a Wikipedia Page

In [30]:
# Fetch the Wikipedia article. The timeout keeps the cell from blocking
# indefinitely, and raise_for_status() stops the notebook on an HTTP error
# rather than letting an error page be parsed as if it were the article.
res1 = requests.get("https://en.wikipedia.org/wiki/Shah_Rukh_Khan", timeout=10)
res1.raise_for_status()

In [32]:
# Parse the article HTML with the lxml parser, as was done for the first page.
soup1 = bs4.BeautifulSoup(res1.text,'lxml')

In [34]:
# Display the parsed article (the output is very long).
soup1

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Shah Rukh Khan - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-f

In [36]:
# A leading "." selects by CSS class: every element whose class is
# "vector-toc-text", i.e. the entries of the article's table of contents.
soup1.select('.vector-toc-text')

[<div class="vector-toc-text">(Top)</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1</span>
 <span>Early life and family</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1.1</span>
 <span>Parents</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1.2</span>
 <span>Early life</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2</span>
 <span>Acting career</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.1</span>
 <span>1988–1992: Television and film debut</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.2</span>
 <span>1993–1994: Negative characters</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.3</span>
 <span>1995–1998: Romantic roles</span>
 </div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.4</span>
 <span>1999–2003: Career fluctuations</span>
 </div>,
 <div class="

In [38]:
# Index 0 is the "(Top)" entry, so index 1 is the first numbered section.
first_item = soup1.select('.vector-toc-text')[1]

In [40]:
# .text joins the text of the nested <span> elements, keeping the newlines
# that sit between them in the markup.
first_item.text

'\n1\nEarly life and family\n'

In [42]:
# Print the text of every table-of-contents entry, one entry per iteration.
for toc_entry in soup1.select('.vector-toc-text'):
    print(toc_entry.text)

(Top)

1
Early life and family


1.1
Parents


1.2
Early life


2
Acting career


2.1
1988–1992: Television and film debut


2.2
1993–1994: Negative characters


2.3
1995–1998: Romantic roles


2.4
1999–2003: Career fluctuations


2.5
2004–2009: Comeback


2.6
2010–2014: Expansion to action and comedy


2.7
2015–2022: Career setbacks and hiatus


2.8
2023–present: Resurgence


3
Other work


3.1
Film production and television hosting


3.2
Stage performances


3.3
Ownership of Kolkata Knight Riders


4
In the media


5
Awards and recognitions


6
Personal life


7
See also


8
Footnotes


9
References


10
Bibliography


11
Further reading


12
External links



# Scraping an Image off of a Wikipedia Page

In [44]:
# Every <img> element on the page, selected by tag name.
soup1.select('img')

[<img alt="" aria-hidden="true" class="mw-logo-icon" height="50" src="/static/images/icons/wikipedia.png" width="50"/>,
 <img alt="Wikipedia" class="mw-logo-wordmark" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"/>,
 <img alt="The Free Encyclopedia" class="mw-logo-tagline" height="13" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" style="width: 7.3125em; height: 0.8125em;" width="117"/>,
 <img alt="Featured article" class="mw-file-element" data-file-height="443" data-file-width="466" decoding="async" height="19" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/20px-Cscr-featured.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/30px-Cscr-featured.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/40px-Cscr-featured.svg.png 2x" width="20"/>,
 <img alt="Page semi-protected" class="mw-file-element" data-file-height="512" data-file-width=

In [46]:
# The first <img> on the page is the Wikipedia logo icon, not article content.
soup1.select('img')[0]

<img alt="" aria-hidden="true" class="mw-logo-icon" height="50" src="/static/images/icons/wikipedia.png" width="50"/>

In [48]:
# Narrow the search to images carrying the "mw-file-element" class, which
# leaves out the site logo images matched by the plain 'img' selector.
soup1.select('.mw-file-element')

[<img alt="Featured article" class="mw-file-element" data-file-height="443" data-file-width="466" decoding="async" height="19" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/20px-Cscr-featured.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/30px-Cscr-featured.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/40px-Cscr-featured.svg.png 2x" width="20"/>,
 <img alt="Page semi-protected" class="mw-file-element" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/>,
 <img class="mw-file-element" data-file-height="529" da

In [50]:
# The third match (index 2) is the first one that is an article photo; the two
# before it are the small "featured" and "semi-protected" status icons.
soup1.select('.mw-file-element')[2]

<img class="mw-file-element" data-file-height="529" data-file-width="369" decoding="async" height="315" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg/220px-Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg/330px-Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg 2x" width="220"/>

In [52]:
# Keep a handle on that tag. The index is positional, so it may point at a
# different image if the page's markup changes.
image = soup1.select('.mw-file-element')[2]

In [54]:
# Tag attributes are read like dictionary keys; 'class' comes back as a list.
image['class']

['mw-file-element']

In [56]:
# 'src' holds the image location as a protocol-relative URL (it starts with
# "//"), so a scheme such as "https:" must be prepended before requesting it.
image['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg/220px-Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg'

### <img 
      src='//upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg/220px-Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg'>

<img 
src ="//upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg/220px-Shah_Rukh_Khan_graces_the_launch_of_the_new_Santro.jpg">

In [58]:
# Build the download URL from the tag rather than pasting it by hand, so this
# cell always fetches whatever image the cells above selected. The src
# attribute is protocol-relative (starts with "//"), hence the explicit scheme.
image_url = "https:" + image['src']

# NOTE: image_link holds the HTTP response (the bytes are in .content), not a URL.
# Use a timeout so the cell cannot hang, and raise on an HTTP error so an error
# body is never saved to disk as if it were the picture.
image_link = requests.get(image_url, timeout=10)
image_link.raise_for_status()

In [60]:
# .content is the response body as raw bytes, here the image file's binary data.
image_link.content

b'\xff\xd8\xff\xdb\x00C\x00\x04\x03\x03\x04\x03\x03\x04\x04\x03\x04\x05\x04\x04\x05\x06\n\x07\x06\x06\x06\x06\r\t\n\x08\n\x0f\r\x10\x10\x0f\r\x0f\x0e\x11\x13\x18\x14\x11\x12\x17\x12\x0e\x0f\x15\x1c\x15\x17\x19\x19\x1b\x1b\x1b\x10\x14\x1d\x1f\x1d\x1a\x1f\x18\x1a\x1b\x1a\xff\xdb\x00C\x01\x04\x05\x05\x06\x05\x06\x0c\x07\x07\x0c\x1a\x11\x0f\x11\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\xff\xc0\x00\x11\x08\x01;\x00\xdc\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1d\x00\x00\x01\x04\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x03\x05\x06\x07\x01\x02\x08\x00\t\xff\xc4\x00J\x10\x00\x02\x01\x02\x04\x03\x05\x05\x04\x06\x06\x08\x06\x03\x00\x00\x01\x02\x03\x04\x11\x00\x05\x12!\x061A\x07\x13"Qa\x14q\x81\x91\xa1\x082B\xb1\x15#R\xc1\xd1\xf03b\x82\x92\xa2\xe1\x16$Ccrs\xb2\xc2%S\x83\xa3\xd2\xf1\x174\x94\xff\xc4\x00\x1b\x01\x00\x02

In [62]:
# Save the downloaded bytes to disk in binary mode. The with-block closes the
# file even if the write raises, so the handle cannot leak and the data is
# flushed without relying on a separate close() cell being run.
with open('SRK+image.jpg', 'wb') as f:
    f.write(image_link.content)