# Web Scraping with Python and BeautifulSoup


## Loading page source code into Python

Link: https://en.wikipedia.org/wiki/Virat_Kohli

In [1]:
import requests as req

In [2]:
# Address of the Wikipedia article that is requested in the next cell.
URL = 'https://en.wikipedia.org/wiki/Virat_Kohli'

In [3]:
# Fetch the page. requests applies no timeout by default, so a stalled
# connection would hang this cell indefinitely; set one explicitly.
r = req.get(URL, timeout=10)
# Fail here on a 4xx/5xx response instead of silently parsing an error page
# as if it were the article.
r.raise_for_status()

In [4]:
# Peek at the first 200 bytes of the raw (still undecoded) response body.
print(r.content[0:200])

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Virat Kohli - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgB'


In [5]:
from bs4 import BeautifulSoup

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (emitting GuessedAtParserWarning), so the resulting
# tree can differ between machines. "html.parser" ships with Python.
soup = BeautifulSoup(r.content, "html.parser")

# Show only the beginning of the pretty-printed document; printing the whole
# page floods the notebook with output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Virat Kohli - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"87a8eabb-43ae-460b-a225-90d03d9a69e2","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Virat_Kohli","wgTitle":"Virat Kohli","wgCurRevisionId":989161545,"wgRevisionId":989161545,"wgArticleId":16017429,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: external links","Articles with short description","Short description is different from Wikidata","Wikipedia pages semi-protected against vandali

In [6]:
# First <h1> element in the document (same lookup as the soup.h1 shorthand).
title = soup.find("h1")
print(title)

<h1 class="firstHeading" id="firstHeading" lang="en">Virat Kohli</h1>


In [7]:
# First <img> element in the document (same lookup as the soup.img shorthand).
print(soup.find("img"))

<img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/>


In [12]:
# Collect every <table> element and report how many were found.
tables = soup.find_all("table")
print(len(tables))
# enumerate(..., start=1) yields the 1-based label together with the tag,
# replacing manual indexing through range(len(tables)).
for number, table in enumerate(tables, start=1):
    print('table number :' + str(number))
    print(table)

37
table number :1
<table class="infobox vcard" style="width:22em;width: 25em"><caption class="fn" style="font-size: 125%"><span class="fn">Virat Kohli</span></caption><tbody><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:The_President,_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli,_at_a_Civil_Investiture_Ceremony,_at_Rashtrapati_Bhavan,_in_New_Delhi_on_March_30,_2017_(cropped).jpg"><img alt="The President, Shri Pranab Mukherjee presenting the Padma Shri Award to Shri Virat Kohli, at a Civil Investiture Ceremony, at Rashtrapati Bhavan, in New Delhi on March 30, 2017 (cropped).jpg" data-file-height="865" data-file-width="654" decoding="async" height="344" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7c/The_President%2C_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli%2C_at_a_Civil_Investiture_Ceremony%2C_at_Rashtrapati_Bhavan%2C_in_New_Delhi_on_March_30%2C_2017_%28cropped%29.jpg/260px-thumbnai

In [13]:
# Inline style of the second table, read from the tag's attribute dictionary.
tables[1].attrs["style"]

'width:100%; margin:-1px; white-space:nowrap;'

In [14]:
# Collect every <li> element in the page and report how many there are.
lists = soup.find_all(name="li")
print(len(lists))

1126


In [17]:
# Materialise the direct children of the <li> at index 300 so they can be counted.
childs = [*lists[300].children]
print(len(childs))

4


In [18]:
# Collect every <a> element in the page and report how many there are.
links = soup.find_all(name="a")
print(len(links))

2845


In [19]:
# Display the first five anchors as a sample.
links[:5]

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected due to vandalism"><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="image" href="/wiki/File:The_President,_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli,_at_a_Civil_Investiture_Ceremony,_at_Rashtrapati_Bhavan,_in_New_Delhi_on_March_30,_2017_(cropped).jpg"><img alt="The President, Shri Prana

In [24]:
# Keep only <a> tags whose class attribute is "mw-jump-link".
attr_filter = {"class": "mw-jump-link"}
soup.find_all("a", attrs=attr_filter)

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>]

In [26]:
# Several attributes in one filter: a tag must match both class and href.
attr_filter = {
    "class": "mw-jump-link",
    "href": "#mw-head",
}
soup.find_all("a", attrs=attr_filter)

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>]

In [27]:
# No tag name is given, so elements of any type with class "fn" are returned.
attr_filter = {"class": "fn"}
soup.find_all(attrs=attr_filter)

[<caption class="fn" style="font-size: 125%"><span class="fn">Virat Kohli</span></caption>,
 <span class="fn">Virat Kohli</span>,
 <span class="fn"><span style="white-space:nowrap;margin-right:.25em;"><a href="/wiki/File:Virat_Kohli.ogg" title="About this sound"><img alt="About this sound" data-file-height="20" data-file-width="20" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/11px-Loudspeaker.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/17px-Loudspeaker.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/22px-Loudspeaker.svg.png 2x" width="11"/></a></span><a class="internal" href="//upload.wikimedia.org/wikipedia/commons/e/e4/Virat_Kohli.ogg" title="Virat Kohli.ogg">pronunciation</a></span>,
 <div class="fn org" id="India_squad_–_2011_Cricket_World_Cup_(2nd_Title)" style="font-size:114%;margin:0 4em"><a class="mw-redirect" href="/wiki/Indian_cricket_team" title

In [29]:
# Look an element up by its id attribute, regardless of tag name.
attr_filter = {"id": "firstHeading"}
soup.find_all(attrs=attr_filter)

[<h1 class="firstHeading" id="firstHeading" lang="en">Virat Kohli</h1>]

In [30]:
# CSS path to the infobox image, written as one child-combinator step per line.
selector = " > ".join([
    ".infobox",
    "tbody:nth-child(2)",
    "tr:nth-child(1)",
    "td:nth-child(1)",
    "a:nth-child(1)",
    "img:nth-child(1)",
])
soup.select(selector)

[<img alt="The President, Shri Pranab Mukherjee presenting the Padma Shri Award to Shri Virat Kohli, at a Civil Investiture Ceremony, at Rashtrapati Bhavan, in New Delhi on March 30, 2017 (cropped).jpg" data-file-height="865" data-file-width="654" decoding="async" height="344" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7c/The_President%2C_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli%2C_at_a_Civil_Investiture_Ceremony%2C_at_Rashtrapati_Bhavan%2C_in_New_Delhi_on_March_30%2C_2017_%28cropped%29.jpg/260px-thumbnail.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/7c/The_President%2C_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli%2C_at_a_Civil_Investiture_Ceremony%2C_at_Rashtrapati_Bhavan%2C_in_New_Delhi_on_March_30%2C_2017_%28cropped%29.jpg/390px-thumbnail.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/7c/The_President%2C_Shri_Pranab_Mukherjee_presenting_the_Padma_Shri_Award_to_Shri_Virat_Kohli%2C_at

In [32]:
# Select by CSS class, stopping after the first three matches.
selector = ".thumbimage"
soup.select(selector, limit=3)

[<img alt="" class="thumbimage" data-file-height="1191" data-file-width="1163" decoding="async" height="225" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8b/India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg/220px-India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8b/India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg/330px-India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/8b/India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg/440px-India_Vs_New_zealand_One_day_International%2C_10_December_2010_%286160488612%29.jpg 2x" width="220"/>,
 <img alt="" class="thumbimage" data-file-height="1163" data-file-width="2000" decoding="async" height="145" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Virat_Ko