In [1]:
# Display as slides with the Jupyter notebook RISE extension
# https://github.com/damianavila/RISE
from notebook.services.config import ConfigManager
cm = ConfigManager()`
cm.update('livereveal', {
              'theme': 'sans-serif',
              'transition': 'default',
              'start_slideshow_at': 'selected',
})

{'theme': 'sans-serif',
 'transition': 'default',
 'start_slideshow_at': 'selected'}

# Intro to Web Scraping


&nbsp;

### Matt Bauman
#### July 6, 2016

# What is HTML?

* Human *and* machine-readable text
* Supposed to be the semantic structure of a document

* Horribly abused
* Often terribly malformed
* Frequently unreadable by humans and just barely readable by machines

* It's a ~~miracle~~ ton of effort that makes browsers work at all

# Okay, but *what is it*?

* Plain-text markup that wraps content in **tags**
* Tags are marked in brackets like `<body>`
* And everything that follows is considered part of `body` until it's closed with a `</body>`.

* Tags can be nested
* Can be closed immediately without enclosing any content `<div />`.
* Can have attributes to modify their behavior or name them

In [2]:
from IPython.core.display import display, HTML
display(HTML('<p style="color:red;">Hello, world</h1>'))

In [4]:
import requests
print(requests.get('http://www.nytimes.com/').text)

<!DOCTYPE html>
<html lang="en-US"  xmlns:og="http://opengraphprotocol.org/schema/">
  <head>
    <title data-rh="true">The New York Times - Breaking News, US News, World News and Videos</title>
    <meta data-rh="true" name="robots" content="noarchive,noodp,noydir"/><meta data-rh="true" name="application-name" content="The New York Times"/><meta data-rh="true" name="msapplication-starturl" content="https://www.nytimes.com"/><meta data-rh="true" name="msapplication-task" content="name=Search;action-uri=https://www.nytimes.com/search/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/search.ico"/><meta data-rh="true" name="msapplication-task" content="name=Most Popular;action-uri=https://www.nytimes.com/gst/mostpopular.html?src=iepin;icon-uri=https://static01.nyt.com/images/icons/mostpopular.ico"/><meta data-rh="true" name="msapplication-task" content="name=Video;action-uri=https://video.nytimes.com/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/video.ico"/><meta data-

# Important tags for scraping

* `div` - major sections
* `table` - broken down into `tr` (rows) and `td` (datum)
* `form` - contains `input` tags that get submitted
* `ul`/`ol` - lists (ordered and unordered), contains `li` (list items)

# Important attributes for scraping

* `id` and `class`
* They *name* tags; web developers use these names for styling and interactivity
* `id`s are unique; `class`es are groups

# Why web scraping is terrible

### Invalid pages and incompatibilities

* w3c (WWW Consortium) sets standards for HTML, CSS, XML, etc.
* They have [a validator](https://validator.w3.org) to ensure that pages meet their specs

### HTML can be extremely hard to read

* Fortunately, web inspector tools can make your life easier
* Check out [The NY Times](http://www.nytimes.com/) in the browser

### Some sites require javascript to work

* There aren't any libraries (that I'm aware of) that implement Javascript
* Try turning off Javascript in your browser and make sure the site still works
* You can often *emulate* the Javascript code to make the same requests... but it's a pain

### It's fragile 

* While the *markup* is machine readable, that just specifies page layout
* The *same content* can be coded in HTML in an infinite number of ways and still look identical
* Web authors can change their code at any point...

* **and still look very similar**. [An extreme example](https://web.archive.org/web/20001109144000/http://www1.nytimes.com/)

# Working around the terrible-ness

* Don't worry about parsing yourself -- no regexes or string searches!
* Don't worry about traversing individual nested levels (e.g., inside two divs and ...)

### Instead...

* Think of each webpage as a "tag soup"
* Try to find a way to describe the tags you're looking for in a minimal way
* And use a good library

# Scraping in five lines:

In [40]:
# Look for headlines in the NYTimes
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.nytimes.com/')
soup = BeautifulSoup(r.text)
tags = soup.find_all(attrs={'class': 'story-heading'})


In [26]:
tags_story_wrapper = soup.find_all(attrs={'class': 'story-wrapper'})

In [27]:
tags_story_wrapper

[<section class="story-wrapper"><div><a class="css-t059uy" data-story="nyt://article/c0b61f00-cf65-5536-98d3-79fb9db7949f" data-uri="nyt://article/c0b61f00-cf65-5536-98d3-79fb9db7949f" data-visited="" href="https://www.nytimes.com/2021/06/26/us/miami-building-collapse-investigation.html"><h3 class="css-xxaj7r e1lsht870" size="400"><span>Engineer Warned of ‘Major Structural Damage’ at Florida Condo Complex</span></h3></a><div class="css-1a00rns e16ry1jj0" color="orange"></div><a class="css-t059uy" data-story="nyt://article/c0b61f00-cf65-5536-98d3-79fb9db7949f" data-uri="nyt://article/c0b61f00-cf65-5536-98d3-79fb9db7949f" data-visited="" href="https://www.nytimes.com/2021/06/26/us/miami-building-collapse-investigation.html"><ul class="css-ipcj49"><li>An engineer in 2018 urged the managers of the Champlain Towers South condominium complex near Miami to repair cracked columns and crumbling concrete.</li><li>The mayor of Surfside, where the condo is, said the town had received the report bu

In [28]:
tags_css_codfme = soup.find_all(attrs={'class': 'css-codfme e1lsht870'})

In [29]:
tags_css_codfme

[<h3 class="css-codfme e1lsht870" size="200">Miami-Dade Mayor Opens Audit of Buildings 40 Years and Older</h3>,
 <h3 class="css-codfme e1lsht870" size="200">Anxious residents of the sister tower to the fallen condo are wondering: Stay or go?</h3>,
 <h3 class="css-codfme e1lsht870" size="200">Officer Injured in Capitol Riot Asks McCarthy to Disavow Lies About It</h3>,
 <h3 class="css-codfme e1lsht870" size="200">Next year, Britons will fly abroad. For now, it’s bingo by the sea.</h3>,
 <h3 class="css-codfme e1lsht870" size="200">Britain’s health minister, accused of violating social-distancing rules, resigned. Here’s the latest on Covid-19.</h3>,
 <h3 class="css-codfme e1lsht870" size="200">‘A Generational Crisis’: The Virus Devastates Education in Latin America</h3>,
 <h3 class="css-codfme e1lsht870" size="200">What Does Eric Adams, Working-Class Champion, Mean for the Democrats?</h3>,
 <h3 class="css-codfme e1lsht870" size="200">The Bahamas Are My Home. A Threat We Didn’t Create Is De

In [32]:
for tag in tags: display(HTML(str(tag)))

In [33]:
for tag in tags_css_codfme: display(HTML(str(tag)))

In [41]:
r.headers

{'Connection': 'close', 'Content-Length': '260836', 'Server': 'nginx', 'Content-Type': 'text/html; charset=utf-8', 'x-nyt-data-last-modified': 'Sat, 26 Jun 2021 20:33:32 GMT', 'Last-Modified': 'Sat, 26 Jun 2021 20:33:32 GMT', 'X-PageType': 'vi-homepage', 'X-XSS-Protection': '1; mode=block', 'X-Content-Type-Options': 'nosniff', 'Content-Encoding': 'gzip', 'cache-control': 's-maxage=30,no-cache', 'x-nyt-route': 'homepage', 'X-Origin-Time': '2021-06-26 20:38:31 UTC', 'Accept-Ranges': 'bytes', 'Date': 'Sat, 26 Jun 2021 20:38:33 GMT', 'Age': '2', 'X-Served-By': 'cache-lga21922-LGA, cache-ewr18161-EWR', 'X-Cache': 'HIT, HIT', 'X-Cache-Hits': '1, 1', 'X-Timer': 'S1624739914.541306,VS0,VE3', 'Vary': 'Accept-Encoding, Fastly-SSL', 'Set-Cookie': 'nyt-a=VngR6mqfoikbXiiNiKYDvw; Expires=Sun, 26 Jun 2022 20:38:33 GMT; Path=/; Domain=.nytimes.com; SameSite=none; Secure, nyt-gdpr=0; Expires=Sun, 27 Jun 2021 02:38:33 GMT; Path=/; Domain=.nytimes.com, nyt-purr=cfhhcfhhhck; Expires=Sun, 26 Jun 2022 20:38

In [34]:
r.headers['content-type']

'text/html; charset=utf-8'

In [36]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <title data-rh="true">
   The New York Times - Breaking News, US News, World News and Videos
  </title>
  <meta content="noarchive,noodp,noydir" data-rh="true" name="robots"/>
  <meta content="The New York Times" data-rh="true" name="application-name"/>
  <meta content="https://www.nytimes.com" data-rh="true" name="msapplication-starturl"/>
  <meta content="name=Search;action-uri=https://www.nytimes.com/search/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/search.ico" data-rh="true" name="msapplication-task"/>
  <meta content="name=Most Popular;action-uri=https://www.nytimes.com/gst/mostpopular.html?src=iepin;icon-uri=https://static01.nyt.com/images/icons/mostpopular.ico" data-rh="true" name="msapplication-task"/>
  <meta content="name=Video;action-uri=https://video.nytimes.com/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/video.ico" data-rh="true" name="msapplication-ta

# Hedging your bets

* There are lots of ways to specify a search through the tag soup
* Some methods may be more robust than others...
* But it's not worth spending too much time trying to out-wit whatever might be updating the site on the other side

In [37]:
# Another way to get the headlines
articles = soup.find_all('article')
import re
[article.find_all(re.compile('^h\d')) for article in articles]

[[<h2 class="css-tzcwp9 e1voiwgp0">Great Weekend Listens</h2>],
 [<h2 class="css-tzcwp9 e1voiwgp0">Listen to ‘The Modern Love Podcast’</h2>],
 [<h2 class="css-tzcwp9 e1voiwgp0">Listen to ‘The Book Review Podcast’</h2>]]

# Advanced topics: HTTP

* HTTP specifies *how* you ask for and retrieve content
* Also specifies metadata in headers that control caching, redirects, sessions, and more

In [39]:
r_goog = requests.get('http://google.com/')
r_goog.headers

{'Date': 'Sat, 26 Jun 2021 20:38:21 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'Content-Length': '5704', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2021-06-26-20; expires=Mon, 26-Jul-2021 20:38:21 GMT; path=/; domain=.google.com; Secure, NID=217=wXtt2VGg7yR1mUgFPR0kS6r6Ub4xtYI6TlpN72L35fbDhkvFlQXdbPSwoCIo_DVo314Kr41uPEvluiziM9k3kyUSNzBeKhV135F_v3qI47bSKAJg4gBfI2aiAFHmIDFXtaHPymzxXiOYd8RWjoI-6P7ig_B42lqdUjjrTccbC8A; expires=Sun, 26-Dec-2021 20:38:21 GMT; path=/; domain=.google.com; HttpOnly'}

# Searches and forms

* Typically, the most interesting things to scrape are hidden behind searches and forms
* How do you enter text into Google's search box via Python?

In [42]:
soup = BeautifulSoup(requests.get('http://google.com').text)
print(soup.find('form').prettify())

<form action="/search" name="f">
 <table cellpadding="0" cellspacing="0">
  <tr valign="top">
   <td width="25%">
   </td>
   <td align="center" nowrap="">
    <input name="ie" type="hidden" value="ISO-8859-1"/>
    <input name="hl" type="hidden" value="en"/>
    <input name="source" type="hidden" value="hp"/>
    <input name="biw" type="hidden"/>
    <input name="bih" type="hidden"/>
    <div class="ds" style="height:32px;margin:4px 0">
     <input autocomplete="off" class="lst" maxlength="2048" name="q" size="57" style="margin:0;padding:5px 8px 0 6px;vertical-align:top;color:#000" title="Google Search" value=""/>
    </div>
    <br style="line-height:0"/>
    <span class="ds">
     <span class="lsbb">
      <input class="lsb" name="btnG" type="submit" value="Google Search"/>
     </span>
    </span>
    <span class="ds">
     <span class="lsbb">
      <input class="lsb" id="tsuid1" name="btnI" type="submit" value="I'm Feeling Lucky"/>
      <script nonce="9m0umWohb/QBZoh2CoV5SQ==">
 

In [44]:
r_search = requests.get('http://google.com/search', 
                 params={'q':  'how long does a walrus live?',
                         'btnI': "I'm Feeling Lucky"})


In [48]:
s_search = BeautifulSoup(r_search.text)

In [57]:
for title in (s_search.find_all('a')): print(title)

<a href="/?sa=X&amp;ved=0ahUKEwjLsp6WnLbxAhUuQjABHX57C3wQOwgC"><span class="V6gwVd">G</span><span class="iWkuvd">o</span><span class="cDrQ7">o</span><span class="V6gwVd">g</span><span class="ntlR9">l</span><span class="iWkuvd tJ3Myc">e</span></a>
<a class="l" href="/?sa=I&amp;output=search&amp;ie=UTF-8&amp;ved=0ahUKEwjLsp6WnLbxAhUuQjABHX57C3wQPAgE"><span class="V6gwVd">G</span><span class="iWkuvd">o</span><span class="cDrQ7">o</span><span class="V6gwVd">g</span><span class="ntlR9">l</span><span class="iWkuvd tJ3Myc">e</span></a>
<a href="/search?q=how+long+does+a+walrus+live%3F&amp;sa=I&amp;ie=UTF-8&amp;gbv=1&amp;sei=B5nXYIuTEq6EwbkP_vat4Ac">here</a>
<a class="eZt8xd" href="/search?q=how+long+does+a+walrus+live%3F&amp;ie=UTF-8&amp;source=lnms&amp;tbm=isch&amp;sa=X&amp;ved=0ahUKEwjLsp6WnLbxAhUuQjABHX57C3wQ_AUICCgB">Images</a>
<a class="eZt8xd" href="/search?q=how+long+does+a+walrus+live%3F&amp;ie=UTF-8&amp;source=lnms&amp;tbm=nws&amp;sa=X&amp;ved=0ahUKEwjLsp6WnLbxAhUuQjABHX57C3wQ_AUICSg

# Types of requests

* `requests.get` is actually doing a `GET`
    * It encodes the parameters (if any) directly into the url: `?param=value&param2=value2...`
    * This means that it gets *saved into your browser history*
    * Back buttons, refresh may send the same parameters again

### Other HTTP verbs:

* `POST` is the other most common method
    * Just like `GET`, except that it sends its parameters hidden in a header
    * Often used for purchases, posts, etc, that you don't want to submit twice
* There's [others](https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol#Request_methods) (`PUT`, `DELETE`, `HEAD`, ...), but they're rarer

# A slightly more complicated example

* Let's look for satellites! [heavens-above.com](http://heavens-above.com)

In [83]:
soup_ha = BeautifulSoup(r_ha.text)
soup_ha.prettify()

'<!DOCTYPE html>\n<html dir="ltr" lang="en">\n <head>\n  <title>\n   ISS - Visible Passes\n  </title>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="Satellite predictions and other astronomical data customised for your location." name="description"/>\n  <link href="css/ha.css" rel="stylesheet" type="text/css"/>\n  <link href="/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>\n  <link href="/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>\n  <script src="//code.jquery.com/jquery-1.12.0.min.js" type="text/javascript">\n  </script>\n  <script src="/scripts/standard.min.js" type="text/javascript">\n  </script>\n  <script type="text/javascript">\n   function updateLocalTime(utc) {\r\n\t\t\tvar localdiff = parseInt($(\'#utcOffset\').val());\r\n\t\t\tvar local_now = new Date(utc.getTime() + localdiff);\r\n\t\t\t$(\'#spanTime\').text(formatTime(local_now));\r\n\t\t}\r\n\r\n\t\t$(function () {\r\n\t\t\tonClockTick = updateLocalTime;\r\n\t

In [58]:
# Scrape the times that the ISS is visible
r_ha = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def scrape_times(text):
    soup = BeautifulSoup(text)
    rows = soup.find_all('tr', attrs={'class':'clickableRow'})
    times = []
    for row in rows:
        cols = row.find_all('td')
        times.append(cols[0].text + ' ' + cols[2].text)
    return times
scrape_times(r_ha.text)

['26 Jun 04:11:33',
 '27 Jun 03:26:11',
 '28 Jun 04:13:47',
 '29 Jun 03:28:20',
 '30 Jun 02:42:50',
 '30 Jun 04:15:48',
 '01 Jul 03:30:15',
 '02 Jul 02:44:38',
 '02 Jul 04:18:34',
 '03 Jul 01:58:59',
 '03 Jul 03:31:56',
 '04 Jul 02:46:14',
 '04 Jul 04:21:50',
 '05 Jul 02:00:29',
 '05 Jul 03:34:08']

In [59]:
# Get the next page
r_ha2 = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def get_next_page(r):
    soup = BeautifulSoup(r.text)
    inputs = soup.find_all('input')
    d = {input.attrs['name']: input.attrs['value'] for input in inputs}
    d.pop('ctl00$cph1$btnPrev')
    d['ctl00_cph1_radioAll'] = 'radioVisible'
    from urllib.parse import urlparse, urljoin
    url = urljoin(r.url, soup.find('form').attrs['action'])
    return requests.post(url, d)
scrape_times(get_next_page(r_ha2).text)

['06 Jul 01:08:51',
 '06 Jul 02:46:27',
 '06 Jul 04:24:51',
 '06 Jul 06:01:44',
 '06 Jul 07:38:20',
 '07 Jul 00:21:51',
 '07 Jul 01:58:49',
 '07 Jul 03:37:18',
 '07 Jul 05:14:34',
 '07 Jul 06:51:03',
 '07 Jul 08:29:34',
 '07 Jul 23:35:08',
 '08 Jul 01:11:17',
 '08 Jul 02:49:39',
 '08 Jul 04:27:21',
 '08 Jul 06:03:50',
 '08 Jul 07:41:13',
 '08 Jul 22:48:55',
 '09 Jul 00:23:52',
 '09 Jul 02:01:56',
 '09 Jul 03:40:03',
 '09 Jul 05:16:39',
 '09 Jul 06:53:33',
 '09 Jul 23:36:37',
 '10 Jul 01:14:14',
 '10 Jul 02:52:38',
 '10 Jul 04:29:29',
 '10 Jul 06:06:06',
 '10 Jul 22:49:35',
 '11 Jul 00:26:34',
 '11 Jul 02:05:04',
 '11 Jul 03:42:18',
 '11 Jul 05:18:47',
 '11 Jul 06:57:24',
 '11 Jul 22:02:49',
 '11 Jul 23:39:00',
 '12 Jul 01:17:23',
 '12 Jul 02:55:04',
 '12 Jul 04:31:33',
 '12 Jul 06:08:58',
 '12 Jul 21:16:33',
 '12 Jul 22:51:34',
 '13 Jul 00:29:39',
 '13 Jul 02:07:44',
 '13 Jul 03:44:21',
 '13 Jul 05:21:16',
 '13 Jul 22:04:17',
 '13 Jul 23:41:55',
 '14 Jul 01:20:18',
 '14 Jul 02:57:09',


In [60]:
# Get the next 10 pages!
from tqdm import tqdm
r_ha3 = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')

times = []
for i in tqdm(range(10)):
    times.extend(scrape_times(r_ha3.text))
    r = get_next_page(r_ha3)
times

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.18s/it]


['26 Jun 04:11:33',
 '27 Jun 03:26:11',
 '28 Jun 04:13:47',
 '29 Jun 03:28:20',
 '30 Jun 02:42:50',
 '30 Jun 04:15:48',
 '01 Jul 03:30:15',
 '02 Jul 02:44:38',
 '02 Jul 04:18:34',
 '03 Jul 01:58:59',
 '03 Jul 03:31:56',
 '04 Jul 02:46:14',
 '04 Jul 04:21:50',
 '05 Jul 02:00:29',
 '05 Jul 03:34:08',
 '26 Jun 04:11:33',
 '27 Jun 03:26:11',
 '28 Jun 04:13:47',
 '29 Jun 03:28:20',
 '30 Jun 02:42:50',
 '30 Jun 04:15:48',
 '01 Jul 03:30:15',
 '02 Jul 02:44:38',
 '02 Jul 04:18:34',
 '03 Jul 01:58:59',
 '03 Jul 03:31:56',
 '04 Jul 02:46:14',
 '04 Jul 04:21:50',
 '05 Jul 02:00:29',
 '05 Jul 03:34:08',
 '26 Jun 04:11:33',
 '27 Jun 03:26:11',
 '28 Jun 04:13:47',
 '29 Jun 03:28:20',
 '30 Jun 02:42:50',
 '30 Jun 04:15:48',
 '01 Jul 03:30:15',
 '02 Jul 02:44:38',
 '02 Jul 04:18:34',
 '03 Jul 01:58:59',
 '03 Jul 03:31:56',
 '04 Jul 02:46:14',
 '04 Jul 04:21:50',
 '05 Jul 02:00:29',
 '05 Jul 03:34:08',
 '26 Jun 04:11:33',
 '27 Jun 03:26:11',
 '28 Jun 04:13:47',
 '29 Jun 03:28:20',
 '30 Jun 02:42:50',


## Scrape nba.com

In [105]:
r_kobe = requests.get('https://en.wikipedia.org/wiki/Kobe_Bryant')

In [106]:
print(r_kobe.text)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Kobe Bryant - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"dea9a5d7-f53d-487e-85f4-260f5c2e9aa0","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Kobe_Bryant","wgTitle":"Kobe Bryant","wgCurRevisionId":1029967721,"wgRevisionId":1029967721,"wgArticleId":246185,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from March 2021","Articles with permanently dead external links","Webarchive template wayback links"

In [109]:
soup_kobe = BeautifulSoup(r_kobe.text)
soup_kobe.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   Kobe Bryant - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"dea9a5d7-f53d-487e-85f4-260f5c2e9aa0","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Kobe_Bryant","wgTitle":"Kobe Bryant","wgCurRevisionId":1029967721,"wgRevisionId":1029967721,"wgArticleId":246185,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from March 2021","Articles with permanently dead external links","Webarch

In [112]:
soup_kobe.find_all('tr')

[<tr><td class="infobox-image" colspan="2"><a class="image" href="/wiki/File:Kobe_Bryant_2014.jpg" title="Bryant handling the basketball"><img alt="Bryant handling the basketball" data-file-height="900" data-file-width="944" decoding="async" height="262" src="//upload.wikimedia.org/wikipedia/commons/thumb/5/56/Kobe_Bryant_2014.jpg/275px-Kobe_Bryant_2014.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/5/56/Kobe_Bryant_2014.jpg/413px-Kobe_Bryant_2014.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/5/56/Kobe_Bryant_2014.jpg/550px-Kobe_Bryant_2014.jpg 2x" width="275"/></a><div class="infobox-caption">Bryant with the <a href="/wiki/Los_Angeles_Lakers" title="Los Angeles Lakers">Los Angeles Lakers</a> in 2014</div></td></tr>,
 <tr><th class="infobox-header" colspan="2" style="background-color:#DCDCDC;color:#000000;border:2.5px solid #DCDCDC;">Personal information</th></tr>,
 <tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data"><span style="display:

In [77]:
import pandas as pd

In [130]:
pd_kobe_set = pd.read_html('https://en.wikipedia.org/wiki/Kobe_Bryant')
print(len(pd_kobe_set))

29


In [156]:
pd_kobe_set[3].columns

Index(['Year', 'Team', 'GP', 'GS', 'MPG', 'FG%', '3P%', 'FT%', 'RPG', 'APG',
       'SPG', 'BPG', 'PPG'],
      dtype='object')

In [163]:
for table_num in range(len(pd_kobe_set)):
    if 'Year' in pd_kobe_set[table_num].columns:
        print(f'Table num: {table_num}')
        print(pd_kobe_set[table_num])
        print('\n')

Table num: 3
             Year           Team    GP    GS   MPG    FG%    3P%    FT%  RPG  \
0         1996–97    L.A. Lakers    71     6  15.5  0.417  0.375  0.819  1.9   
1         1997–98    L.A. Lakers    79     1  26.0  0.428  0.341  0.794  3.1   
2         1998–99    L.A. Lakers   50*   50*  37.9  0.465  0.267  0.839  5.3   
3        1999–00†    L.A. Lakers    66    62  38.2  0.468  0.319  0.821  6.3   
4        2000–01†    L.A. Lakers    68    68  40.9  0.464  0.305  0.853  5.9   
5        2001–02†    L.A. Lakers    80    80  38.3  0.469  0.250  0.829  5.5   
6         2002–03    L.A. Lakers    82   82*  41.5  0.451  0.383  0.843  6.9   
7         2003–04    L.A. Lakers    65    64  37.6  0.438  0.327  0.852  5.5   
8         2004–05    L.A. Lakers    66    66  40.7  0.433  0.339  0.816  5.9   
9         2005–06    L.A. Lakers    80    80  41.0  0.450  0.347  0.850  5.3   
10        2006–07    L.A. Lakers    77    77  40.8  0.463  0.344  0.868  5.7   
11        2007–08    L.A. L

In [164]:
pd_kobe_set[5]

Unnamed: 0,Year,Title,Role,Notes
0,1996,Moesha,Terry Hightower,
1,1996,Arli$$,Himself,"Episode: ""What About the Fans?"""
2,1996,"Sister, Sister",Himself,"Episode: ""Kid-Napped"""
3,1997,Hang Time,Himself,
4,1998,All That,Himself,
5,2000,Bette,Himself,
6,2009,Kobe Doin' Work,Himself,
7,2010,Modern Family,Himself,
8,2014,Nowitzki. The Perfect Shot,Himself,
9,2015,Daddy's Home,Himself,
