Scraping Harvard's Wikipedia Page to extract interesting data.

In [1]:
import requests

# Fetch the article. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() fails loudly on a 4xx/5xx response instead
# of letting an error page be parsed further down.
req = requests.get("https://en.wikipedia.org/wiki/Harvard_University", timeout=30)
req.raise_for_status()

In [2]:
req

<Response [200]>

In [3]:
# Keep the decoded response body as a string for parsing.
page = req.text
# Preview only the first 2000 characters rather than the whole document.
page[:2000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Harvard University - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xl2WJApAICkAAKnipJgAAAAP","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Harvard_University","wgTitle":"Harvard University","wgCurRevisionId":942364135,"wgRevisionId":942364135,"wgArticleId":18426501,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: location","Webarchive template wayback links","CS1:

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Parse the markup with the 'html.parser' backend into a navigable tree.
soup = BeautifulSoup(page, 'html.parser')

Get the title of the page.

In [6]:
soup.title

<title>Harvard University - Wikipedia</title>

In [7]:
# Count the <p> (paragraph) elements in the parsed page.
len(soup.find_all('p'))

84



If you look at the Wikipedia page in your browser, you'll notice that it contains several tables. We will be working with the "Demographics" table, but first we need to find it.

One of the HTML attributes that will be very useful to us is the "class" attribute.

Getting the class of a single element is easy...


In [8]:
soup.table['class']

['infobox', 'vcard']

In [9]:
# Gather the class list of every <table> that actually carries a class attribute.
my_list = [
    table['class']
    for table in soup.find_all('table')
    if table.get('class')
]

In [10]:
my_list

[['infobox', 'vcard'],
 ['toccolours'],
 ['infobox'],
 ['wikitable', 'sortable', 'collapsible', 'collapsed', 'floatright'],
 ['wikitable', 'sortable', 'collapsible', 'collapsed', 'floatright'],
 ['wikitable'],
 ['box-Cleanup_gallery', 'plainlinks', 'metadata', 'ambox', 'ambox-style'],
 ['metadata', 'mbox-small'],
 ['nowraplinks', 'mw-collapsible', 'mw-collapsed', 'navbox-inner'],
 ['nowraplinks', 'navbox-subgroup'],
 ['nowraplinks', 'navbox-subgroup'],
 ['nowraplinks', 'mw-collapsible', 'mw-collapsed', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'mw-collapsed', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner'],

In [11]:
# Keep only the tables whose class list includes 'wikitable'.
table_html = soup.find_all('table', class_='wikitable')

In [12]:
from IPython.core.display import HTML

In [13]:
# Index 2 selects the third 'wikitable' match; str() serialises the tag back
# to markup so that HTML() can render it inline.
my_table = str(table_html[2])
HTML(my_table)

Unnamed: 0,Undergrad,Grad/prof,US census
Asian,21%,13%,5%
Black,9%,5%,12%
Hispanic or Latino,11%,7%,16%
White,37%,38%,64%
Two or more races,8%,3%,9%
International,12%,32%,


In [14]:
# Collect every <tr> of the third 'wikitable' on the page as a plain list.
rows = list(soup.find_all('table', class_='wikitable')[2].find_all('tr'))

In [15]:
rows

[<tr>
 <th></th>
 <th>Undergrad</th>
 <th>Grad/prof</th>
 <th>US census
 </th></tr>,
 <tr>
 <th>Asian
 </th>
 <td>21%</td>
 <td>13%</td>
 <td>5%
 </td></tr>,
 <tr>
 <th>Black
 </th>
 <td>9%</td>
 <td>5%</td>
 <td>12%
 </td></tr>,
 <tr>
 <th>Hispanic or Latino
 </th>
 <td>11%</td>
 <td>7%</td>
 <td>16%
 </td></tr>,
 <tr>
 <th>White
 </th>
 <td>37%</td>
 <td>38%</td>
 <td>64%
 </td></tr>,
 <tr>
 <th>Two or more races
 </th>
 <td>8%</td>
 <td>3%</td>
 <td>9%
 </td></tr>,
 <tr>
 <th>International
 </th>
 <td>12%</td>
 <td>32%</td>
 <td>N/A
 </td></tr>]

Get the Column names.

In [16]:
# Header labels come from the <th> cells of the first row. Newlines are removed
# first and the emptiness test is applied to the cleaned text, so a header cell
# that holds nothing but a newline is dropped instead of becoming an empty
# column name (the blank top-left corner cell is dropped the same way).
header_texts = (th.get_text().replace('\n', '') for th in rows[0].find_all('th'))
columns = [text for text in header_texts if text]

In [17]:
columns

['Undergrad', 'Grad/prof', 'US census']

Get the Index.

In [18]:
# The <th> cell of each body row holds that row's label.
indexes = [
    body_row.find('th').get_text().replace('\n', '')
    for body_row in rows[1:]
]

In [19]:
indexes

['Asian',
 'Black',
 'Hispanic or Latino',
 'White',
 'Two or more races',
 'International']

In [20]:
import numpy as np
import pandas as pd

In [21]:
# Flatten every data cell (<td>) of the body rows into one list, row by row,
# dropping the percent sign from each cell's text.
values = [
    cell.text.replace('%', '')
    for body_row in rows[1:]
    for cell in body_row.find_all('td')
]

In [22]:
# Remove the newline that trails the last cell of each row.
values = [text.replace('\n', '') for text in values]

In [23]:
values

['21',
 '13',
 '5',
 '9',
 '5',
 '12',
 '11',
 '7',
 '16',
 '37',
 '38',
 '64',
 '8',
 '3',
 '9',
 '12',
 '32',
 'N/A']

In [24]:
# Convert each cell to float, mapping the 'N/A' placeholder to NaN at its
# original position. Filtering 'N/A' out and appending a single NaN at the end
# only lines up when the missing cell happens to be the last one, and running
# the cell a second time would append another NaN and break the later reshape.
values = [np.nan if v == 'N/A' else float(v) for v in values]
values

[21.0,
 13.0,
 5.0,
 9.0,
 5.0,
 12.0,
 11.0,
 7.0,
 16.0,
 37.0,
 38.0,
 64.0,
 8.0,
 3.0,
 9.0,
 12.0,
 32.0,
 nan]

In [25]:
import numpy as np
import pandas as pd

In [26]:
# Lay the flat list out as one row per index label and one column per header.
# Deriving the shape from the labels (instead of hard-coding 6 x 3) keeps this
# cell correct if the table gains or loses a row or column.
values = np.array(values).reshape(len(indexes), len(columns))

In [27]:
# Assemble the labelled frame from the numeric grid and display it.
df = pd.DataFrame(data=values, index=indexes, columns=columns)
df

Unnamed: 0,Undergrad,Grad/prof,US census
Asian,21.0,13.0,5.0
Black,9.0,5.0,12.0
Hispanic or Latino,11.0,7.0,16.0
White,37.0,38.0,64.0
Two or more races,8.0,3.0,9.0
International,12.0,32.0,


In [28]:
# Render the source table again for a visual check against the DataFrame above.
HTML(my_table)

Unnamed: 0,Undergrad,Grad/prof,US census
Asian,21%,13%,5%
Black,9%,5%,12%
Hispanic or Latino,11%,7%,16%
White,37%,38%,64%
Two or more races,8%,3%,9%
International,12%,32%,
