### Scraping infant mortality data

In [26]:
# Necessary imports
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests


In [27]:
# Wikipedia page listing countries by infant and under-five mortality rates.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_infant_and_under-five_mortality_rates'
# requests applies no timeout by default, so without one this cell could
# hang indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Stop here with an HTTPError on a 4xx/5xx reply instead of silently
# carrying an error page into the parsing cells.
response.raise_for_status()

In [28]:
# Show the HTTP status code of the request made above.
response.status_code #200 = success!

200

In [29]:
# Peek at the beginning of the raw HTML to confirm the expected page came back.
response.text[:1000]  #First 1000 characters of the HTML

'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of countries by infant and under-five mortality rates - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XpNBbQpAIDAAAJer5YsAAAAW","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_infant_and_under-five_mortality_rates","wgTitle":"List of countries by infant and under-five mortality rates","wgCurRevisionId":944546870,"wgRevisionId":944546870,"wgArticleId":2525954,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Ar

In [30]:
# Keep the response body as a string; this is what gets parsed next.
page = response.text

In [31]:
# Parse the HTML string into a navigable tree using the lxml parser.
soup = BeautifulSoup(page, features="lxml")

In [32]:
# Render the parse tree with indentation and print only its first 2000 characters.
pretty_html = soup.prettify()
print(pretty_html[:2000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by infant and under-five mortality rates - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XpNBbQpAIDAAAJer5YsAAAAW","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_infant_and_under-five_mortality_rates","wgTitle":"List of countries by infant and under-five mortality rates","wgCurRevisionId":944546870,"wgRevisionId":944546870,"wgArticleId":2525954,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback 

In [33]:
# Locate the element whose HTML id is "worldbank" (the mortality-rate table)
# and print its markup for inspection.
table = soup.find(attrs={'id': 'worldbank'})
print(table)

<table border="1" class="sortable nowrap wikitable mw-datatable" id="worldbank" style="text-align:right">
<tbody><tr valign="bottom">
<th>Country or territory<br/></th>
<th>2018<br/>mortality rate,<br/>under-5<br/>(per 1000<br/>live births)
</th></tr>
<tr>
<td style="text-align:left;"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a></td>
<td>62.3
</td></tr>
<tr>
<td style="text-align:left;"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="700" data-file-wid

In [34]:
# Print the text of every header cell (<th>) to see what each column holds.
th = table.find_all('th')
for header_cell in th:
    print(header_cell.get_text())

Country or territory
2018mortality rate,under-5(per 1000live births)



In [35]:
# Collect every <tr> element of the table: the header row followed by the
# data rows.
table_rows = table.find_all('tr')

In [36]:
# Turn each <tr> into a list of its stripped <td> texts and gather those
# lists into `table` (a list of rows), echoing every row as it is built.
# The header row contains only <th> cells, so it contributes an empty list.
# NOTE: this rebinds `table` from the parsed HTML element to a plain list.
table = []

for tr in table_rows:
    row = [cell.text.strip() for cell in tr.find_all('td')]
    print(row)
    table.append(row)


[]
['Afghanistan', '62.3']
['Albania', '8.8']
['Algeria', '23.5']
['Andorra', '2.9']
['Angola', '77.2']
['Antigua and Barbuda', '6.4']
['Argentina', '9.9']
['Armenia', '12.4']
['Australia', '3.7']
['Austria', '3.5']
['Azerbaijan', '21.5']
['Bahamas, The', '10.2']
['Bahrain', '7.1']
['Bangladesh', '30.2']
['Barbados', '12.2']
['Belarus', '3.4']
['Belgium', '3.7']
['Belize', '13']
['Benin', '93']
['Bhutan', '29.7']
['Bolivia', '26.8']
['Bosnia and Herzegovina', '5.8']
['Botswana', '36.5']
['Brazil', '14.4']
['Brunei', '11.6']
['Bulgaria', '7.1']
['Burkina Faso', '76.4']
['Burundi', '58.5']
['Cape Verde', '19.5']
['Cambodia', '28']
['Cameroon', '76.1']
['Canada', '5']
['Central African Republic', '116.5']
['Chad', '119']
['Chile', '7.2']
['China', '8.6']
['Colombia', '14.2']
['Comoros', '67.5']
['Congo, Democratic Republic of the', '88.1']
['Congo, Republic of the', '50.1']
['Costa Rica', '8.8']
["Cote d'Ivoire", '80.9']
['Croatia', '4.7']
['Cuba', '5.0']
['Cyprus', '2.4']
['Czech Republi

In [37]:
# Converting the list of rows into a DataFrame. The empty list produced by
# the header row shows up as a first row with no values (see the preview).
age = pd.DataFrame(table)
age.head()

Unnamed: 0,0,1
0,,
1,Afghanistan,62.3
2,Albania,8.8
3,Algeria,23.5
4,Andorra,2.9


In [38]:
# Remove the blank row that came from the table's header (it has no <td>
# cells, so every value in it is missing).
# Filtering on "all values missing" rather than slicing off position 0 keeps
# this cell idempotent: running it again cannot discard a real country.
# The explicit copy makes `age` an independent frame, so assigning to one of
# its columns later does not raise SettingWithCopyWarning.
age = age.dropna(how='all').copy()

In [39]:
# Give the two positional columns descriptive names.
# NOTE(review): the scraped table's header reads "2018 mortality rate,
# under-5 (per 1000 live births)", so 'infant_mortality' appears to hold the
# under-five rate rather than the infant rate — confirm the name is intended.
age.columns = ['country', 'infant_mortality']

In [40]:
# Preview the first ten rows with the new column names.
age.head(10)

Unnamed: 0,country,infant_mortality
1,Afghanistan,62.3
2,Albania,8.8
3,Algeria,23.5
4,Andorra,2.9
5,Angola,77.2
6,Antigua and Barbuda,6.4
7,Argentina,9.9
8,Armenia,12.4
9,Australia,3.7
10,Austria,3.5


In [41]:
# Count the missing values in each column.
age.isna().sum()

country             0
infant_mortality    0
dtype: int64

In [42]:
# Both columns are still stored as strings (object dtype), including the
# mortality rate.
age.dtypes

country             object
infant_mortality    object
dtype: object

In [43]:
# Summary of the frame: index range, non-null counts, dtypes and memory use.
age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 1 to 192
Data columns (total 2 columns):
country             192 non-null object
infant_mortality    192 non-null object
dtypes: object(2)
memory usage: 3.1+ KB


In [44]:
# Converting the rate strings into floats with one vectorised cast instead
# of calling float() on each element through apply(). A string that cannot be
# parsed still raises ValueError; a missing value becomes NaN rather than
# raising.
age['infant_mortality'] = age['infant_mortality'].astype(float)
age.dtypes

country              object
infant_mortality    float64
dtype: object

In [45]:
# (rows, columns) of the frame before rows with missing values are dropped.
age.shape

(192, 2)

In [46]:
# Drop rows containing any missing value and renumber the index from 0.
# The chained form builds a new frame and rebinds `age` instead of mutating
# it in place, so it does not raise SettingWithCopyWarning when `age` was
# derived from a slice, and the old index is discarded (drop=True) rather
# than kept as a column.
age = age.dropna().reset_index(drop=True)

In [47]:
# Restore `countries_dict` from IPython's %store database. It is not defined
# anywhere in this notebook, so it must have been saved with %store
# beforehand (e.g. by another notebook) or the next line fails.
# presumably it maps alternative country spellings to the names used
# elsewhere in the project — confirm against the notebook that stores it.
%store -r countries_dict
# Replace every country value that matches a key of the dict with the
# corresponding dict value; other values are left unchanged.
age['country'] = age['country'].replace(countries_dict)

In [48]:
%store -r countries
#checking for different versions of counties names
age.country[~age.country.isin(countries)]

1                Albania
48              Dominica
89          Korea, North
113               Monaco
119                Nauru
141    Saint Helena (UK)
146           San Marino
177               Tuvalu
188            West Bank
Name: country, dtype: object

In [49]:
# Display the cleaned frame (the notebook shows a truncated view of it).
age

Unnamed: 0,country,infant_mortality
0,Afghanistan,62.3
1,Albania,8.8
2,Algeria,23.5
3,Andorra,2.9
4,Angola,77.2
...,...,...
187,Vietnam,20.7
188,West Bank,20.3
189,Yemen,55.0
190,Zambia,57.8


In [50]:
# Persist the cleaned 'age' dataframe to disk as a pickle file.
import pickle

INF_MORT_PICKLE = '../Data/Pickled/inf_mort.pkl'

with open(INF_MORT_PICKLE, mode='wb') as out_file:
    pickle.dump(age, out_file)