#### 3 facts to know about HTML to start with web scraping:

1) HTML consists of tags that mark up the text.

2) HTML tags can have attributes (each attribute has a value), which are specified in the opening tag.

3) HTML tags can be nested.

Master URL:
https://www.cia.gov/library/publications/the-world-factbook/docs/rankorderguide.html

In [4]:
import requests  
import re
from bs4 import BeautifulSoup  
import pandas as pd
from functools import reduce
import numpy as np

In [2]:
def makeDF(column_name, url, timeout=30):
    """Scrape one rank-order page into a two-column DataFrame.

    Parameters
    ----------
    column_name : str
        Label for the value column of the returned frame.
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` has no
        default timeout, so without one a stalled connection hangs the cell.

    Returns
    -------
    pandas.DataFrame
        One row per matched ``<tr>``, with columns ``'COUNTRY'`` and
        ``column_name``. Values are the first child of the respective
        element, left unconverted.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    AttributeError, IndexError
        If a matched row lacks the link or cells the lookups below expect.
    """
    r = requests.get(url, timeout=timeout)
    # Fail here instead of silently parsing an error page into an empty frame.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # NOTE: in a regex the trailing '*' quantifies the final 'r'; it is not a
    # wildcard. The pattern is searched, so it still matches any class value
    # containing 'rankorde'.
    results = soup.find_all('tr', attrs={'class':re.compile('rankorder*')})

    records = []
    for result in results:
        country = result.find('a').contents[0]
        # Four sibling hops from the first <td>. Presumably whitespace text
        # nodes sit between cells, which would make this the third <td>
        # -- TODO confirm against the page markup.
        feature = result.find('td').next_sibling.next_sibling.next_sibling.next_sibling.contents[0]

        records.append((country, feature))

    return pd.DataFrame(records, columns=['COUNTRY', column_name])

    

In [3]:
# Pages to scrape: each entry pairs the output column name with the URL of the
# corresponding rank-order page. All URLs share one template and differ only
# in the numeric field id.
_FIELD_URL_TEMPLATE = 'https://www.cia.gov/library/publications/the-world-factbook/fields/{}rank.html'

_FIELD_IDS = (
    ('POPULATION', 335),
    ('BIRTH RATE', 345),
    ('DEATH RATE', 346),
    ('MEDIAN AGE', 343),
    ('GDP', 211),
    ('UNEMPLOYMENT RATE', 220),
)

features = [
    {'column name': name, 'url': _FIELD_URL_TEMPLATE.format(field_id)}
    for name, field_id in _FIELD_IDS
]


In [4]:
# Scrape every configured page: one DataFrame per entry, in `features` order.
dfs = [makeDF(spec['column name'], spec['url']) for spec in features]

In [5]:
def _merge_on_country(left, right):
    """Join two frames on their shared 'COUNTRY' column.

    `pd.merge` defaults to an inner join, so a country missing from either
    frame is dropped from the result.
    """
    return pd.merge(left, right, on='COUNTRY')


# Fold the per-feature frames into a single table, left to right.
df_final = reduce(_merge_on_country, dfs)
df_final

Unnamed: 0,COUNTRY,POPULATION,BIRTH RATE,DEATH RATE,MEDIAN AGE,GDP,UNEMPLOYMENT RATE
0,China,1394015977,11.60,8.20,38.4,"$18,200",3.90
1,India,1326093247,18.20,7.30,28.7,"$7,200",8.50
2,United States,332639102,12.40,8.30,38.5,"$59,800",4.40
3,Indonesia,267026366,15.40,6.60,31.1,"$12,400",5.40
4,Pakistan,233500636,27.40,6.20,22.0,"$5,400",6.00
...,...,...,...,...,...,...,...
209,Nauru,11000,21.90,6.00,27.0,"$12,300",23.00
210,Cook Islands,8574,13.30,9.00,38.3,"$16,700",13.10
211,"Saint Helena, Ascension, and Tristan da Cunha",7862,9.40,8.30,43.2,"$7,800",14.00
212,Montserrat,5373,11.70,6.00,34.8,"$34,000",5.60


In [6]:
# Order rows alphabetically by country and renumber the index from zero.
df_final = df_final.sort_values('COUNTRY').reset_index(drop=True)
df_final

Unnamed: 0,COUNTRY,POPULATION,BIRTH RATE,DEATH RATE,MEDIAN AGE,GDP,UNEMPLOYMENT RATE
0,Afghanistan,36643815,36.70,12.70,19.5,"$2,000",23.90
1,Albania,3074579,13.00,7.10,34.3,"$12,500",13.80
2,Algeria,42972878,20.00,4.40,28.9,"$15,200",11.70
3,American Samoa,49437,17.80,5.90,27.2,"$11,200",29.80
4,Andorra,77000,7.00,7.70,46.2,"$49,900",3.70
...,...,...,...,...,...,...,...
209,Wallis and Futuna,15854,12.70,5.70,34.0,"$3,800",8.80
210,West Bank,2900034,25.20,3.40,21.9,"$4,300",27.90
211,Yemen,29884405,25.80,5.60,19.8,"$2,500",27.00
212,Zambia,17426623,40.40,11.60,16.9,"$4,000",15.00


In [7]:
# Inspect the column dtypes of the merged frame.
df_final.dtypes

COUNTRY              object
POPULATION           object
BIRTH RATE           object
DEATH RATE           object
MEDIAN AGE           object
GDP                  object
UNEMPLOYMENT RATE    object
dtype: object

In [8]:
# Formatting the POPULATION and GDP

# Strip thousands separators (and the dollar sign for GDP), then cast to float.
# `astype(str)` comes first so the cell can be re-run: once the columns hold
# floats, calling `.replace` on each value would raise AttributeError.
# `regex=False` makes ',' and '$' literal; as a regex, '$' is an end-of-string
# anchor and would remove nothing.
df_final['POPULATION'] = (
    df_final['POPULATION']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

df_final['GDP'] = (
    df_final['GDP']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [9]:
# Persist the merged frame (index included) to countries_data.csv.
df_final.to_csv('countries_data.csv')

In [10]:
# Reload the saved CSV, using its first column as the index.
df = pd.read_csv('countries_data.csv', index_col=0)
df

Unnamed: 0,COUNTRY,POPULATION,BIRTH RATE,DEATH RATE,MEDIAN AGE,GDP,UNEMPLOYMENT RATE
0,Afghanistan,36643815.0,36.7,12.7,19.5,2000.0,23.9
1,Albania,3074579.0,13.0,7.1,34.3,12500.0,13.8
2,Algeria,42972878.0,20.0,4.4,28.9,15200.0,11.7
3,American Samoa,49437.0,17.8,5.9,27.2,11200.0,29.8
4,Andorra,77000.0,7.0,7.7,46.2,49900.0,3.7
...,...,...,...,...,...,...,...
209,Wallis and Futuna,15854.0,12.7,5.7,34.0,3800.0,8.8
210,West Bank,2900034.0,25.2,3.4,21.9,4300.0,27.9
211,Yemen,29884405.0,25.8,5.6,19.8,2500.0,27.0
212,Zambia,17426623.0,40.4,11.6,16.9,4000.0,15.0


In [11]:
# Check the column dtypes of the frame reloaded from CSV.
df.dtypes

COUNTRY               object
POPULATION           float64
BIRTH RATE           float64
DEATH RATE           float64
MEDIAN AGE           float64
GDP                  float64
UNEMPLOYMENT RATE    float64
dtype: object

In [12]:
# Count missing values per column.
df.isnull().sum()

COUNTRY              0
POPULATION           0
BIRTH RATE           0
DEATH RATE           0
MEDIAN AGE           0
GDP                  0
UNEMPLOYMENT RATE    0
dtype: int64

In [13]:
# Print each column label followed by that column's distinct values.
for column_name in df:
    print(column_name, df[column_name].unique(), sep='\n')

COUNTRY
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Anguilla' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas, The' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan'
 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'British Virgin Islands' 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma'
 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada' 'Cayman Islands'
 'Central African Republic' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo, Republic of the' 'Cook Islands' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Curacao' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia'
 'Faroe Islands' 'Fiji' 'Finland' 'France' 'French Polynesia' 'Gabon'
 'Georgia' 'Germany' 'Ghana' 'Gibraltar' 'Greece' 'Greenland' 'Grenada'
 'Guam' 'Guatemala' 'Guernsey' 'Guinea' 'Guyana' 'Haiti

### From Worldometer

In [36]:
# Download the Worldometer coronavirus page and collect its candidate table rows.
url = 'https://www.worldometers.info/coronavirus/#countries'
# requests has no default timeout; without one a stalled connection hangs the cell.
r = requests.get(url, timeout=30)
# Fail here instead of silently parsing an error page into an empty result list.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Keep the <tr> elements selected by an empty `style` filter.
results = soup.find_all('tr', attrs={'style': ''})

In [57]:
# Exploratory probe: starting from the first <td> of results[2] whose style
# matches the pattern, follow next_sibling 30 times and show that node's
# first child.
probe = results[2].find('td', attrs={'style':re.compile('font-weight*')})
for _ in range(30):
    probe = probe.next_sibling
probe.contents[0]

'64'

In [59]:
def _nth_sibling(node, steps):
    """Return the node reached by following ``next_sibling`` ``steps`` times."""
    for _ in range(steps):
        node = node.next_sibling
    return node


def _to_number(raw):
    """Convert one scraped cell to float; 'N/A' and blank cells become NaN."""
    text = str(raw).strip()
    if text in ('', 'N/A'):
        return np.nan
    return float(text.replace(',', ''))


columns = ['Country', 'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths', 'Total Recovered', 'New Recovered', 
           'Active Cases', 'Serious Cases', 'Total Cases (per 1M)', 'Deaths (per 1M)', 'Total Tests', 'Tests (per 1M)', 'Population']

# The statistics between 'Country' and 'Population' are read from siblings of
# the styled <td>, in table order. 'Country' and 'Population' come from the
# row's first and second links instead.
stat_columns = columns[1:-1]

records = []
for result in results:
    try:
        links = result.find_all('a')
        country = links[0].contents[0]
        population = links[1].contents[0]
        name_cell = result.find('td', attrs={'style':re.compile('font-weight*')})
        # Each statistic sits two siblings beyond the previous one (offsets
        # 2, 4, ..., 24). Presumably a whitespace text node separates adjacent
        # <td> elements -- TODO confirm against the page markup.
        stats = [_nth_sibling(name_cell, 2 * position).contents[0]
                 for position in range(1, len(stat_columns) + 1)]
    except (AttributeError, IndexError):
        # Best-effort scrape: skip rows lacking the expected links or cells.
        # Only the two errors those lookups raise are caught, so unrelated
        # failures and KeyboardInterrupt are no longer swallowed.
        continue
    records.append((country, *stats, population))

df = pd.DataFrame(records, columns=columns)

# Everything except 'Country' is numeric once separators are removed.
for col in columns[1:]:
    df[col] = df[col].apply(_to_number)
df.to_csv('worldometer.csv')

In [4]:
# df.dtypes

In [3]:
# df.head(12)

In [2]:
# top_affected_countries = df['Country'].head(12)

In [1]:
# ntop = 12
# df.iloc[0:ntop, :].set_index('Country')[['Total Cases', 'Total Deaths']].plot(kind='bar', figsize=(12,5), grid=True, logy=True)