In [1]:
# Libraries to manage data
import numpy as np
import pandas as pd

# Libraries for HTTP requests and web scraping
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Worldometers coronavirus page and parse the raw HTML response.
covid_url = 'https://www.worldometers.info/coronavirus/'
# requests has no default timeout; without one this cell can hang forever
# if the server never answers.
req = requests.get(covid_url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page and failing
# later with an unrelated IndexError when the tables are not found.
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# Extract the header names and the data rows from the parsed page.
# html tags:
#   column names --> 'thead' --> 'tr' --> 'th'
#   data rows    --> 'tbody' --> 'tr' --> 'th' / 'td'

# Header: every <tr> of the LAST <thead> on the page, one list of texts per row.
variables_html = soup.find_all('thead')[-1].find_all('tr')
variables = [
    [header_cell.text for header_cell in header_row.find_all(['th'])]
    for header_row in variables_html
]

# Body: every <tr> of the FIRST <tbody> on the page, one list of texts per row.
samples_html = soup.find_all('tbody')[0].find_all('tr')
samples = [
    [body_cell.text for body_cell in body_row.find_all(['th', 'td'])]
    for body_row in samples_html
]

# Leave out the 6 trailing scraped rows (presumably summary/total rows - TODO confirm),
# label the columns with the first header row and discard the "#" rank column.
row_limit = len(samples) - 6
df = pd.DataFrame(samples[:row_limit], columns=variables[0])
df = df.drop(columns=["#"])
df.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
0,\nNorth America\n,20352667,57205,467481,1486,12343529,20936,7541657,33726,,,,,,North America,\n,,
1,\nAsia\n,19594118,102154,319475,1329,17828448,107048,1446195,27214,,,,,,Asia,\n,,
2,\nSouth America\n,12280038,11430,345686,162,10858312,1294,1076040,16845,,,,,,South America,\n,,
3,\nEurope\n,20884971,176203,482759,3688,9937237,111205,10464975,26564,,,,,,Europe,\n,,
4,\nAfrica\n,2470482,6297,58015,105,2087551,6048,324916,2759,,,,,,Africa,\n,,


In [4]:
# Skip the first 8 rows (the preview above shows the table opens with continent
# aggregates such as "North America" and "Asia") and drop the last 3 columns
# (the "1 ... every X ppl" ratios in the header above).
df = df.iloc[8:, :-3]
# Renumber the remaining rows from 0 and discard the old index.
df = df.reset_index(drop=True)

In [5]:
# Clean column names, assigned by position to the 15 remaining scraped columns.
scraped_names = [
    'Country/Region',
    'TotalCases', 'NewCases',
    'TotalDeaths', 'NewDeaths',
    'TotalRecovered', 'NewRecovered',
    'ActiveCases', 'Serious,Critical',
    'Tot Cases/1M pop', 'Deaths/1M pop',
    'TotalTests', 'Tests/1M pop',
    'Population', 'Continent',
]
df.columns = scraped_names

# Reorder: identifying columns first, then the statistics in their scraped order.
leading_names = ['Country/Region', 'Continent', 'Population']
remaining_names = [name for name in scraped_names if name not in leading_names]
df = df[leading_names + remaining_names]
df.sample(5)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop
140,Djibouti,Africa,994610,5780,10.0,61.0,,5641,6.0,78,,5811,61.0,97035,97561
141,Mayotte,Africa,275817,5708,92.0,53.0,,2964,,2691,4.0,20695,192.0,34115,123687
171,Mongolia,Asia,3302544,941,18.0,,,473,89.0,468,4.0,285,,471863,142879
62,Oman,Asia,5165792,127019,,1483.0,,119009,,6527,43.0,24588,287.0,571472,110626
208,Greenland,North America,56816,19,,,,18,,1,,334,,14482,254893


In [21]:
# fix data: keep the four exported columns, strip number formatting and
# harmonise country names.
pd.set_option('display.max_rows', 500)

# Source country name -> name written to the exported file.
# NOTE(review): "Congo" is mapped to "Congo (Kinshasa)"; confirm this is the
# intended country and not "Congo (Brazzaville)".
country_renames = {
    "USA": "United States",
    "UK": "United Kingdom",
    "Taiwan": "Taiwan Province of China",
    "Congo": "Congo (Kinshasa)",
    "Czechia": "Czech Republic",
    "S. Korea": "South Korea",
    "Palestine": "Palestinian Territories",
}

# rename here whatever we want
df = df.rename(columns={"Country/Region": "Country"})

# Explicit copy: the assignments below must modify this frame itself, not a
# view/temporary of the wider frame it was selected from.
df = df[['Country', 'Population', 'TotalCases', 'TotalDeaths']].copy()

for col in ['Population', 'TotalCases', 'TotalDeaths']:
    df[col] = (
        df[col]
        # drop thousands separators, '+' signs and spaces
        .str.replace('[,+ ]', '', regex=True)
        # 'N/A' becomes an empty string (turned into NaN below)
        .str.replace('N/A', '', regex=False)
    )

# replace empty strings with np.nan
df = df.replace('', np.nan)

# One whole-value replacement on the column. The previous
# df["Country"].loc[mask] = value form is chained-indexing assignment: it
# raises SettingWithCopyWarning and does not update df under pandas
# Copy-on-Write.
df["Country"] = df["Country"].replace(country_renames)

In [22]:
# Write the cleaned table to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='covid19_data.csv',
    index=False,
)