In [None]:
# Import packages.
# Analyse the data.
import pandas as pd

# Get data from the internet.
import requests

# Parse data with BeautifulSoup.
from bs4 import BeautifulSoup

In [None]:
# Create a url variable.
url = 'https://www.worldometers.info/coronavirus/'

# Create a requests variable.
r = requests.get(url)

# Make contact with the website.
if r.status_code == 200:
    html_doc = r.text
    
# Get a BeautifulSoup object.
soup = BeautifulSoup(html_doc)

# Print the output.
print(soup.prettify())

In [None]:
# Extract the contents of the table with the table id. 
table = soup.find('table', attrs={'id': 'main_table_countries_today'})

table

In [None]:
# Specify BeautifulSoup to go through the table and find everything 
# with a tr tag.
# Note: th = (table header), tr = (table row), and td = table column
# , attrs={'style': ""})
rows = table.find_all('tr', attrs={'style': ""})
rows

In [None]:
# Store the extracted data.
output = []

column_names = ['Country,Other', 'Total Cases', 'New Cases', 'Total Deaths',
               'New Deaths', 'Total Recovered', 'New Recovered',
               'Active Cases', 'Serious, Critical', 'Tot Cases/ 1M pop',
               'Deaths/ 1M pop', 'Total Tests', 'Tests/ 1M pop', 'Population']

# Create a for loop statement.
for cases in rows:
    cases_data = cases.find_all("td")
    if cases_data:
        # Extract the text within each element.
        cases_text = [td.text for td in cases_data]
        output.append(dict(zip(column_names, cases_text)))
        
# Create an output.
output

In [None]:
# Create a DataFrame directly from the output.
data = pd.DataFrame(output)

# View the DataFrame.
data.head()

In [None]:
# Save the DataFrame as a CSV file without index.
data.to_csv('main_table_countries_today.csv', index=False)

In [None]:
# Create a JSON file.
import json

# Create a JSON file.
output_json = json.dumps(output)

# View the output.
output_json

In [None]:
# Save the JSON file to .json.
with open('main_table_countries_today.json', 'w') as f:
    json.dump(output, f)

In [None]:
# Open the JSON file with Pandas.
data = pd.read_json('main_table_countries_today.json')

# View the DataFrame.
data.head()

In [None]:
# Import the CSV file with Pandas.
# Data = pd.read_json('countries.json').
data = pd.read_csv('main_table_countries_today.csv')

# View.
data.head()

In [None]:
# Read JSON using Pandas, output to .csv.
pd.read_json(output_json).to_csv('main_table_countries_today.csv', index=False)

In [2]:
import pandas as pd

Import the CSV and JSON files and view them with the head() method.

In [3]:
# Import and read the CSV file.
data_csv = pd.read_csv('main_table_countries_today.csv')

# View the data.
print(data_csv.head())

# Import and read the JSON file.
data_json = pd.read_json('main_table_countries_today.json')

# View the DataFrame. 
data_json.head()

   Country,Other Total Cases    New Cases Total Deaths  New Deaths  \
0            NaN       World  551,959,804     +210,158   6,356,640   
1            1.0         USA   89,236,449          NaN  1,042,291    
2            2.0       India   43,452,164          NaN    525,116    
3            3.0      Brazil   32,283,345          NaN    671,194    
4            4.0      France   30,950,513          NaN    149,491    

   Total Recovered New Recovered Active Cases Serious, Critical  \
0            433.0   527,429,333     +193,851        18,173,831   
1              NaN    84,810,554          NaN         3,383,604   
2              NaN    42,822,493          NaN           104,555   
3              NaN    30,764,923          NaN           847,228   
4              NaN    29,603,879          NaN         1,197,143   

  Tot Cases/ 1M pop Deaths/ 1M pop Total Tests  Tests/ 1M pop Population  
0            36,837         70,811       815.5            NaN        NaN  
1             3,323       

Unnamed: 0,"Country,Other",Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",Tot Cases/ 1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population
0,,World,551959804,210158.0,6356640,433.0,527429333,193851.0,18173831,36837,70811,815.5,,
1,1.0,USA,89236449,,1042291,,84810554,,3383604,3323,266488,3113.0,1050907600.0,3138339.0
2,2.0,India,43452164,,525116,,42822493,,104555,698,30883,373.0,862375489.0,612928.0
3,3.0,Brazil,32283345,,671194,,30764923,,847228,8318,149763,3114.0,63776166.0,295858.0
4,4.0,France,30950513,,149491,,29603879,,1197143,869,472090,2280.0,271490188.0,4141057.0


View the data types and column names of both DataFrames

In [None]:
# View the CSV and JSON DataFrames.
print(data_csv.dtypes)
print(data_csv.columns)

print(data_json.dtypes)
print(data_json.columns)

Create a subset consisting of the columns Country,Other, Total Cases, Total Deaths, Total Recovered, Active Cases, Serious, and Critical.

In [4]:
# Create a subset.
data_report = data_csv[['Country,Other', 'Total Cases', 'Total Deaths',
                        'Total Recovered', 'Active Cases', 'Serious, Critical']]

# View the column names.
print(data_report.columns)
data_report

Index(['Country,Other', 'Total Cases', 'Total Deaths', 'Total Recovered',
       'Active Cases', 'Serious, Critical'],
      dtype='object')


Unnamed: 0,"Country,Other",Total Cases,Total Deaths,Total Recovered,Active Cases,"Serious, Critical"
0,,World,+210158,433.0,+193851,18173831
1,1.0,USA,,,,3383604
2,2.0,India,,,,104555
3,3.0,Brazil,,,,847228
4,4.0,France,,,,1197143
...,...,...,...,...,...,...
221,222.0,Micronesia,,,,5
222,226.0,Niue,,,,1
223,228.0,Tuvalu,,,,3
224,230.0,China,+33,,+65,432


Determine the number of missing values in the new DataFrame

In [5]:
# Determine missing values.
data_report.isnull().sum()

Country,Other          2
Total Cases            0
Total Deaths         173
Total Recovered      197
Active Cases         176
Serious, Critical     15
dtype: int64

Save the new DataFrame as a CSV file.
Import and view the new DataFrame with the head() method.

In [6]:
# Save the DataFrame as a CSV file without index.
data_report.to_csv('cases_report.csv', index=False)

In [None]:
# View the saved CSV.
cases_report = pd.read_csv('cases_report.csv')

# View the DataFrame.
cases_report.head()