##### Collecting Latest COVID-19 data

In [None]:
# Setting ast_node_interactivity to "all" lets a single Jupyter notebook cell
# show multiple outputs (one per expression statement) instead of only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<pre>
OPTIONAL:  (can be used in requests.get())
<i>
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36", "X-Requested-With": "XMLHttpRequest"
}
r = requests.get(url, headers=header)
</i></pre>

In [None]:
import requests
import pandas as pd

def getRenamedColumnList():
    """Return the 14 column labels, in table order, used to rename the COVID-19 table.

    A new list is built on every call, so callers may modify the result freely.
    """
    # The '#' (first) and 'Population' (last) columns were added on 17-May-2020.
    # The number of fields keeps changing over time, so this method has to be
    # revisited regularly.
    leading = ['#', 'Country']
    counts = [
        'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths',
        'Total Recovered', 'Active Cases', 'Serious,Critical',
    ]
    rates_and_tests = [
        'Tot Cases/1M pop', 'Deaths/1M pop', 'Total Tests', 'Tests/1M pop',
    ]
    trailing = ['Population']
    return leading + counts + rates_and_tests + trailing
    

In [None]:
def getCovidDataFromWorldometers(url, timeout=30):
    """Download the COVID-19 table at *url* and return it as a cleaned DataFrame.

    The first HTML table of the page is used. Its columns are renamed with
    getRenamedColumnList(), 'New Cases' and 'New Deaths' are converted to
    float, the trailing 'Total:' row is dropped and the rows are sorted by
    'Total Cases' in decreasing order.

    Args:
        url: Address of the page that contains the table.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame sorted by 'Total Cases', largest first.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
        ValueError: if the table does not have exactly as many columns as
            getRenamedColumnList() provides names for.
    """
    from io import StringIO  # local import: only this function needs it

    # Without a timeout the call can hang forever on an unresponsive server;
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()

    # Passing literal HTML text to read_html is deprecated in newer pandas
    # releases; a file-like wrapper works on old and new versions alike.
    dfs = pd.read_html(StringIO(resp.text))
    df = dfs[0]  # The first html table contains our data
    print(df.columns)

    renamed = getRenamedColumnList()
    if len(df.columns) != len(renamed):
        raise ValueError(
            'Expected %d columns but the table has %d; '
            'getRenamedColumnList() needs to be updated'
            % (len(renamed), len(df.columns))
        )
    df.columns = renamed

    # Remove comma and plus characters from 'New Cases' and 'New Deaths', as
    # both appear there, then convert from object to float (converting to
    # 'int' would fail for NaN values). regex=False makes sure '+' is always
    # treated as a literal character regardless of the pandas version.
    for column in ('New Cases', 'New Deaths'):
        if df[column].dtype == 'object':
            cleaned = (
                df[column]
                .str.replace(',', '', regex=False)
                .str.replace('+', '', regex=False)
            )
            df[column] = cleaned.astype(str).astype(float)

    # Remove the last record (Total:) as it is already available at top with 'World'
    df = df[df['Country'] != 'Total:']

    # Sort the dataframe in decreasing order of Total Cases
    return df.sort_values('Total Cases', ascending=False)

In [None]:
def main():
    """Fetch the worldwide COVID-19 table and save it as an Excel workbook.

    The data is written to 'Covid19-Data-Worldwide.xlsx' (sheet 'COVID-DATA').

    Returns:
        The DataFrame that was written, so interactive callers can keep and
        inspect it, e.g. ``dfCovid = main()``.
    """
    covid_url = 'https://www.worldometers.info/coronavirus/'
    # covid_india_url = 'https://www.mygov.in/corona-data/covid19-statewise-status'

    dfCovid = getCovidDataFromWorldometers(covid_url)
    # Output the data as an xlsx file skipping the row index 0,1,2,...
    dfCovid.to_excel('Covid19-Data-Worldwide.xlsx', sheet_name='COVID-DATA', index=False)
    return dfCovid

In [None]:
# Call main() only when this code runs as the top-level program
# (not when it is imported as a module).
if __name__ == '__main__':
    main()

In [None]:
# Show the dtype of each column of the collected data.
# NOTE(review): in the visible cells dfCovid is only assigned as a local
# variable inside main(), so this cell raises NameError unless dfCovid has
# been defined at notebook scope some other way -- confirm.
dfCovid.dtypes

In [None]:
# IPython magic: execute the collect_covid_data.py script from this notebook.
# NOTE(review): presumably the script version of the cells above -- confirm.
%run collect_covid_data.py