In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Pages scraped below: Worldometer's coronavirus page (world figures) and the
# MoHFW home page (India figures).
world_url = 'https://www.worldometers.info/coronavirus/'
india_url = 'https://www.mohfw.gov.in/'

## Web Scraping the World Dataset

In [3]:
# Download the Worldometer page.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): stop here with a clear HTTP error instead of letting the
#   parsing cells below fail on an error page.
response = requests.get(world_url, timeout=30)
response.raise_for_status()
print(response)

<Response [200]>


In [4]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend and
# display the page's <title> element.
soup = BeautifulSoup(response.content,'html.parser')
soup.title

<title>Coronavirus Update (Live): 8,602,376 Cases and 456,802 Deaths from COVID-19 Virus Pandemic - Worldometer</title>

In [5]:
# Take the first <div class="panel_flip"> on the page and print the text of each
# <div class="panel_front"> nested inside it.
divs = soup.find('div', class_='panel_flip')
front_panels = divs.find_all('div', class_='panel_front')
for panel in front_panels:
    print(panel.text)


3,593,083
Currently Infected Patients


3,538,447 (98%)
in Mild Condition

54,636 (2%)
Serious or Critical


Show Graph



In [6]:
# Collect every <table> element found in the parsed page.
coronatable = soup.find_all('table')

In [7]:
# Work with the first table returned by find_all.
ct = coronatable[0]

In [8]:
# Pull the per-country figures out of the selected table, one list per field.
# Values stay strings: commas are removed from every numeric field, and any '+'
# is removed as well from the new-cases, new-deaths and active-cases fields.
country, total_cases, new_cases = [], [], []
total_deaths, new_deaths = [], []
total_recovered, active_cases = [], []

strip_commas = str.maketrans('', '', ',')
strip_commas_and_plus = str.maketrans('', '', ',+')

# (destination list, <td> position within the row, characters to delete or None)
field_plan = (
    (country, 1, None),
    (total_cases, 2, strip_commas),
    (new_cases, 3, strip_commas_and_plus),
    (total_deaths, 4, strip_commas),
    (new_deaths, 5, strip_commas_and_plus),
    (total_recovered, 6, strip_commas),
    (active_cases, 8, strip_commas_and_plus),
)

# NOTE(review): the [9:-8] slice presumably drops header/summary rows around the
# country rows - verify against the live table layout.
rows = ct.find_all('tr')[9:-8]
for tr in rows:
    cells = tr.find_all('td')
    for bucket, position, unwanted in field_plan:
        text = cells[position].text.strip()
        bucket.append(text if unwanted is None else text.translate(unwanted))

print(country)
print(active_cases)

['USA', 'Brazil', 'Russia', 'India', 'UK', 'Spain', 'Peru', 'Italy', 'Chile', 'Iran', 'Germany', 'Turkey', 'Mexico', 'Pakistan', 'France', 'Saudi Arabia', 'Bangladesh', 'Canada', 'Qatar', 'South Africa', 'Belgium', 'Colombia', 'Belarus', 'Sweden', 'Egypt', 'Netherlands', 'Ecuador', 'Indonesia', 'UAE', 'Singapore', 'Portugal', 'Kuwait', 'Argentina', 'Ukraine', 'Poland', 'Switzerland', 'Philippines', 'Afghanistan', 'Oman', 'Iraq', 'Ireland', 'Dominican Republic', 'Romania', 'Panama', 'Bolivia', 'Bahrain', 'Israel', 'Armenia', 'Nigeria', 'Japan', 'Austria', 'Kazakhstan', 'Moldova', 'Ghana', 'Serbia', 'Denmark', 'S. Korea', 'Guatemala', 'Algeria', 'Azerbaijan', 'Honduras', 'Cameroon', 'Czechia', 'Morocco', 'Norway', 'Malaysia', 'Sudan', 'Nepal', 'Australia', 'Finland', 'Ivory Coast', 'Uzbekistan', 'Senegal', 'DRC', 'Tajikistan', 'Haiti', 'Guinea', 'North Macedonia', 'Djibouti', 'Gabon', 'El Salvador', 'Kenya', 'Luxembourg', 'Hungary', 'Ethiopia', 'Bulgaria', 'Venezuela', 'Greece', 'Bosnia 

In [9]:
# Assemble the scraped lists into one frame, one row per country. The dict keys
# are the column names, in display order; rows are built by zipping the lists.
world_columns = {
    'Country': country,
    'NewCases': new_cases,
    'ActiveCases': active_cases,
    'TotalRecovered': total_recovered,
    'NewDeaths': new_deaths,
    'TotalDeaths': total_deaths,
    'TotalCases': total_cases,
}
world_df = pd.DataFrame(list(zip(*world_columns.values())), columns=list(world_columns))

In [10]:
# Preview the first 20 rows of the scraped frame.
world_df.head(20)

Unnamed: 0,Country,NewCases,ActiveCases,TotalRecovered,NewDeaths,TotalDeaths,TotalCases
0,USA,105.0,1211989.0,931079.0,,120688,2263756
1,Brazil,,415130.0,520360.0,,47869,983359
2,Russia,7972.0,236816.0,324406.0,181.0,7841,569063
3,India,448.0,163688.0,205245.0,2.0,12606,381539
4,UK,,,,,42288,300469
5,Spain,,,,,27136,292348
6,Peru,,105737.0,131190.0,,7461,244388
7,Italy,,23101.0,180544.0,,34514,238159
8,Chile,,34821.0,186441.0,,3841,225103
9,Iran,,31384.0,156991.0,,9272,197647


## Data Cleaning on World Data

In [11]:
# Inspect the column dtypes; the scraped values were stored as strings.
world_df.dtypes

Country           object
NewCases          object
ActiveCases       object
TotalRecovered    object
NewDeaths         object
TotalDeaths       object
TotalCases        object
dtype: object

In [12]:
# Count Country values that repeat an earlier row (0 means every country
# appears exactly once)

world_df.Country.duplicated().sum()

0

In [13]:
# Count null values per column. Empty scraped cells are still empty strings at
# this point, and isnull() does not count those.
world_df.isnull().sum()

Country           0
NewCases          0
ActiveCases       0
TotalRecovered    0
NewDeaths         0
TotalDeaths       0
TotalCases        0
dtype: int64

In [14]:
# Cells that are empty (or whitespace only) and cells holding the literal 'N/A'
# are replaced with NaN so that pandas recognises them as missing values.
blank_pattern = r'^\s*$'
world_df = world_df.replace(blank_pattern, np.nan, regex=True).replace('N/A', np.nan)
world_df

Unnamed: 0,Country,NewCases,ActiveCases,TotalRecovered,NewDeaths,TotalDeaths,TotalCases
0,USA,105,1211989,931079,,120688,2263756
1,Brazil,,415130,520360,,47869,983359
2,Russia,7972,236816,324406,181,7841,569063
3,India,448,163688,205245,2,12606,381539
4,UK,,,,,42288,300469
5,Spain,,,,,27136,292348
6,Peru,,105737,131190,,7461,244388
7,Italy,,23101,180544,,34514,238159
8,Chile,,34821,186441,,3841,225103
9,Iran,,31384,156991,,9272,197647


In [15]:
# Count the NaN values in each column

world_df.isna().sum()

Country             0
NewCases          166
ActiveCases         4
TotalRecovered      5
NewDeaths         189
TotalDeaths        30
TotalCases          0
dtype: int64

In [16]:
# Missing values in NewCases and NewDeaths are filled with zeros.
#
# The columns are reassigned explicitly rather than calling
# world_df.NewCases.replace(..., inplace=True): an in-place call on a column
# picked off the frame operates on an intermediate Series, which pandas warns
# about and which leaves world_df unchanged when Copy-on-Write is enabled.
daily_columns = ['NewCases', 'NewDeaths']
world_df[daily_columns] = world_df[daily_columns].fillna(0)

In [17]:
# Display the frame after the zero-fill of NewCases and NewDeaths.
world_df

Unnamed: 0,Country,NewCases,ActiveCases,TotalRecovered,NewDeaths,TotalDeaths,TotalCases
0,USA,105,1211989,931079,0,120688,2263756
1,Brazil,0,415130,520360,0,47869,983359
2,Russia,7972,236816,324406,181,7841,569063
3,India,448,163688,205245,2,12606,381539
4,UK,0,,,0,42288,300469
5,Spain,0,,,0,27136,292348
6,Peru,0,105737,131190,0,7461,244388
7,Italy,0,23101,180544,0,34514,238159
8,Chile,0,34821,186441,0,3841,225103
9,Iran,0,31384,156991,0,9272,197647


In [18]:
# Re-count the NaN values per column.
world_df.isna().sum()

Country            0
NewCases           0
ActiveCases        4
TotalRecovered     5
NewDeaths          0
TotalDeaths       30
TotalCases         0
dtype: int64

In [19]:
# Fill the remaining gaps in ActiveCases, TotalRecovered and TotalDeaths row by
# row from the counts that are present. The guiding identity is:
#     TotalCases = ActiveCases + TotalRecovered + TotalDeaths
#
# Cells are read and written with world_df.at[row, column]. Chained writes such
# as world_df.ActiveCases[i] = ... go through an intermediate Series, trigger
# SettingWithCopyWarning and do not update world_df under Copy-on-Write.
# Missing values are detected with pd.isna() instead of `is np.nan`, which only
# matches the np.nan singleton object itself.
#
# NOTE(review): the NewCases / NewDeaths terms below are carried over unchanged
# although they are not part of the identity above - confirm they are intended.

def _count(row_label, column):
    """Return the value stored at (row_label, column) of world_df as an int."""
    return int(world_df.at[row_label, column])


for row_label in world_df.index:
    # Neither of these two cells is written before its own branch runs, so the
    # checks can be taken once up front.
    recovered_missing = pd.isna(world_df.at[row_label, 'TotalRecovered'])
    deaths_missing = pd.isna(world_df.at[row_label, 'TotalDeaths'])

    if pd.isna(world_df.at[row_label, 'ActiveCases']):
        if recovered_missing or deaths_missing:
            # Two unknowns: the identity cannot be solved, fall back to NewCases.
            world_df.at[row_label, 'ActiveCases'] = _count(row_label, 'NewCases')
        else:
            world_df.at[row_label, 'ActiveCases'] = (
                _count(row_label, 'TotalCases')
                + _count(row_label, 'NewCases')
                - _count(row_label, 'TotalDeaths')
                - _count(row_label, 'TotalRecovered')
            )

    if recovered_missing:
        if deaths_missing:
            world_df.at[row_label, 'TotalRecovered'] = 0
        else:
            world_df.at[row_label, 'TotalRecovered'] = (
                _count(row_label, 'TotalCases')
                - _count(row_label, 'TotalDeaths')
                - _count(row_label, 'ActiveCases')
            )

    if deaths_missing:
        # ActiveCases and TotalRecovered are guaranteed to be filled by now.
        world_df.at[row_label, 'TotalDeaths'] = (
            _count(row_label, 'TotalCases')
            + _count(row_label, 'NewDeaths')
            - _count(row_label, 'ActiveCases')
            - _count(row_label, 'TotalRecovered')
        )

In [20]:
# Preview the first 20 rows after the missing counts have been filled in.
world_df.head(20)

Unnamed: 0,Country,NewCases,ActiveCases,TotalRecovered,NewDeaths,TotalDeaths,TotalCases
0,USA,105,1211989,931079,0,120688,2263756
1,Brazil,0,415130,520360,0,47869,983359
2,Russia,7972,236816,324406,181,7841,569063
3,India,448,163688,205245,2,12606,381539
4,UK,0,0,258181,0,42288,300469
5,Spain,0,0,265212,0,27136,292348
6,Peru,0,105737,131190,0,7461,244388
7,Italy,0,23101,180544,0,34514,238159
8,Chile,0,34821,186441,0,3841,225103
9,Iran,0,31384,156991,0,9272,197647


In [21]:
# Check whether any missing values remain (every count should now be 0)

world_df.isna().sum()

Country           0
NewCases          0
ActiveCases       0
TotalRecovered    0
NewDeaths         0
TotalDeaths       0
TotalCases        0
dtype: int64

In [22]:
# Cast every column other than Country to int so the counts can be used in
# arithmetic later on.
count_columns = [
    'NewCases',
    'ActiveCases',
    'TotalRecovered',
    'NewDeaths',
    'TotalDeaths',
    'TotalCases',
]
for column_name in count_columns:
    world_df[column_name] = world_df[column_name].astype(int)

In [23]:
# Confirm the dtypes after the integer conversion.
world_df.dtypes

Country           object
NewCases           int32
ActiveCases        int32
TotalRecovered     int32
NewDeaths          int32
TotalDeaths        int32
TotalCases         int32
dtype: object

In [24]:
# Display the cleaned world frame.
world_df

Unnamed: 0,Country,NewCases,ActiveCases,TotalRecovered,NewDeaths,TotalDeaths,TotalCases
0,USA,105,1211989,931079,0,120688,2263756
1,Brazil,0,415130,520360,0,47869,983359
2,Russia,7972,236816,324406,181,7841,569063
3,India,448,163688,205245,2,12606,381539
4,UK,0,0,258181,0,42288,300469
5,Spain,0,0,265212,0,27136,292348
6,Peru,0,105737,131190,0,7461,244388
7,Italy,0,23101,180544,0,34514,238159
8,Chile,0,34821,186441,0,3841,225103
9,Iran,0,31384,156991,0,9272,197647


## Web Scraping the India Dataset

In [25]:
# Download the MoHFW page.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): stop here with a clear HTTP error instead of letting the
#   parsing cells below fail on an error page.
response = requests.get(india_url, timeout=30)
response.raise_for_status()
print(response)

<Response [200]>


In [26]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend and
# display the page's <title> element.
soup = BeautifulSoup(response.content,'html.parser')
soup.title

<title>MoHFW | Home</title>

In [27]:
# Take the first <table> element on the page.
ct = soup.find('table')

In [28]:
# Collect the per-state figures from table rows 1 to 34, one list per field.
# NOTE(review): row 0 is presumably the header and rows from 35 on summary
# lines - verify against the live table layout.
state, active_cases, recovered, deaths, total_cases = [], [], [], [], []

# (destination list, <td> position within the row)
field_plan = (
    (state, 1),
    (active_cases, 2),
    (recovered, 3),
    (deaths, 4),
    (total_cases, 5),
)

rows = ct.find_all('tr')[1:35]
for tr in rows:
    cells = tr.find_all('td')
    for bucket, position in field_plan:
        bucket.append(cells[position].text.strip())

print(state)

['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttarakhand', 'Uttar Pradesh']


In [29]:
# Assemble the scraped lists into one frame, one row per state. The dict keys
# are the column names, in display order; rows are built by zipping the lists.
india_columns = {
    'State': state,
    'ActiveCases': active_cases,
    'RecoveredCases': recovered,
    'TotalDeaths': deaths,
    'TotalCases': total_cases,
}
india_df = pd.DataFrame(list(zip(*india_columns.values())), columns=list(india_columns))

In [30]:
# Display the scraped India frame.
india_df

Unnamed: 0,State,ActiveCases,RecoveredCases,TotalDeaths,TotalCases
0,Andaman and Nicobar Islands,11,33,0,44
1,Andhra Pradesh,3637,3789,92,7518
2,Arunachal Pradesh,93,10,0,103
3,Assam,2114,2654,9,4777
4,Bihar,1925,5056,44,7025
5,Chandigarh,62,306,6,374
6,Chhattisgarh,708,1228,10,1946
7,Dadra and Nagar Haveli and Daman and Diu,45,13,0,58
8,Delhi,26669,21341,1969,49979
9,Goa,596,109,0,705


In [31]:
# Inspect the column dtypes; the scraped values were stored as strings.
india_df.dtypes

State             object
ActiveCases       object
RecoveredCases    object
TotalDeaths       object
TotalCases        object
dtype: object

In [32]:
# Count State values that repeat an earlier row (0 means every state appears
# exactly once)

india_df.State.duplicated().sum()

0

In [33]:
# Count null values per column.
india_df.isnull().sum()

State             0
ActiveCases       0
RecoveredCases    0
TotalDeaths       0
TotalCases        0
dtype: int64

In [34]:
# Convert the count columns from object (string) to integer.
# Each column is converted from its own values; in particular TotalDeaths must
# come from TotalDeaths - taking it from TotalCases would replace every death
# count with the state's case count.
for column_name in ['ActiveCases', 'RecoveredCases', 'TotalDeaths', 'TotalCases']:
    india_df[column_name] = india_df[column_name].astype(int)

In [35]:
# Confirm the dtypes after the integer conversion.
india_df.dtypes

State             object
ActiveCases        int32
RecoveredCases     int32
TotalDeaths        int32
TotalCases         int32
dtype: object

In [36]:
# This dataset does not require cleaning