In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pages scraped below: the world table first, the India table later.
world_url, india_url = (
    "https://www.worldometers.info/coronavirus/",
    "https://www.mohfw.gov.in/",
)

## Web Scraping the World Dataset

In [None]:
# Fetch the world page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the notebook here on an HTTP error
# instead of letting the following cells parse an error page.
response = requests.get(world_url, timeout=30)
response.raise_for_status()
print(response)

In [None]:
# Parse the raw response bytes with Python's built-in HTML parser and show the page title.
soup = BeautifulSoup(markup=response.content, features='html.parser')
soup.title

In [None]:
# Print the text of every 'panel_front' div nested inside the first 'panel_flip' div.
divs = soup.find('div', class_='panel_flip')
front_panels = divs.find_all('div', class_='panel_front')
for panel in front_panels:
    print(panel.text)

In [None]:
# Collect every <table> element found in the parsed page.
coronatable = soup.find_all('table')

In [None]:
# Work with the first table in document order.
ct = coronatable[0]

In [None]:
# One list per field; the lists stay index-aligned (one entry per table row).
country, total_cases, new_cases = [], [], []
total_deaths, new_deaths = [], []
total_recovered, active_cases = [], []

# Drop the first 9 and the last 8 <tr> rows of the table.
# NOTE(review): presumably header/aggregate rows -- confirm against the live page.
rows = ct.find_all('tr')[9:-8]
for tr in rows:
    # Stripped text of every cell in this row, addressed by column position below.
    cells = [td.text.strip() for td in tr.find_all('td')]
    country.append(cells[1])
    # Thousands separators (and the leading '+' on daily figures) are removed
    # so the values can be converted to integers later.
    total_cases.append(cells[2].replace(',', ''))
    new_cases.append(cells[3].replace(',', '').replace('+', ''))
    total_deaths.append(cells[4].replace(',', ''))
    new_deaths.append(cells[5].replace(',', '').replace('+', ''))
    total_recovered.append(cells[6].replace(',', ''))
    active_cases.append(cells[8].replace(',', '').replace('+', ''))

print(country)
print(active_cases)

In [None]:
# Assemble the scraped lists into one frame, one row per scraped table row.
world_columns = ['Country','NewCases','ActiveCases','TotalRecovered','NewDeaths','TotalDeaths','TotalCases']
world_records = list(zip(country, new_cases, active_cases, total_recovered,
                         new_deaths, total_deaths, total_cases))
world_df = pd.DataFrame(world_records, columns=world_columns)

In [None]:
# Preview the first 20 rows of the scraped frame.
world_df.head(20)

## Data Cleaning on World Data

In [None]:
# Inspect the dtype of each column of the scraped frame.
world_df.dtypes

In [None]:
# Check for duplicated rows in the Country column

world_df.Country.duplicated().sum()

In [None]:
# Check for null values
# NOTE: every value was scraped via `.text.strip()`, so blank cells and 'N/A'
# are still plain strings here and are not counted by isnull().
world_df.isnull().sum()

In [None]:
# It can be seen from the dataset that it does have missing values
# Handling missing values by replacing them with NaN

world_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
world_df.replace('N/A', np.nan, inplace=True)
world_df

In [None]:
# Finding the number of NaN values in the dataset

world_df.isna().sum()

In [None]:
# Missing values in NewCases and NewDeaths can be filled with zeros.
#
# The result is assigned back to the frame. Calling an in-place method on
# `world_df.NewCases` acts on a column object pulled out of the frame (chained
# assignment), which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
world_df = world_df.fillna({'NewCases': 0, 'NewDeaths': 0})

In [None]:
# Display the frame in its current state.
world_df

In [None]:
# Recount the NaN values remaining in each column.
world_df.isna().sum()

In [None]:
# For ActiveCases, TotalRecovered and TotalDeaths
# the missing values are derived from the expression:
# TotalCases = ActiveCases + TotalRecovered + TotalDeaths
#
# Missing entries are detected with pd.isna() (an `is np.nan` identity test
# only matches the single np.nan object, not other NaN values), and values are
# written with .at[row, column] so the assignment lands on the frame itself
# rather than on a chained intermediate column object.
#
# NOTE(review): the ActiveCases and TotalDeaths fallbacks also add NewCases /
# NewDeaths, which is not part of the expression above. The arithmetic is kept
# unchanged here; confirm whether that is intended.

for i in world_df.index:
    if pd.isna(world_df.at[i, 'ActiveCases']):
        if pd.isna(world_df.at[i, 'TotalRecovered']) or pd.isna(world_df.at[i, 'TotalDeaths']):
            # Too little information to apply the expression: use the NewCases value.
            world_df.at[i, 'ActiveCases'] = int(world_df.at[i, 'NewCases'])
        else:
            world_df.at[i, 'ActiveCases'] = (
                int(world_df.at[i, 'TotalCases'])
                + int(world_df.at[i, 'NewCases'])
                - int(world_df.at[i, 'TotalDeaths'])
                - int(world_df.at[i, 'TotalRecovered'])
            )

    # ActiveCases is guaranteed to be filled from here on.
    if pd.isna(world_df.at[i, 'TotalRecovered']):
        if pd.isna(world_df.at[i, 'TotalDeaths']):
            world_df.at[i, 'TotalRecovered'] = 0
        else:
            world_df.at[i, 'TotalRecovered'] = (
                int(world_df.at[i, 'TotalCases'])
                - int(world_df.at[i, 'TotalDeaths'])
                - int(world_df.at[i, 'ActiveCases'])
            )

    # ActiveCases and TotalRecovered are both filled from here on.
    if pd.isna(world_df.at[i, 'TotalDeaths']):
        world_df.at[i, 'TotalDeaths'] = (
            int(world_df.at[i, 'TotalCases'])
            + int(world_df.at[i, 'NewDeaths'])
            - int(world_df.at[i, 'ActiveCases'])
            - int(world_df.at[i, 'TotalRecovered'])
        )

In [None]:
# Preview the first 20 rows of the frame.
world_df.head(20)

In [None]:
# Check if there are any more missing values

world_df.isna().sum()

In [None]:
# Every column except Country holds counts; cast each of them to int
# so they can be used in the calculations below.
count_columns = ['NewCases', 'ActiveCases', 'TotalRecovered',
                 'NewDeaths', 'TotalDeaths', 'TotalCases']
for count_column in count_columns:
    world_df[count_column] = world_df[count_column].astype(int)

In [None]:
# Inspect the column dtypes after the integer conversion cell.
world_df.dtypes

In [None]:
# Display the frame in its current state.
world_df

## Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's columns.
world_df.describe()

Drop 'NewCases' and 'NewDeaths' from the main dataframe as they are daily data and all other columns are cumulative data.

In [None]:
world_df = world_df.drop(columns=['NewDeaths', 'NewCases'])

In [None]:
world_df.sort_values(by=['TotalCases'],ascending=False,inplace=True)
world_df.head(30)

In [None]:
world_df_m = world_df.head(20)
world_df_m

Let's see the total numbers across the various categories, summed over all countries in the dataset.

In [None]:
# Bar chart of the totals summed over every row of world_df, one bar per category.
totals = {
    'Confirmed': world_df['TotalCases'].sum(),
    'ActiveCases': world_df['ActiveCases'].sum(),
    'Recovered': world_df['TotalRecovered'].sum(),
    'Deaths': world_df['TotalDeaths'].sum(),
}

fig, ax = plt.subplots(figsize=(15,5))

# Show plain numbers on the y-axis instead of scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Gap between the top of a bar and its value label, scaled to the tallest bar
# so the labels sit sensibly whatever the magnitude of the totals.
label_gap = 0.01 * max(totals.values())

for position, (label, total) in enumerate(totals.items()):
    # One bar() call per category so each bar gets its own colour from the cycle.
    ax.bar(label, total)
    # Categorical bars sit at x = 0, 1, 2, ... in insertion order.
    ax.text(position, total + label_gap, str(total),
            ha='center', va='bottom', fontweight='bold')

# Label the y-axis
ax.set_ylabel("Total Numbers")

# Plot title
ax.set_title('Total numbers across the world')

plt.show()

Plotting the 20 worst hit countries in terms of confirmed Covid cases.

In [None]:
# Top-20 frame indexed by country and ordered by total confirmed cases.
df = world_df_m.sort_values('TotalCases', ascending=False).set_index('Country').fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total confirmed cases as a function of country
ax.bar(df.index, df['TotalCases'])

# The categorical bar plot already labels the ticks with the country names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Confirmed Cases")

# Plot title
ax.set_title('Total confirmed cases - top 20 hit countries')

plt.show()

We see that the USA outweighs all other countries by a significant margin and dominates the plot, making it the worst-hit country. The boxplot below compares the distributions of active cases, deaths and recovered cases; the far-out outlier in it is the USA, which confirms how badly it has been hit.

In [None]:
# Tick label -> column of `df` shown as one box each, in this order.
box_columns = {
    'Active Cases': 'ActiveCases',
    'Total Deaths': 'TotalDeaths',
    'Total Recovered': 'TotalRecovered',
}

fig, ax = plt.subplots(figsize=(15,5))

# One box per category, comparing the distributions across the rows of `df`
ax.boxplot([df[column] for column in box_columns.values()])

# Name the boxes and label the value axis
ax.set_xticklabels(list(box_columns))
ax.set_ylabel("Number of cases")

# Plot title
ax.set_title('Distribution of various category of cases - top 20 hit countries')

plt.show()

Let's look at the total deaths that have occurred so far.

In [None]:
# Re-order the country-indexed frame by total deaths, largest first.
df = df.sort_values('TotalDeaths', ascending=False).fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total deaths as a function of country
ax.bar(df.index, df['TotalDeaths'])

# The categorical bar plot already labels the ticks with the country names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Deaths")

# Plot title
ax.set_title('Total deaths - top 20 hit countries')

plt.show()

Again, as expected USA is worst. Now, let's see the number of cases recovered so far from Covid-19

In [None]:
# Re-order the country-indexed frame by total recovered cases, largest first.
df = df.sort_values('TotalRecovered', ascending=False).fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total recovered cases as a function of country
ax.bar(df.index, df['TotalRecovered'])

# The categorical bar plot already labels the ticks with the country names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Recovered Cases")

# Plot title
ax.set_title('Total recovered cases - top 20 hit countries')

plt.show()

Though, as one can guess USA tops the list, other countries are not very far behind. This shows that compared to USA, other countries are doing better in terms of recovery.

We will also create a new column 'Dead_to_Recovered', which expresses 'TotalDeaths' as a percentage of 'TotalRecovered'.

In [None]:
# Deaths expressed as a percentage of recoveries, computed on a copy so
# world_df itself does not gain the extra column.
df_m = world_df.copy()

# A recovered count of zero is treated as missing, so the ratio becomes NaN
# instead of an infinite (or undefined 0/0) percentage.
recovered_nonzero = df_m['TotalRecovered'].replace(0, np.nan)
df_m['Dead_to_Recovered'] = 100 * df_m['TotalDeaths'] / recovered_nonzero

# Keep the first 20 rows (world_df was sorted by TotalCases in an earlier cell).
df_m = df_m.head(20)
df_m

Dead_to_Recovered compares how many people died with how many people recovered. It can be read as a rough indicator of people's immunity levels or of how effective the treatment received by infected people is.

In [None]:
# Country-indexed frame ordered by the dead-to-recovered percentage, largest first.
df_s = df_m.set_index('Country').sort_values('Dead_to_Recovered', ascending=False)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of dead to recovered as a function of country
ax.bar(df_s.index, df_s['Dead_to_Recovered'])

# The categorical bar plot already labels the ticks with the country names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("% dead against recovered")

# Plot title
ax.set_title('Number people dead for people recovered - top 20 hit countries')
plt.show()


We can see from the plot that most of the European and American countries are doing badly on recovery.

## Web Scraping the India Dataset

In [None]:
# Fetch the India page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the notebook here on an HTTP error
# instead of letting the following cells parse an error page.
response = requests.get(india_url, timeout=30)
response.raise_for_status()
print(response)

In [None]:
# Parse the latest response; `soup` is rebound to this new parse tree.
soup = BeautifulSoup(response.content,'html.parser')
soup.title

In [None]:
# First <table> element in the parsed page (`ct` is rebound as well).
ct = soup.find('table')

In [None]:
# One list per field; the lists stay index-aligned (one entry per table row).
state, total_cases, deaths, recovered, active_cases = [], [], [], [], []

# Use <tr> rows 1 through 34 of the table (row 0 is skipped).
# NOTE(review): the fixed upper bound presumably matches the number of
# states/UTs listed -- confirm against the live page.
rows = ct.find_all('tr')[1:35]
for tr in rows:
    # Stripped text of every cell in this row, addressed by column position below.
    cells = [td.text.strip() for td in tr.find_all('td')]
    state.append(cells[1])
    active_cases.append(cells[2])
    recovered.append(cells[3])
    deaths.append(cells[4])
    total_cases.append(cells[5])

print(state)

In [None]:
# Assemble the scraped lists into one frame, one row per scraped table row.
india_columns = ['State','ActiveCases','RecoveredCases','TotalDeaths','TotalCases']
india_records = list(zip(state, active_cases, recovered, deaths, total_cases))
india_df = pd.DataFrame(india_records, columns=india_columns)

In [None]:
# Display the scraped frame.
india_df

In [None]:
# Inspect the dtype of each column.
india_df.dtypes

In [None]:
# Check for duplicated values in the State column

india_df.State.duplicated().sum()

In [None]:
# Count null values per column.
# NOTE: the values were scraped via `.text.strip()`, so an empty cell would be
# an empty string here and would not be counted by isnull().
india_df.isnull().sum()

In [None]:
# Convert into integer from object type

india_df.ActiveCases = india_df.ActiveCases.astype(int)
india_df.RecoveredCases = india_df.RecoveredCases.astype(int)
india_df.TotalDeaths = india_df.TotalDeaths.astype(int)
india_df.TotalCases = india_df.TotalCases.astype(int)

In [None]:
india_df.dtypes

This dataset does not require cleaning

## Data Analysis

In [None]:
# Order the rows from most to fewest total cases and preview the top five.
india_df = india_df.sort_values(by=['TotalCases'], ascending=False)
india_df.head()

Let's look at the overall totals for India, summed across all states.

In [None]:
# Bar chart of the totals summed over every row of india_df, one bar per category.
totals = {
    'Confirmed': india_df['TotalCases'].sum(),
    'ActiveCases': india_df['ActiveCases'].sum(),
    'Recovered': india_df['RecoveredCases'].sum(),
    'Deaths': india_df['TotalDeaths'].sum(),
}

fig, ax = plt.subplots(figsize=(15,5))

# Show plain numbers on the y-axis instead of scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Gap between the top of a bar and its value label, scaled to the tallest bar
# so the labels sit sensibly whatever the magnitude of the totals.
label_gap = 0.01 * max(totals.values())

for position, (label, total) in enumerate(totals.items()):
    # One bar() call per category so each bar gets its own colour from the cycle.
    ax.bar(label, total)
    # Categorical bars sit at x = 0, 1, 2, ... in insertion order.
    ax.text(position, total + label_gap, str(total),
            ha='center', va='bottom', fontweight='bold')

# Label the y-axis
ax.set_ylabel("Total Numbers")

# Plot title
ax.set_title('Total numbers across India')

plt.show()

Plotting the Total number of Cases against the states

In [None]:
# State-indexed frame ordered by total confirmed cases, largest first.
df = india_df.sort_values('TotalCases', ascending=False).set_index('State').fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total confirmed cases as a function of state
ax.bar(df.index, df['TotalCases'])

# The categorical bar plot already labels the ticks with the state names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Confirmed Cases")

# Plot title
ax.set_title('Total confirmed cases in each State')

plt.show()

We see that Maharashtra outweighs all other states by a significant margin and dominates the plot, making it the worst-hit state. The boxplot below compares the distributions of active cases, deaths and recovered cases; the far-out outlier in it is Maharashtra, which confirms how badly it has been hit.

In [None]:
# Tick label -> column of `df` shown as one box each, in this order.
box_columns = {
    'Active Cases': 'ActiveCases',
    'Total Deaths': 'TotalDeaths',
    'Total Recovered': 'RecoveredCases',
}

fig, ax = plt.subplots(figsize=(15,5))

# One box per category, comparing the distributions across the rows of `df`
ax.boxplot([df[column] for column in box_columns.values()])

# Name the boxes and label the value axis
ax.set_xticklabels(list(box_columns))
ax.set_ylabel("Number of cases")

# Plot title
ax.set_title('Distribution of various category of cases in each State')

plt.show()

Let us look at the total deaths in each state

In [None]:
# Re-order the state-indexed frame by total deaths, largest first.
df = df.sort_values('TotalDeaths', ascending=False).fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total deaths as a function of state
ax.bar(df.index, df['TotalDeaths'])

# The categorical bar plot already labels the ticks with the state names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Deaths")

# Plot title
ax.set_title('Total deaths in each State')

plt.show()

Again, Maharashtra is the worst. Then we see the number of cases recovered so far from Covid-19

In [None]:
# Re-order the state-indexed frame by recovered cases, largest first.
df = df.sort_values('RecoveredCases', ascending=False).fillna(0)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of total recovered cases as a function of state
ax.bar(df.index, df['RecoveredCases'])

# The categorical bar plot already labels the ticks with the state names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("Total Recovered Cases")

# Plot title
ax.set_title('Total recovered cases in each State')

plt.show()

Though, as one can guess Maharashtra tops the list, other states like Delhi, Tamil Nadu, Gujarat are not very far behind. This shows that compared to Maharashtra, other states are doing better in terms of recovery.

We will also create a new column 'Dead_to_Recovered', which expresses 'TotalDeaths' as a percentage of 'RecoveredCases'.

In [None]:
# Deaths expressed as a percentage of recoveries, computed on a copy so
# india_df itself does not gain the extra column.
df_m = india_df.copy()

# A recovered count of zero is treated as missing, so the ratio becomes NaN
# instead of an infinite (or undefined 0/0) percentage.
recovered_nonzero = df_m['RecoveredCases'].replace(0, np.nan)
df_m['Dead_to_Recovered'] = 100 * df_m['TotalDeaths'] / recovered_nonzero
df_m

In [None]:
# State-indexed frame ordered by the dead-to-recovered percentage, largest first.
df_s = df_m.set_index('State').sort_values('Dead_to_Recovered', ascending=False)

fig, ax = plt.subplots(figsize=(15,5))

# Plot a bar-chart of dead to recovered as a function of state
ax.bar(df_s.index, df_s['Dead_to_Recovered'])

# The categorical bar plot already labels the ticks with the state names;
# only rotate them. (set_xticklabels is meant to be used after the tick
# positions have been fixed with set_xticks, which this cell never did.)
ax.tick_params(axis='x', labelrotation=90)

# Set the y-axis label
ax.set_ylabel("% dead against recovered")

# Plot title
ax.set_title('Number people dead for people recovered in each State')
plt.show()