<a href="https://colab.research.google.com/github/dsharjeel/covid19_trend_analysis/blob/main/Covid19_trend_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Keypoints in this project
# Data Collection
# Data Pre Processing
# EDA
# Conclusions

In [1]:
from bs4 import BeautifulSoup as bs
from datetime import time, datetime, timedelta
from urllib.request import Request, urlopen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import gc

import warnings
warnings.filterwarnings("ignore")

## Fetching Data from Web

In [2]:
today = datetime.now()
yesterday_str = "%s %d,%d" %(today.today().strftime("%b"), today.day-1, today.year)
yesterday_str

'Apr 3,2023'

In [3]:
url = "https://www.worldometers.info/coronavirus/#countries"
req = Request(url, headers = {'User-Agent' : 'Mozilla/5.0'})
webpage = urlopen(req)
page_soup = bs(webpage, "html.parser")

## Data Processing and Cleaning

In [4]:
table = page_soup.findAll("table", {"id": "main_table_countries_yesterday"})

containers = table[0].findAll("tr", {"style": ""})
title = containers[0]

del containers[0]

all_data = []
clean = True

for country in containers:
    country_data = []
    country_container = country.findAll("td")
    
    if country_container[1].text == "China":
        continue
    for i in range(1, len(country_container)):
        final_feature = country_container[i].text
        if clean:
            if i != 1 and i != len(country_container)-1:
                final_feature = final_feature.replace(",", "")
                
                if final_feature.find('+') != -1:
                    final_feature = final_feature.replace("+", "")
                    final_feature = float(final_feature)
                elif final_feature.find("-") != -1:
                    final_feature = final_feature.replace("-", "")
                    final_feature = float(final_feature)*-1
        if final_feature == 'N/A':
            final_feature = 0
        elif final_feature == "" or final_feature == " ":
            final_feature = -1
        
        country_data.append(final_feature)
        
    all_data.append(country_data)

## Creating DataFrame in Pandas

In [5]:
df = pd.DataFrame(all_data)
df.drop([15,16,17,18,19,20], inplace= True, axis = 1)

column_labels = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered",
                 "New Recovered", "Active Cases", "Serious/Critical", "Total Cases/1M", "Deaths/1M",
                "Total Tests", "Tests/1M", "Population", "Continent"]
df.columns = column_labels

df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent
0,World,684127321,47448.0,6832667,362.0,657109277,78978.0,20185377,39905,87767,876.6,-1,-1,-1,All
1,USA,106253348,1761.0,1155356,10.0,104031817,18798.0,1066175,1984,317359,3451.0,1174666956,3508508,334805269,North America
2,India,44722605,-1.0,530881,-1.0,44173335,-1.0,18389,0,31794,377.0,921676372,655236,1406631776,Asia
3,France,39807735,1453.0,165736,58.0,39501424,9937.0,140575,869,606968,2527.0,271490188,4139547,65584518,Europe
4,Germany,38359611,4985.0,171059,95.0,38089400,8500.0,99152,0,457296,2039.0,122332384,1458359,83883596,Europe


In [6]:
for label in df.columns:
    if label != 'Country' and label != 'Continent':
        df[label] = pd.to_numeric(df[label])

In [7]:
df["%Inc Cases"] = df["New Cases"]/df["Total Cases"]*100
df["%Inc Deaths"] = df["New Deaths"]/df["Total Deaths"]*100
df["%Inc Recovered"] = df["New Recovered"]/df["Total Recovered"]*100

In [8]:
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,%Inc Cases,%Inc Deaths,%Inc Recovered
0,World,684127321,47448.0,6832667,362.0,657109277,78978.0,20185377,39905,87767.0,876.6,-1,-1,-1,All,0.006936,0.005298,0.012019
1,USA,106253348,1761.0,1155356,10.0,104031817,18798.0,1066175,1984,317359.0,3451.0,1174666956,3508508,334805269,North America,0.001657,0.000866,0.018069
2,India,44722605,-1.0,530881,-1.0,44173335,-1.0,18389,0,31794.0,377.0,921676372,655236,1406631776,Asia,-2e-06,-0.000188,-2e-06
3,France,39807735,1453.0,165736,58.0,39501424,9937.0,140575,869,606968.0,2527.0,271490188,4139547,65584518,Europe,0.00365,0.034995,0.025156
4,Germany,38359611,4985.0,171059,95.0,38089400,8500.0,99152,0,457296.0,2039.0,122332384,1458359,83883596,Europe,0.012995,0.055536,0.022316


## EDA

In [9]:
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df["Percentage"] = np.round(100*cases_df["Total"]/np.sum(cases_df["Total"]),2)
cases_df["virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "virus", y = "Percentage", color = "Type", hover_data = ["Total"])
fig.show()

In [10]:
cases = df[["New Cases", "New Recovered", "New Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df["Percentage"] = np.round(100*cases_df["Total"]/np.sum(cases_df["Total"]),2)
cases_df["virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "virus", y = "Percentage", color = "Type", hover_data = ["Total"])
fig.show()

In [11]:
per = np.round(df[["%Inc Cases", "%Inc Deaths", "%Inc Recovered"]].loc[0], 2)

per_df = pd.DataFrame(per)
per_df.columns = ["Percentage"]

fig = go.Figure()

fig.add_trace(go.Bar(x = per_df.index, y = per_df["Percentage"], marker_color = [ "Yellow", "Blue", "Red"]))
fig.show()

## Continent

In [12]:
continent_df = df.groupby("Continent").sum().drop("All")
continent_df = continent_df.reset_index()
continent_df

Unnamed: 0,Continent,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,%Inc Cases,%Inc Deaths,%Inc Recovered
0,Africa,12801864,1283.0,258559,-54.0,10952761,235.0,376522,516,2488133.0,18485.0,110808928,10905528,1402440339,0.970033,87.42956,-48.717487
1,Asia,209970392,13760.0,1532369,-2.0,177891628,10335.0,13387886,1690,7601215.0,34534.0,2221579025,92864574,3217529895,0.333222,-8.406057,0.185537
2,Australia/Oceania,14018291,1722.0,26712,-17.0,13781644,1587.0,82364,48,4629466.0,8817.0,88554101,20825132,43470408,-20.223692,92.173117,199.541962
3,Europe,247932282,24026.0,2031294,254.0,243452093,40309.0,2167411,2679,18670324.0,125507.0,2827797262,214274690,747543038,0.589673,-10.212575,0.594805
4,North America,125725231,5312.0,1622501,-13.0,121181626,24250.0,2017289,2336,8830300.0,58506.0,1312171562,101484787,598140916,0.254001,-112.080478,0.209626
5,South America,58287812,1181.0,1222127,-2.0,55746736,2108.0,448817,1489,1830075.0,30180.0,207686554,10716873,391680670,0.057825,-0.427131,0.035559


In [13]:
def continent_visualization(vis_list):
    for label in vis_list:
        c_df = continent_df[['Continent', label]]
        c_df['Percentage'] = np.round(100*c_df[label]/np.sum(c_df[label]), 2)
        c_df['Virus'] = ['Covid-19' for i in range(len(c_df))]
        
        fig = px.bar(c_df, x = "Virus", y = "Percentage", color = "Continent", hover_data = [label])
        fig.update_layout(title = {"text":f"{label}"})
        fig.show()
        gc.collect()

In [14]:
cases_list = ["Total Cases", "Active Cases", "New Cases", "Serious/Critical", "Total Cases/1M"]
deaths_list = ["Total Deaths", "New Deaths", "Deaths/1M"]
recovered_list = [ "Total Recovered", "New Recovered", "%Inc Recovered"]

continent_visualization(deaths_list)

In [15]:
df = df.drop([len(df)-1])
country_df = df.drop([0])

country_df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Tests/1M,Population,Continent,%Inc Cases,%Inc Deaths,%Inc Recovered
1,USA,106253348,1761.0,1155356,10.0,104031817,18798.0,1066175,1984,317359.0,3451.0,1174666956,3508508,334805269,North America,0.001657,0.000866,0.018069
2,India,44722605,-1.0,530881,-1.0,44173335,-1.0,18389,0,31794.0,377.0,921676372,655236,1406631776,Asia,-0.000002,-0.000188,-0.000002
3,France,39807735,1453.0,165736,58.0,39501424,9937.0,140575,869,606968.0,2527.0,271490188,4139547,65584518,Europe,0.003650,0.034995,0.025156
4,Germany,38359611,4985.0,171059,95.0,38089400,8500.0,99152,0,457296.0,2039.0,122332384,1458359,83883596,Europe,0.012995,0.055536,0.022316
5,Brazil,37258663,-1.0,700239,-1.0,36249161,-1.0,309263,0,173012.0,3252.0,63776166,296146,215353593,South America,-0.000003,-0.000143,-0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,Tuvalu,2805,-1.0,-1,-1.0,-1,-1.0,2805,-1,232471.0,-1.0,-1,-1,12066,Australia/Oceania,-0.035651,100.000000,100.000000
218,Saint Helena,2166,-1.0,-1,-1.0,2,-1.0,2164,-1,354211.0,-1.0,-1,-1,6115,Africa,-0.046168,100.000000,-50.000000
219,Montserrat,1403,-1.0,8,-1.0,1376,-1.0,19,-1,282578.0,1611.0,17762,3577442,4965,North America,-0.071276,-12.500000,-0.072674
220,Niue,797,-1.0,-1,-1.0,796,-1.0,1,-1,491369.0,-1.0,-1,-1,1622,Australia/Oceania,-0.125471,100.000000,-0.125628


In [16]:
LOOK_AT = 5
country = country_df.columns[1:14]

fig = go.Figure()
c = 0
for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14]))
    else:
        break
    c += 1
    
fig.update_layout(title = {"text":f'Top {LOOK_AT} countries affected '}, yaxis_type = "log")
fig.show()