In [299]:
# Standard library
import os
import warnings

# Storing and analysis
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import calmap
import folium


# Hiding warning
warnings.filterwarnings('ignore')

In [300]:
# Reading the CSV file.
# The data directory can be overridden with the COVID_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the previous hard-coded location is used unchanged.
link = os.environ.get('COVID_DATA_DIR', '/Users/ari/Documents/Data Ari/Data Science')
corona = pd.read_csv(os.path.join(link, 'covid_19.csv'), parse_dates=['Date'])
corona.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Thailand,15.0,101.0,2020-01-22,2,0,0
1,,Japan,36.0,138.0,2020-01-22,2,0,0
2,,Singapore,1.2833,103.8333,2020-01-22,0,0,0
3,,Nepal,28.1667,84.25,2020-01-22,0,0,0
4,,Malaysia,2.5,112.5,2020-01-22,0,0,0


## Preprocessing 

In [301]:
# Overview of the covid-19 frame: column names, non-null counts, dtypes and memory usage
corona.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13932 entries, 0 to 13931
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  6426 non-null   object        
 1   Country/Region  13932 non-null  object        
 2   Lat             13932 non-null  float64       
 3   Long            13932 non-null  float64       
 4   Date            13932 non-null  datetime64[ns]
 5   Confirmed       13932 non-null  int64         
 6   Deaths          13932 non-null  int64         
 7   Recovered       13932 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 870.9+ KB


In [302]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
corona.describe()

Unnamed: 0,Lat,Long,Confirmed,Deaths,Recovered
count,13932.0,13932.0,13932.0,13932.0,13932.0
mean,27.608605,6.373762,247.275266,7.865777,91.806632
std,22.736544,81.569208,3213.277628,128.51917,1575.264971
min,-41.4545,-157.4983,0.0,0.0,0.0
25%,15.2,-72.7554,0.0,0.0,0.0
50%,34.42025,11.55575,0.0,0.0,0.0
75%,43.0,78.0,4.0,0.0,0.0
max,64.9631,174.886,67794.0,3085.0,54288.0


In [303]:
# Number of missing values in each column
corona.isna().sum()

Province/State    7506
Country/Region       0
Lat                  0
Long                 0
Date                 0
Confirmed            0
Deaths               0
Recovered            0
dtype: int64

In [304]:
# Active cases = confirmed cases that have neither died nor recovered
resolved = corona['Deaths'] + corona['Recovered']
corona['Active'] = corona['Confirmed'] - resolved
corona.head(3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
0,,Thailand,15.0,101.0,2020-01-22,2,0,0,2
1,,Japan,36.0,138.0,2020-01-22,2,0,0,2
2,,Singapore,1.2833,103.8333,2020-01-22,0,0,0,0


In [305]:
# Replace missing province/state names with an empty string
province = corona['Province/State']
corona['Province/State'] = province.where(province.notna(), '')
corona.head(3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
0,,Thailand,15.0,101.0,2020-01-22,2,0,0,2
1,,Japan,36.0,138.0,2020-01-22,2,0,0,2
2,,Singapore,1.2833,103.8333,2020-01-22,0,0,0,0


In [306]:
# Worldwide totals per date: confirmed, deaths, recovered and active are
# summed over every country/province row sharing the same date
daily_totals = corona.groupby('Date')[['Confirmed','Deaths','Recovered','Active']].agg('sum')
sum_case = daily_totals.reset_index()
sum_case.head(3)

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,2020-01-22,554,17,28,509
1,2020-01-23,652,18,30,604
2,2020-01-24,939,26,36,877


In [307]:
# Latest snapshot: every row reported on the most recent date in the data.
# Series.max() is vectorised and skips missing dates, unlike the builtin max().
latest_date = corona['Date'].max()
current_case = corona[corona['Date'] == latest_date].reset_index(drop=True)
current_case.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
0,,Thailand,15.0,101.0,2020-03-15,114,1,35,78
1,,Japan,36.0,138.0,2020-03-15,839,22,118,699
2,,Singapore,1.2833,103.8333,2020-03-15,226,0,105,121
3,,Nepal,28.1667,84.25,2020-03-15,1,0,1,0
4,,Malaysia,2.5,112.5,2020-03-15,428,0,42,386


In [308]:
# Per-country totals on the latest date (rows sharing a country are summed)
country_totals = current_case.groupby('Country/Region')[['Confirmed','Deaths','Recovered','Active']].agg('sum')
grouping_country = country_totals.reset_index()
grouping_country.head(3)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
0,Afghanistan,16,0,0,16
1,Albania,42,1,0,41
2,Algeria,48,4,12,32


In [309]:
# Countries ranked by confirmed cases, highest first, with a fresh 0..n-1 index
ranked = grouping_country.sort_values(by='Confirmed', ascending=False)
highest_infected = ranked.reset_index(drop=True)
highest_infected.style.background_gradient(cmap='Reds')

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
0,China,81003,3203,67017,10783
1,Italy,24747,1809,2335,20603
2,Iran,13938,724,4590,8624
3,"Korea, South",8162,75,510,7577
4,Spain,7798,289,517,6992
5,Germany,5795,11,46,5738
6,France,4513,91,12,4410
7,US,3498,63,12,3423
8,Switzerland,2200,14,4,2182
9,Norway,1221,3,1,1217


In [310]:
# Rows of grouping_country with at least one confirmed case
has_cases = grouping_country['Confirmed'] != 0

# Countries where every confirmed case has recovered.
# drop=True keeps the old row labels from leaking in as an extra 'index' column.
fully_recovered = grouping_country['Confirmed'] == grouping_country['Recovered']
all_positive_recovered = grouping_country[has_cases & fully_recovered].reset_index(drop=True)
display(all_positive_recovered[['Country/Region','Confirmed','Recovered']].style.background_gradient(cmap='Greens'))

# Countries where every confirmed case has died
all_fatal = grouping_country['Confirmed'] == grouping_country['Deaths']
all_positive_death = grouping_country[has_cases & all_fatal].reset_index(drop=True)
display(all_positive_death[['Country/Region','Confirmed','Deaths']].style.background_gradient(cmap='Reds'))

# Countries present in the data that have no confirmed case yet
not_infected = grouping_country[~has_cases].reset_index(drop=True)
display(not_infected.style.background_gradient(cmap='Greens_r'))

Unnamed: 0,Country/Region,Confirmed,Recovered
0,Andorra,1,1
1,Nepal,1,1


Unnamed: 0,Country/Region,Confirmed,Deaths
0,Sudan,1,1


Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
0,occupied Palestinian territory,0,0,0,0


In [311]:
# Date(s) on which the worldwide confirmed total reaches its maximum
peak_confirmed = sum_case['Confirmed'].max()
at_peak = sum_case['Confirmed'] == peak_confirmed
maximum_case = sum_case.loc[at_peak].reset_index(drop=True)
maximum_case.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,2020-03-15 00:00:00,167448,6440,76034,84974


In [312]:
# Long format of the peak-date totals: one row per case type
# ('variable' holds Active/Deaths/Recovered, 'value' holds the count)
vv = maximum_case.melt(id_vars='Date', value_vars=['Active','Deaths','Recovered'])

# Treemap plot. It is the last expression of the cell, so it is what gets
# rendered; a bare `vv` placed before it would produce no output.
px.treemap(vv, path=['variable'], values='value', height=600, width=800,
           color_discrete_sequence=['#F6D55C','#3CAEAC', '#ED5538'])

## Visualization 

In [313]:
# Cases over time, reshaped to long format: one row per (Date, Case) pair
case_ot = pd.melt(
    sum_case,
    id_vars='Date',
    value_vars=['Recovered','Deaths','Active'],
    var_name='Case',
    value_name='Count',
)

# Stacked area plot with one band per case type and a range slider on the x axis
area_options = dict(
    title='Case Over Time',
    height=600,
    color_discrete_sequence=['#3CAEAC','#ED5538','#F6D55C'],
)
fig = px.area(case_ot, x='Date', y='Count', color='Case', **area_options)
fig.update_layout(xaxis={'rangeslider': {'visible': True}})
fig.show()

### Maps 

In [314]:
# Latest snapshot, one row per country/province (recomputed here so this cell
# does not depend on what earlier cells left in `current_case`)
current_case = corona[corona['Date'] == corona['Date'].max()].reset_index(drop=True)

m = folium.Map(location=[0,0], tiles='cartodbpositron', min_zoom=1, max_zoom=10, zoom_start=1)

# iterrows() yields each row once, instead of one positional lookup per field
for _, row in current_case.iterrows():
    tooltip = ('<li><bold>Country : '+str(row['Country/Region'])+
               '<li><bold>Province : '+str(row['Province/State'])+
               '<li><bold>Confirmed : '+str(row['Confirmed'])+
               '<li><bold>Deaths : '+str(row['Deaths']))
    # NOTE(review): folium.Circle appears to take its radius in metres, so
    # sqrt(confirmed) gives very small circles at world zoom — confirm intended.
    folium.Circle(
        location=[row['Lat'], row['Long']],
        color='crimson',
        fill=True,             # `fill` is an on/off flag; the colour goes in fill_color
        fill_color='crimson',
        tooltip=tooltip,
        radius=int(row['Confirmed'])**0.5,
    ).add_to(m)

m

In [315]:
# Totals per (date, country): rows sharing a country on the same day are summed
by_date_country = corona.groupby(['Date','Country/Region'])
full_grouped = by_date_country[['Confirmed','Deaths','Recovered','Active']].agg('sum').reset_index()
full_grouped.head()

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Antigua and Barbuda,0,0,0,0


In [316]:
# Countries with confirmed reports
# Colours use a log scale. Zero counts are masked to NaN before the log so
# np.log never evaluates log(0) = -inf (and never raises the divide-by-zero
# warning); masked countries simply carry no colour value.
confirmed_log = np.log(full_grouped['Confirmed'].where(full_grouped['Confirmed'] > 0))
fig = px.choropleth(full_grouped, locations='Country/Region', locationmode='country names',
                   color = confirmed_log, hover_name='Country/Region', hover_data=['Confirmed'],
                   animation_frame = full_grouped['Date'].dt.strftime('%Y-%m-%d'),
                   title = 'Countries with confirmed reports over time', color_continuous_scale='Inferno')
fig.update(layout_coloraxis_showscale=False)
fig.show()

In [317]:
# Countries with death reports
# Colours use a log scale. Zero counts are masked to NaN before the log so
# np.log never evaluates log(0) = -inf (and never raises the divide-by-zero
# warning); masked countries simply carry no colour value.
deaths_log = np.log(full_grouped['Deaths'].where(full_grouped['Deaths'] > 0))
figd = px.choropleth(full_grouped, locations='Country/Region', locationmode='country names',
                    color = deaths_log, hover_name='Country/Region', hover_data=['Deaths'],
                    animation_frame = full_grouped['Date'].dt.strftime('%Y-%m-%d'), color_continuous_scale='agsunset',
                    title = 'Countries with death reports over time')
figd.update(layout_coloraxis_showscale=False)
figd.show()

In [318]:
# Countries with recovered reports
# Colours use a log scale. Zero counts are masked to NaN before the log so
# np.log never evaluates log(0) = -inf (and never raises the divide-by-zero
# warning); masked countries simply carry no colour value.
recovered_log = np.log(full_grouped['Recovered'].where(full_grouped['Recovered'] > 0))
# NOTE(review): the name `figd` is kept as-is although this figure shows
# recoveries; it rebinds whatever `figd` held before this cell ran.
figd = px.choropleth(full_grouped, locations='Country/Region', locationmode='country names',
                    color = recovered_log, hover_name='Country/Region', hover_data=['Recovered'],
                    animation_frame = full_grouped['Date'].dt.strftime('%Y-%m-%d'), color_continuous_scale='Greens',
                    title = 'Countries with recovered reports over time')
figd.update(layout_coloraxis_showscale=False)
figd.show()

In [319]:
# Worldwide totals per date
case_date = full_grouped.groupby('Date')[['Confirmed','Deaths','Recovered','Active']].sum().reset_index()

# Number of distinct countries with at least one confirmed case, per date.
# The counts are looked up by date rather than assigned positionally, so a
# date without any infected country gets 0 instead of shifting every later
# value (or failing on a length mismatch).
infected_per_date = (full_grouped[full_grouped['Confirmed'] != 0]
                     .groupby('Date')['Country/Region'].nunique())
case_date['No. of countries'] = case_date['Date'].map(infected_per_date).fillna(0).astype(int)
case_date.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,No. of countries
0,2020-01-22,554,17,28,509,5
1,2020-01-23,652,18,30,604,7
2,2020-01-24,939,26,36,877,8
3,2020-01-25,1432,42,39,1351,10
4,2020-01-26,2113,56,52,2005,12


In [320]:
def _daily_bar(column, colour):
    """Return a bar chart of one `case_date` column over time in a single colour."""
    return px.bar(case_date, x='Date', y=column, color_discrete_sequence=[colour])


# One bar chart per metric
figc = _daily_bar('Confirmed', '#F6D55C')         # Confirmed
figd = _daily_bar('Deaths', '#ED5538')            # Deaths
figr = _daily_bar('Recovered', '#3CAEAC')         # Recovered
fign = _daily_bar('No. of countries', '#333333')  # No. of countries

# 2x2 grid of subplots, titled in row-major order
fig = make_subplots(
    rows=2, cols=2, shared_xaxes=False, horizontal_spacing=.14, vertical_spacing=.12,
    subplot_titles=('Confirmed Cases','Death Reports', 'Recovered Reports','No. of Countries'),
)

# Copy the single trace of each bar chart into its grid position
panels = [(figc, 1, 1), (figd, 1, 2), (figr, 2, 1), (fign, 2, 2)]
for panel, grid_row, grid_col in panels:
    fig.add_trace(panel.data[0], row=grid_row, col=grid_col)

fig.update_layout(height=700)

In [321]:
# Collapse the latest snapshot to one row per country.
# NOTE(review): this rebinds `current_case` from per-province rows to
# per-country totals — later cells see the grouped frame.
current_case = current_case.groupby('Country/Region')[['Confirmed','Deaths','Recovered','Active']].sum().reset_index()


def _top10_bar(column, colour):
    """Return a horizontal bar chart of the 10 countries with the largest `column`."""
    top10 = current_case.sort_values(column).tail(10)
    return px.bar(top10, x=column, y='Country/Region', text=column,
                  orientation='h', color_discrete_sequence=[colour])


# Confirmed - Deaths
figc = _top10_bar('Confirmed', '#F6D55C')
figd = _top10_bar('Deaths', '#ED5538')

# Recovered - Active
figr = _top10_bar('Recovered', '#3CAEAC')
figa = _top10_bar('Active', '#333333')

# 2x2 grid of subplots, titled in row-major order
fig = make_subplots(
    rows=2, cols=2, shared_xaxes=False, horizontal_spacing=.24, vertical_spacing=.08,
    subplot_titles=('Confirmed Case', 'Death Reports', 'Recovered Reports', 'Active Reports'),
)

# Copy the single trace of each bar chart into its grid position
panels = [(figc, 1, 1), (figd, 1, 2), (figr, 2, 1), (figa, 2, 2)]
for panel, grid_row, grid_col in panels:
    fig.add_trace(panel.data[0], row=grid_row, col=grid_col)

fig.update_layout(height=1000)