In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.float_format', '{:.2f}'.format)

## 1. Nhập dữ liệu vào notebook

In [2]:
# path to file
data_folder = "./Data"
os.makedirs(data_folder, exist_ok=True)
# Path to the "grouped_grouped.csv" file in the "Data" folder
file_full_path = os.path.join(data_folder, "raw_full_merged_df.csv")

# read file
raw_df = pd.read_csv(file_full_path)

In [3]:
raw_df.shape

(4287473, 14)

In [4]:
raw_df.sample()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
3649256,21191.0,Pendleton,Kentucky,US,2022-11-10 04:21:16,38.7,-84.36,4080.0,43.0,,,"Pendleton, Kentucky, US",27964.36,1.05


## 2. Drop các cột không dùng
(`Combined_Key`, `Incident_Rate`, `Case_Fatality_Ratio`, `Lat`, `Long_`)

update 27/03/2024: Tạm thời chưa drop

In [5]:
# drop_cols = ['Combined_Key','Incident_Rate','Case_Fatality_Ratio','Lat','Long_']
# country_df = raw_df.copy()
# country_df.drop(columns = drop_cols, inplace = True)
# country_df.shape

In [6]:
country_df = raw_df.copy()
country_df.sample(10)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
2415298,48319.0,Mason,Texas,US,2020-07-31 04:35:18,30.72,-99.23,42.0,0.0,0.0,39.0,"Mason, Texas, US",912.49,0.0
479217,18061.0,Harrison,Indiana,US,2023-02-10 04:20:54,38.19,-86.11,13211.0,165.0,,,"Harrison, Indiana, US",32607.68,1.25
2144959,30035.0,Glacier,Montana,US,2021-07-08 04:21:26,48.71,-112.99,1548.0,44.0,,,"Glacier, Montana, US",11255.73,2.84
3863004,39107.0,Mercer,Ohio,US,2021-11-28 04:22:43,40.54,-84.63,6793.0,102.0,,,"Mercer, Ohio, US",16499.08,1.47
2676139,35047.0,San Miguel,New Mexico,US,2022-08-21 04:20:44,35.48,-104.82,6887.0,73.0,,,"San Miguel, New Mexico, US",25248.38,1.06
3409992,72083.0,Las Marias,Puerto Rico,US,2022-10-21 04:22:23,18.24,-66.98,2502.0,0.0,,,"Las Marias, Puerto Rico, US",31563.01,0.0
481620,54057.0,Mineral,West Virginia,US,2023-02-10 04:20:54,39.42,-78.94,10151.0,167.0,,,"Mineral, West Virginia, US",37781.0,1.65
398204,2050.0,Bethel,Alaska,US,2021-02-04 05:22:26,60.91,-159.86,3347.0,17.0,0.0,3330.0,"Bethel, Alaska, US",18204.07,0.51
3875716,48303.0,Lubbock,Texas,US,2021-11-29 04:22:30,33.61,-101.82,60567.0,1042.0,,,"Lubbock, Texas, US",21670.22,1.55
1967602,39053.0,Gallia,Ohio,US,2021-06-23 04:21:46,38.83,-82.32,2408.0,50.0,,,"Gallia, Ohio, US",8054.05,2.08


## 4. Xử lý date-time

In [7]:
# Tạo một bản sao của cột 'Last_Update' để giữ nguyên giá trị ban đầu (backup cho các dòng ko convert được)
country_df['Last_Update_original'] = country_df['Last_Update'].copy()

try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Last_Update_original
156818,,,Andaman and Nicobar Islands,India,NaT,11.23,92.97,4976.00,62.00,4891.00,23.00,"Andaman and Nicobar Islands, India",1193.18,1.25,2021-01-15 17:22
156819,,,Andhra Pradesh,India,NaT,15.91,79.74,885616.00,7138.00,876140.00,2338.00,"Andhra Pradesh, India",1642.97,0.81,2021-01-15 17:22
156820,,,Arunachal Pradesh,India,NaT,27.77,96.38,16798.00,56.00,16674.00,68.00,"Arunachal Pradesh, India",1069.62,0.33,2021-01-15 17:22
156821,,,Assam,India,NaT,26.36,92.83,216762.00,1065.00,212706.00,2991.00,"Assam, India",608.76,0.49,2021-01-15 17:22
156822,,,Bihar,India,NaT,25.68,85.60,256895.00,1447.00,251278.00,4170.00,"Bihar, India",205.85,0.56,2021-01-15 17:22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,35.75,-86.69,371.00,1.00,0.00,,,,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,44.27,-89.62,282.00,4.00,0.00,,,,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,15.11,-23.62,1.00,0.00,0.00,,,,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,-6.32,143.96,1.00,0.00,0.00,,,,,3/21/2020 23:43


> Lọc ra các dòng có định dạng mm/dd/yy hh:mm

In [8]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{1,2}/\d{1,2}/\d{2}\s\d{1,2}:\d{2}')]
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%y %H:%M')
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%y %H:%M')


Lọc lại các dòng không chuyển được

In [9]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Last_Update_original
156818,,,Andaman and Nicobar Islands,India,NaT,11.23,92.97,4976.00,62.00,4891.00,23.00,"Andaman and Nicobar Islands, India",1193.18,1.25,2021-01-15 17:22
156819,,,Andhra Pradesh,India,NaT,15.91,79.74,885616.00,7138.00,876140.00,2338.00,"Andhra Pradesh, India",1642.97,0.81,2021-01-15 17:22
156820,,,Arunachal Pradesh,India,NaT,27.77,96.38,16798.00,56.00,16674.00,68.00,"Arunachal Pradesh, India",1069.62,0.33,2021-01-15 17:22
156821,,,Assam,India,NaT,26.36,92.83,216762.00,1065.00,212706.00,2991.00,"Assam, India",608.76,0.49,2021-01-15 17:22
156822,,,Bihar,India,NaT,25.68,85.60,256895.00,1447.00,251278.00,4170.00,"Bihar, India",205.85,0.56,2021-01-15 17:22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,35.75,-86.69,371.00,1.00,0.00,,,,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,44.27,-89.62,282.00,4.00,0.00,,,,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,15.11,-23.62,1.00,0.00,0.00,,,,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,-6.32,143.96,1.00,0.00,0.00,,,,,3/21/2020 23:43


> Lọc ra các dòng có định dạng yyyy-mm-dd hh:mm

In [10]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].apply(lambda x: len(str(x)) == 16)]
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')


Lọc lại các dòng không chuyển được

In [11]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Last_Update_original
4277676,,,Anhui,Mainland China,NaT,,,1.00,,,,,,,1/22/2020 17:00
4277677,,,Beijing,Mainland China,NaT,,,14.00,,,,,,,1/22/2020 17:00
4277678,,,Chongqing,Mainland China,NaT,,,6.00,,,,,,,1/22/2020 17:00
4277679,,,Cook Islands,New Zealand,NaT,,,0.00,0.00,0.00,,,,,1/22/2020 17:00
4277680,,,England,United Kingdom,NaT,,,0.00,0.00,0.00,,,,,1/22/2020 17:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,35.75,-86.69,371.00,1.00,0.00,,,,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,44.27,-89.62,282.00,4.00,0.00,,,,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,15.11,-23.62,1.00,0.00,0.00,,,,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,-6.32,143.96,1.00,0.00,0.00,,,,,3/21/2020 23:43


Lọc ra các dòng có định dạng yyyy-mm-dd hh:mm

In [12]:
# filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}')]
# filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')
# country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']

Lọc lại các dòng không chuyển được

In [13]:
# try:
#     country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
# except:
#     print("Can't convert!")

# # Lọc và in ra các dòng không thể chuyển đổi
# invalid_rows = country_df[country_df['Last_Update'].isna()]
# invalid_rows


Lọc ra các dòng có định dạng mm/dd/yyyy hh:mm

In [14]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{1,2}/\d{1,2}/\d{4}\s\d{1,2}:\d{2}')]
#filtered_df
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%Y %H:%M')
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']

Lọc lại các dòng không chuyển được

In [15]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Last_Update_original


> Đã chuyển xong <3

In [16]:
country_df.dtypes

FIPS                           float64
Admin2                          object
Province_State                  object
Country_Region                  object
Last_Update             datetime64[ns]
Lat                            float64
Long_                          float64
Confirmed                      float64
Deaths                         float64
Recovered                      float64
Active                         float64
Combined_Key                    object
Incident_Rate                  float64
Case_Fatality_Ratio            float64
Last_Update_original            object
dtype: object

In [17]:
latest_update = country_df['Last_Update'].max()
print(latest_update)

newest_update = country_df['Last_Update'].min()
print(newest_update)

2023-03-10 04:21:03
2020-01-22 17:00:00


In [18]:
# Lọc ra các dòng có Last_Update sau ngày 1/8/2021
drop_indices = country_df[country_df['Last_Update'] >= '2021-08-01'].index

# Drop các dòng có index trong drop_indices
country_df = country_df.drop(drop_indices)

In [19]:
country_df.shape

(1936167, 15)

In [20]:
#fixing Country names
# ====================

# renaming countries, regions, provinces
country_df['Country_Region'] = country_df['Country_Region'].replace('Korea, South', 'South Korea')
country_df['Country_Region'] = country_df['Country_Region'].replace('Korea, North', 'North Korea')
country_df['Country_Region'] = country_df['Country_Region'].replace('Republic of Korea', 'South Korea')
country_df['Country_Region'] = country_df['Country_Region'].replace('Russian Federation', 'Russia')
country_df['Country_Region'] = country_df['Country_Region'].replace('French Guiana', 'France')
country_df['Country_Region'] = country_df['Country_Region'].replace('Iran (Islamic Republic of)', 'Iran')
country_df['Country_Region'] = country_df['Country_Region'].replace('Taiwan*', 'Taiwan')


# Greenland
country_df.loc[country_df['Province_State']=='Greenland', 'Country_Region'] = 'Greenland'

# Mainland china to China
country_df['Country_Region'] = country_df['Country_Region'].replace('Mainland China', 'China')
country_df['Country_Region'] = country_df['Country_Region'].replace('Macao SAR', 'Macau')
country_df['Country_Region'] = country_df['Country_Region'].replace('Hong Kong SAR', 'Hong Kong')

# Vietnam
country_df['Country_Region'] = country_df['Country_Region'].replace('Viet Nam', 'Vietnam')


## 3. Drop các dòng không convert được:


In [21]:
# Tính tổng số location riêng biệt trong dữ liệu gốc
total_unique_locations = country_df['Country_Region'].nunique()
total_unique_locations

240

In [22]:
values_to_drop = ['Aruba','St. Martin', 'Cruise Ship', 'Diamond Princess', 'MS Zaandam', 'North Ireland',
                  'occupied Palestinian territory', 'Others', 'Summer Olympics 2020',
                  'Taipei and environs', 'Winter Olympics 2022',' Azerbaijan','Antarctica', 'Holy See', 'Kiribati', 'Marshall Islands', 'Micronesia', 'Nauru', 'North Korea', 'Palau', 'Samoa', 'Solomon Islands', 'Tonga', 'Tuvalu', 'West Bank and Gaza']

# Loại bỏ các dòng có giá trị trong cột 'Country_Region' thuộc list values_to_drop
country_df = country_df[~country_df['Country_Region'].isin(values_to_drop)]
country_df.shape

(1927861, 15)

In [23]:
country_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Last_Update_original
0,,,,Afghanistan,2021-01-02 05:22:33,33.94,67.71,52513.0,2201.0,41727.0,8585.0,Afghanistan,134.9,4.19,2021-01-02 05:22:33
1,,,,Albania,2021-01-02 05:22:33,41.15,20.17,58316.0,1181.0,33634.0,23501.0,Albania,2026.41,2.03,2021-01-02 05:22:33
2,,,,Algeria,2021-01-02 05:22:33,28.03,1.66,99897.0,2762.0,67395.0,29740.0,Algeria,227.81,2.76,2021-01-02 05:22:33
3,,,,Andorra,2021-01-02 05:22:33,42.51,1.52,8117.0,84.0,7463.0,570.0,Andorra,10505.4,1.03,2021-01-02 05:22:33
4,,,,Angola,2021-01-02 05:22:33,-11.2,17.87,17568.0,405.0,11146.0,6017.0,Angola,53.45,2.31,2021-01-02 05:22:33


In [24]:
#Xoá cột `Last_Update_original` và chuyển datetime về date
country_df.drop(columns=['Last_Update_original'], inplace=True)
country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update']).dt.date
print(country_df.shape)
country_df.to_csv('raw_with_converted_date.csv')
country_df.dtypes


(1927861, 14)


FIPS                   float64
Admin2                  object
Province_State          object
Country_Region          object
Last_Update             object
Lat                    float64
Long_                  float64
Confirmed              float64
Deaths                 float64
Recovered              float64
Active                 float64
Combined_Key            object
Incident_Rate          float64
Case_Fatality_Ratio    float64
dtype: object

In [25]:
country_df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2021-01-02,33.94,67.71,52513.00,2201.00,41727.00,8585.00,Afghanistan,134.90,4.19
1,,,,Albania,2021-01-02,41.15,20.17,58316.00,1181.00,33634.00,23501.00,Albania,2026.41,2.03
2,,,,Algeria,2021-01-02,28.03,1.66,99897.00,2762.00,67395.00,29740.00,Algeria,227.81,2.76
3,,,,Andorra,2021-01-02,42.51,1.52,8117.00,84.00,7463.00,570.00,Andorra,10505.40,1.03
4,,,,Angola,2021-01-02,-11.20,17.87,17568.00,405.00,11146.00,6017.00,Angola,53.45,2.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,2020-03-21,35.75,-86.69,371.00,1.00,0.00,,,,
4287469,,,Wisconsin,US,2020-03-21,44.27,-89.62,282.00,4.00,0.00,,,,
4287470,,,,Cape Verde,2020-03-21,15.11,-23.62,1.00,0.00,0.00,,,,
4287471,,,,Papua New Guinea,2020-03-21,-6.32,143.96,1.00,0.00,0.00,,,,


In [26]:
country_df.describe()

Unnamed: 0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
count,1586919.0,1878003.0,1878003.0,1927833.0,1927428.0,1443126.0,1434327.0,1660377.0,1677459.0
mean,32307.37,35.91,-72.42,19512.12,466.65,15790.96,7416.68,5069.49,2.65
std,18041.84,12.95,53.33,136718.89,3343.67,130796.39,77882.92,4478.81,35.2
min,66.0,-52.37,-178.12,-302844.0,-178.0,-854405.0,0.0,0.0,0.0
25%,19035.0,33.28,-96.51,158.0,2.0,0.0,57.0,878.68,0.92
50%,30051.0,37.92,-86.89,1042.0,19.0,0.0,468.0,3932.48,1.67
75%,47029.0,42.07,-77.92,4617.0,85.0,0.0,2229.0,8818.31,2.66
max,99999.0,72.0,178.06,6296756.0,138702.0,6399531.0,5567509.0,44906.94,5500.0


In [27]:
# # Active Case = confirmed - deaths - recovered
# country_df['Active'] = country_df['Confirmed'] - country_df['Deaths'] - country_df['Recovered']
# country_df.shape


In [28]:
grouped_df = country_df.groupby(['Country_Region', 'Last_Update'])[['Confirmed','Deaths','Recovered','Active']].sum().reset_index()
grouped_df.shape

(93707, 6)

In [29]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active
0,Afghanistan,2020-02-24,68.00,0.00,0.00,0.00
1,Afghanistan,2020-03-08,16.00,0.00,0.00,0.00
2,Afghanistan,2020-03-10,8.00,0.00,0.00,0.00
3,Afghanistan,2020-03-11,33.00,0.00,0.00,0.00
4,Afghanistan,2020-03-14,14.00,0.00,0.00,0.00
...,...,...,...,...,...,...
93702,Zimbabwe,2021-07-27,99944.00,3173.00,67827.00,28944.00
93703,Zimbabwe,2021-07-28,101711.00,3280.00,70496.00,27935.00
93704,Zimbabwe,2021-07-29,103567.00,3340.00,71383.00,28844.00
93705,Zimbabwe,2021-07-30,105656.00,3421.00,73394.00,28841.00


In [30]:
grouped_df.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active
count,93707.0,93707.0,93707.0,93707.0
mean,401422.61,9598.31,243187.18,113523.48
std,2046189.17,40784.9,1242337.03,882441.67
min,0.0,0.0,0.0,0.0
25%,1710.0,28.0,772.0,201.0
50%,14951.0,265.0,9009.0,2107.0
75%,141936.5,2737.0,92921.0,18280.5
max,35909045.0,632648.0,30781263.0,28568199.0


# WHO Region

https://en.wikipedia.org/wiki/WHO_regions

In [31]:
who_region = {}

# African Region AFRO
afro = "Algeria, Angola, Cabo Verde, Eswatini, Sao Tome and Principe, Benin, South Sudan, Western Sahara, Congo (Brazzaville), Congo (Kinshasa), Cote d'Ivoire, Botswana, Burkina Faso, Burundi, Cameroon, Cape Verde, Central African Republic, Chad, Comoros, Ivory Coast, Democratic Republic of the Congo, Equatorial Guinea, Eritrea, Ethiopia, Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Kenya, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Mauritius, Mozambique, Namibia, Niger, Nigeria, Republic of the Congo, Rwanda, São Tomé and Príncipe, Senegal, Seychelles, Sierra Leone, Somalia, South Africa, Swaziland, Togo, Uganda, Tanzania, Zambia, Zimbabwe"
afro = [i.strip() for i in afro.split(',')]
for i in afro:
    who_region[i] = 'Africa'
    
# Region of the Americas PAHO
paho = 'Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Canada, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, United States, US, Uruguay, Venezuela'
paho = [i.strip() for i in paho.split(',')]
for i in paho:
    who_region[i] = 'Americas'

# South-East Asia Region SEARO
searo = 'Bangladesh, Bhutan, North Korea, India, Indonesia, Maldives, Myanmar, Burma, Nepal, Sri Lanka, Thailand, Timor-Leste'
searo = [i.strip() for i in searo.split(',')]
for i in searo:
    who_region[i] = 'South-East Asia'

# European Region EURO
euro = 'Albania, Andorra, Greenland, Kosovo, Holy See, Liechtenstein, Armenia, Czechia, Austria, Azerbaijan, Belarus, Belgium, Bosnia and Herzegovina, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Luxembourg, Malta, Monaco, Montenegro, Netherlands, North Macedonia, Norway, Poland, Portugal, Moldova, Romania, Russia, San Marino, Serbia, Slovakia, Slovenia, Spain, Sweden, Switzerland, Tajikistan, Turkey, Turkmenistan, Ukraine, United Kingdom, Uzbekistan'
euro = [i.strip() for i in euro.split(',')]
for i in euro:
    who_region[i] = 'Europe'

# Eastern Mediterranean Region EMRO
emro = 'Afghanistan, Bahrain, Djibouti, Egypt, Iran, Iraq, Jordan, Kuwait, Lebanon, Libya, Morocco, Oman, Pakistan, Palestine, West Bank and Gaza, Qatar, Saudi Arabia, Somalia, Sudan, Syria, Tunisia, United Arab Emirates, Yemen'
emro = [i.strip() for i in emro.split(',')]
for i in emro:
    who_region[i] = 'Eastern Mediterranean'

# Western Pacific Region WPRO
wpro = 'Australia, Brunei, Cambodia, China, Cook Islands, Fiji, Japan, Kiribati, Laos, Malaysia, Marshall Islands, Micronesia, Mongolia, Nauru, New Zealand, Niue, Palau, Papua New Guinea, Philippines, South Korea, Samoa, Singapore, Solomon Islands, Taiwan, Taiwan*, Tonga, Tuvalu, Vanuatu, Vietnam'
wpro = [i.strip() for i in wpro.split(',')]
for i in wpro:
    who_region[i] = 'Western Pacific'

In [32]:
# add 'WHO Region' column
grouped_df['WHO Region'] = grouped_df['Country_Region'].map(who_region)

# find missing values
grouped_df[grouped_df['WHO Region'].isna()]['Country_Region'].unique()

array(['Bahamas, The', 'Cayman Islands', 'Channel Islands', 'Curacao',
       'East Timor', 'Faroe Islands', 'Gambia, The', 'Gibraltar',
       'Guadeloupe', 'Guam', 'Guernsey', 'Hong Kong', 'Jersey', 'Macau',
       'Martinique', 'Mayotte', 'Puerto Rico', 'Republic of Ireland',
       'Republic of Moldova', 'Reunion', 'Saint Barthelemy',
       'Saint Martin', 'The Bahamas', 'The Gambia', 'Vatican City'],
      dtype=object)

In [33]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region
0,Afghanistan,2020-02-24,68.00,0.00,0.00,0.00,Eastern Mediterranean
1,Afghanistan,2020-03-08,16.00,0.00,0.00,0.00,Eastern Mediterranean
2,Afghanistan,2020-03-10,8.00,0.00,0.00,0.00,Eastern Mediterranean
3,Afghanistan,2020-03-11,33.00,0.00,0.00,0.00,Eastern Mediterranean
4,Afghanistan,2020-03-14,14.00,0.00,0.00,0.00,Eastern Mediterranean
...,...,...,...,...,...,...,...
93702,Zimbabwe,2021-07-27,99944.00,3173.00,67827.00,28944.00,Africa
93703,Zimbabwe,2021-07-28,101711.00,3280.00,70496.00,27935.00,Africa
93704,Zimbabwe,2021-07-29,103567.00,3340.00,71383.00,28844.00,Africa
93705,Zimbabwe,2021-07-30,105656.00,3421.00,73394.00,28841.00,Africa


## 5. Tính new case...

In [34]:
grouped_df['Last_Update'] = pd.to_datetime(grouped_df['Last_Update'])
# Tạo một cột mới cho 'Previous Day'
grouped_df['Previous Day'] = grouped_df.groupby(['Country_Region'])['Last_Update'].shift(1)
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,Previous Day
0,Afghanistan,2020-02-24,68.00,0.00,0.00,0.00,Eastern Mediterranean,NaT
1,Afghanistan,2020-03-08,16.00,0.00,0.00,0.00,Eastern Mediterranean,2020-02-24
2,Afghanistan,2020-03-10,8.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-08
3,Afghanistan,2020-03-11,33.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-10
4,Afghanistan,2020-03-14,14.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-11
...,...,...,...,...,...,...,...,...
93702,Zimbabwe,2021-07-27,99944.00,3173.00,67827.00,28944.00,Africa,2021-07-26
93703,Zimbabwe,2021-07-28,101711.00,3280.00,70496.00,27935.00,Africa,2021-07-27
93704,Zimbabwe,2021-07-29,103567.00,3340.00,71383.00,28844.00,Africa,2021-07-28
93705,Zimbabwe,2021-07-30,105656.00,3421.00,73394.00,28841.00,Africa,2021-07-29


In [35]:
# Tính toán New cases, New deaths và New recovered cho mỗi quốc gia
grouped_df['New cases'] = grouped_df.groupby('Country_Region')['Confirmed'].diff()
grouped_df['New deaths'] = grouped_df.groupby('Country_Region')['Deaths'].diff()
grouped_df['New recovered'] = grouped_df.groupby('Country_Region')['Recovered'].diff()

# Điền giá trị rỗng của hàng đầu tiên bằng giá trị của ngày hiện tại
grouped_df['New cases'].fillna(grouped_df['Confirmed'], inplace=True)
grouped_df['New deaths'].fillna(grouped_df['Deaths'], inplace=True)
grouped_df['New recovered'].fillna(grouped_df['Recovered'], inplace=True)

In [36]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,Previous Day,New cases,New deaths,New recovered
0,Afghanistan,2020-02-24,68.00,0.00,0.00,0.00,Eastern Mediterranean,NaT,68.00,0.00,0.00
1,Afghanistan,2020-03-08,16.00,0.00,0.00,0.00,Eastern Mediterranean,2020-02-24,-52.00,0.00,0.00
2,Afghanistan,2020-03-10,8.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-08,-8.00,0.00,0.00
3,Afghanistan,2020-03-11,33.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-10,25.00,0.00,0.00
4,Afghanistan,2020-03-14,14.00,0.00,0.00,0.00,Eastern Mediterranean,2020-03-11,-19.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
93702,Zimbabwe,2021-07-27,99944.00,3173.00,67827.00,28944.00,Africa,2021-07-26,2050.00,79.00,1914.00
93703,Zimbabwe,2021-07-28,101711.00,3280.00,70496.00,27935.00,Africa,2021-07-27,1767.00,107.00,2669.00
93704,Zimbabwe,2021-07-29,103567.00,3340.00,71383.00,28844.00,Africa,2021-07-28,1856.00,60.00,887.00
93705,Zimbabwe,2021-07-30,105656.00,3421.00,73394.00,28841.00,Africa,2021-07-29,2089.00,81.00,2011.00


In [37]:
grouped_df.describe()

Unnamed: 0,Last_Update,Confirmed,Deaths,Recovered,Active,Previous Day,New cases,New deaths,New recovered
count,93707,93707.0,93707.0,93707.0,93707.0,93492,93707.0,93707.0,93707.0
mean,2020-11-19 11:09:10.409254400,401422.61,9598.31,243187.18,113523.48,2020-11-19 01:02:56.973431040,2108.25,45.18,1377.58
min,2020-01-22 00:00:00,0.0,0.0,0.0,0.0,2020-01-22 00:00:00,-21887422.0,-200626.0,-21360928.0
25%,2020-07-16 00:00:00,1710.0,28.0,772.0,201.0,2020-07-16 00:00:00,3.0,0.0,0.0
50%,2020-11-20 00:00:00,14951.0,265.0,9009.0,2107.0,2020-11-20 00:00:00,77.0,1.0,24.0
75%,2021-03-27 00:00:00,141936.5,2737.0,92921.0,18280.5,2021-03-26 00:00:00,732.0,12.0,425.0
max,2021-07-31 00:00:00,35909045.0,632648.0,30781263.0,28568199.0,2021-07-30 00:00:00,21920743.0,200902.0,21392193.0
std,,2046189.17,40784.9,1242337.03,882441.67,,104198.08,1045.29,101503.4


Kiểm tra các giá trị âm và fix

In [38]:
neg_cases = grouped_df[grouped_df['New cases'] < 0]
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
print(neg_cases.shape)
print(neg_deaths.shape)
print(neg_recovered.shape)

(500, 11)
(194, 11)
(356, 11)


Lặp 6 lần để fix - update - fix 

In [39]:
i=0
while (i<6):
    # CASES
    for index, row in neg_cases.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Confirmed'] = grouped_df.loc[mask, 'Confirmed'] + row['New cases']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New cases']= row['Confirmed'] - grouped_df.loc[mask, 'Confirmed']
        grouped_df['New cases'] = grouped_df.groupby('Country_Region')['Confirmed'].diff()
        grouped_df['New cases'].fillna(grouped_df['Confirmed'], inplace=True)
    neg_cases = grouped_df[grouped_df['New cases'] < 0]

    # DEATHS
    for index, row in neg_deaths.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Deaths'] = grouped_df.loc[mask, 'Deaths'] + row['New deaths']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New deaths']=row['Deaths'] - grouped_df.loc[mask, 'Deaths']
        grouped_df['New deaths'] = grouped_df.groupby('Country_Region')['Deaths'].diff()
        grouped_df['New deaths'].fillna(grouped_df['Deaths'], inplace=True)
    neg_deaths = grouped_df[grouped_df['New deaths'] < 0]

    # RECOVERED
    for index, row in neg_recovered.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Recovered'] = grouped_df.loc[mask, 'Recovered'] + row['New recovered']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New recovered']= row['Recovered'] - grouped_df.loc[mask, 'Recovered']
        grouped_df['New recovered'] = grouped_df.groupby('Country_Region')['Recovered'].diff()
        grouped_df['New recovered'].fillna(grouped_df['Recovered'], inplace=True)
    neg_recovered = grouped_df[grouped_df['New recovered'] < 0]

    i+=1


In [40]:
neg_cases = grouped_df[grouped_df['New cases'] < 0]
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
print(neg_cases.shape)
print(neg_deaths.shape)
print(neg_recovered.shape)

(47, 11)
(22, 11)
(43, 11)


Số dòng âm rất nhỏ so với số dòng dữ liệu => drop

In [41]:
grouped_df.drop(neg_cases.index, inplace = True)
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
grouped_df.drop(neg_deaths.index, inplace = True)
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
grouped_df.drop(neg_recovered.index, inplace = True)


In [42]:
# Active Case = confirmed - deaths - recovered
grouped_df['Active'] = grouped_df['Confirmed'] - grouped_df['Deaths'] - grouped_df['Recovered']

In [43]:
grouped_df.loc[(grouped_df['Active'] < 0),'Active']=grouped_df.loc[(grouped_df['Active'] < 0),'New cases']
grouped_df.drop(columns= "Previous Day", inplace = True)
grouped_df['Last_Update'] = pd.to_datetime(grouped_df['Last_Update']).dt.date


In [44]:
grouped_df.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered
count,93615.0,93615.0,93615.0,93615.0,93615.0,93615.0,93615.0
mean,399437.53,9561.83,242651.63,147447.43,2153.53,46.22,1452.56
std,2029144.3,40546.65,1239100.53,1491151.73,14682.68,298.8,10158.6
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1711.0,28.0,772.0,212.0,3.0,0.0,0.0
50%,14933.0,264.0,8989.0,2188.0,76.0,1.0,24.0
75%,141910.0,2730.5,92847.5,17982.5,724.5,11.0,422.5
max,35062519.0,609300.0,30781263.0,34453219.0,2349350.0,45576.0,1123456.0


# Country wise latest

In [45]:
# Country wise
# ============
full_grouped = grouped_df.copy()

full_grouped['Last_Update'] = pd.to_datetime(full_grouped['Last_Update'])

# getting latest values
country_wise = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])] \
                    .reset_index(drop=True) \
                    .drop('Last_Update', axis=1)

print(country_wise.shape)

# group by country
country_wise = country_wise.groupby('Country_Region')[['Confirmed', 'Deaths',
                                                      'Recovered', 'Active',
                                                      'New cases', 'New deaths', 'New recovered']].sum().reset_index()
print(country_wise.shape)


# per 100 cases
country_wise['Deaths / 100 Cases'] = round((country_wise['Deaths']/country_wise['Confirmed'])*100, 2)
country_wise['Recovered / 100 Cases'] = round((country_wise['Recovered']/country_wise['Confirmed'])*100, 2)
country_wise['Deaths / 100 Recovered'] = round((country_wise['Deaths']/country_wise['Recovered'])*100, 2)

cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
country_wise[cols] = country_wise[cols].fillna(0)

# Tính Deaths / 100 Recovered
country_wise['Deaths / 100 Recovered'] = round((country_wise['Deaths'] / country_wise['Recovered']) * 100, 2)

# Chuyển những giá trị NaN trong Deaths / 100 Recovered thành 0
country_wise['Deaths / 100 Recovered'] = country_wise['Deaths / 100 Recovered'].fillna(0)

# Kiểm tra và gán giá trị 1 cho các trường hợp Recovered bằng 0
country_wise.loc[country_wise['Recovered'] == 0, 'Deaths / 100 Recovered'] = 1

# 1 week increase and % change
# ============================

today = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])] \
            .reset_index(drop=True) \
            .drop('Last_Update', axis=1)[['Country_Region', 'Confirmed']]

last_week = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])-timedelta(days=7)] \
                .reset_index(drop=True) \
                .drop('Last_Update', axis=1)[['Country_Region', 'Confirmed']]

temp = pd.merge(today, last_week, on='Country_Region', suffixes=(' today', ' last week'))
temp['1 week change'] = temp['Confirmed today'] - temp['Confirmed last week']
temp = temp[['Country_Region', 'Confirmed last week', '1 week change']]

country_wise = pd.merge(country_wise, temp, on='Country_Region')
country_wise['1 week % increase'] = round(country_wise['1 week change']/country_wise['Confirmed last week']*100, 2)
country_wise.head()

country_wise['WHO Region'] = country_wise['Country_Region'].map(who_region)
country_wise[country_wise['WHO Region'].isna()]['Country_Region'].unique()

country_wise.head()

(185, 9)
(185, 8)


Unnamed: 0,Country_Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,146523.0,6677.0,82586.0,57260.0,527.0,62.0,0.0,4.56,56.36,8.08,143439.0,3084.0,2.15,Eastern Mediterranean
1,Albania,133036.0,2457.0,130205.0,374.0,37.0,0.0,18.0,1.85,97.87,1.89,132828.0,208.0,0.16,Europe
2,Algeria,170189.0,4219.0,114531.0,51439.0,1521.0,30.0,824.0,2.48,67.3,3.68,159563.0,10626.0,6.66,Africa
3,Andorra,14678.0,128.0,14210.0,340.0,23.0,1.0,30.0,0.87,96.81,0.9,14498.0,180.0,1.24,Europe
4,Angola,42646.0,1008.0,36708.0,4930.0,160.0,5.0,683.0,2.36,86.08,2.75,41629.0,1017.0,2.44,Africa


In [46]:
country_wise[country_wise['Country_Region']=='US']

Unnamed: 0,Country_Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
172,US,35062519.0,609300.0,0.0,34453219.0,191156.0,885.0,0.0,1.74,0.0,1.0,34523699.0,538820.0,1.56,Americas


In [47]:
country_pop = {'Afghanistan':39009447,'Albania':2877470,'Algeria':43926079,'Andorra':77278,'Angola':32956300,'Antigua and Barbuda':98010,'Argentina':45236884,
'Armenia':2963811,'Australia':25528864,'Austria':9011577,'Azerbaijan':10148243,'Bahamas':393616,'Bahrain':1706669,'Bangladesh':164851401,'Barbados':287411,
'Belarus':9449001,'Belgium':11594739,'Belize':398312,'Benin':12151976,'Bhutan':772443,'Bolivia':11688459,'Bosnia and Herzegovina':3278650,'Botswana':2356075,
'Brazil':212710692,'Brunei':437893,'Bulgaria':6942854,'Burkina Faso':20954852,'Burma':54446389,'Burundi':11922216,'Cabo Verde':556581,'Cambodia':16741375,
'Cameroon':26606188,'Canada':37775022,'Central African Republic':4837752,'Chad':16467965,'Chile':19132514,'China':1425887337,'Colombia':50936262,'Comoros':871326,
'Congo (Brazzaville)':5530506,'Congo (Kinshasa)':5530506,'Costa Rica':5098730,"Cote d'Ivoire":28160542,'Croatia':4102577,'Cuba':11325899,'Cyprus':1208238,
'Czechia':10711019,'Denmark':5794279,'Djibouti':989387,'Dominica':72004,'Dominican Republic':10858648,'Ecuador':17668824,'Egypt':102516525,'El Salvador':6489514,
'Equatorial Guinea':1407001,'Eritrea':3551175,'Estonia':1326627,'Eswatini':1161348,'Ethiopia':115223736,'Fiji':897095,'Finland':5541604,'France':65288306,
'Gabon':2230563,'Gambia':2422754,'Georgia':3988368,'Germany':83811260,'Ghana':31133483,'Greece':10417673,'Greenland':56780,'Grenada':112576,'Guatemala':17946899,
'Guinea':13164905,'Guinea-Bissau':1972277,'Guyana':786936,'Haiti':11416103,'Honduras':9919704,'Hungary':9657785,'Iceland':341465,'India':1381344997,'Indonesia':273808365,
'Iran':84097623,'Iraq':40306025,'Ireland':4943200,'Israel':9197590,'Italy':60452568,'Jamaica':2962478,'Japan':126435859,'Jordan':10213138,'Kazakhstan':18798667,
'Kenya':53881160,'Kosovo':1771315,'Kuwait':4276658,'Kyrgyzstan':6534479,'Laos':7285750,'Latvia':1883936,'Lebanon':6822220,'Lesotho':2143943,'Liberia':5068618,
'Libya':6880353,'Liechtenstein':38139,'Lithuania':2718121,'Luxembourg':626952,'Madagascar':27755708,'Malawi':19174839,'Malaysia':32406372,'Maldives':541448,
'Mali':20302901,'Malta':441663,'Mauritania':4660728,'Mauritius':1271985,'Mexico':129066160,'Moldova':4032983,'Monaco':39270,'Mongolia':3283344,'Montenegro':628074,
'Morocco':36953359,'Mozambique':31333962,'Namibia':2545264,'Nepal':29186486,'Netherlands':17138756,'New Zealand':5002100,'Nicaragua':6632263,'Niger':24281433,
'Nigeria':206606300,'North Macedonia':2083365,'Norway':5425471,'Oman':5118446,'Pakistan':221295851,'Panama':4321282,'Papua New Guinea':8963009,'Paraguay':7141091,
'Peru':33016319,'Philippines':109722719,'Poland':37842302,'Portugal':10193593,'Qatar':2807805,'Romania':19224023,'Russia':145940924,'Rwanda':12981546,
'Saint Kitts and Nevis':53237,'Saint Lucia':183712,'Saint Vincent and the Grenadines':110976,'San Marino':33938,'Sao Tome and Principe':219544,'Saudi Arabia':34865919,
'Senegal':16783877,'Serbia':8733665,'Seychelles':98408,'Sierra Leone':7992169,'Singapore':5854932,'Slovakia':5459915,'Slovenia':2078968,'Somalia':15933012,
'South Africa':59381566,'South Korea':51273732,'South Sudan':11206572,'Spain':46756648,'Sri Lanka':21422362,'Sudan':43943536,'Suriname':587154,'Sweden':10105596,
'Switzerland':8660952,'Syria':17539600,'Taiwan':23821199,'Tajikistan':9557468,'Tanzania':59886383,'Thailand':69817894,'Timor-Leste':1320812,'Togo':8296582,
'Trinidad and Tobago':1399950,'Tunisia':11830801,'Turkey':84428331,'US':331198130,'Uganda':45867852,'Ukraine':43705858,'United Arab Emirates':9902079,'United Kingdom':67922029,
'Uruguay':3474956,'Uzbekistan':33516027,'Vanuatu':326740,'Venezuela':28427499,'Vietnam':97425470,'Yemen':29886897,'Zambia':18430129,'Zimbabwe':14883803
}

In [48]:
country_wise['Population'] = country_wise['Country_Region'].map(country_pop)
country_wise

Unnamed: 0,Country_Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region,Population
0,Afghanistan,146523.00,6677.00,82586.00,57260.00,527.00,62.00,0.00,4.56,56.36,8.08,143439.00,3084.00,2.15,Eastern Mediterranean,39009447
1,Albania,133036.00,2457.00,130205.00,374.00,37.00,0.00,18.00,1.85,97.87,1.89,132828.00,208.00,0.16,Europe,2877470
2,Algeria,170189.00,4219.00,114531.00,51439.00,1521.00,30.00,824.00,2.48,67.30,3.68,159563.00,10626.00,6.66,Africa,43926079
3,Andorra,14678.00,128.00,14210.00,340.00,23.00,1.00,30.00,0.87,96.81,0.90,14498.00,180.00,1.24,Europe,77278
4,Angola,42646.00,1008.00,36708.00,4930.00,160.00,5.00,683.00,2.36,86.08,2.75,41629.00,1017.00,2.44,Africa,32956300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,Venezuela,304726.00,3576.00,288864.00,12286.00,929.00,18.00,1144.00,1.17,94.79,1.24,296863.00,7863.00,2.65,Americas,28427499
181,Vietnam,141122.00,1161.00,35484.00,104477.00,7717.00,139.00,3704.00,0.82,25.14,3.27,86957.00,54165.00,62.29,Western Pacific,97425470
182,Yemen,7058.00,1375.00,4171.00,1512.00,16.00,1.00,0.00,19.48,59.10,32.97,7003.00,55.00,0.79,Eastern Mediterranean,29886897
183,Zambia,195096.00,3376.00,186211.00,5509.00,956.00,21.00,1001.00,1.73,95.45,1.81,190615.00,4481.00,2.35,Africa,18430129


In [49]:
# save as .csv file
country_wise.to_csv('./Data/country_wise_latest_2021.csv', index=False)

In [50]:
grouped_df['Population'] = grouped_df['Country_Region'].map(country_pop)
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,New cases,New deaths,New recovered,Population
0,Afghanistan,2020-02-24,8.00,0.00,0.00,8.00,Eastern Mediterranean,8.00,0.00,0.00,39009447.00
1,Afghanistan,2020-03-08,8.00,0.00,0.00,8.00,Eastern Mediterranean,0.00,0.00,0.00,39009447.00
2,Afghanistan,2020-03-10,8.00,0.00,0.00,8.00,Eastern Mediterranean,0.00,0.00,0.00,39009447.00
3,Afghanistan,2020-03-11,14.00,0.00,0.00,14.00,Eastern Mediterranean,6.00,0.00,0.00,39009447.00
4,Afghanistan,2020-03-14,14.00,0.00,0.00,14.00,Eastern Mediterranean,0.00,0.00,0.00,39009447.00
...,...,...,...,...,...,...,...,...,...,...,...
93702,Zimbabwe,2021-07-27,99944.00,3173.00,67827.00,28944.00,Africa,2050.00,79.00,1914.00,14883803.00
93703,Zimbabwe,2021-07-28,101711.00,3280.00,70496.00,27935.00,Africa,1767.00,107.00,2669.00,14883803.00
93704,Zimbabwe,2021-07-29,103567.00,3340.00,71383.00,28844.00,Africa,1856.00,60.00,887.00,14883803.00
93705,Zimbabwe,2021-07-30,105656.00,3421.00,73394.00,28841.00,Africa,2089.00,81.00,2011.00,14883803.00


Xuất fulldata

In [51]:
grouped_df.to_csv('./Data/full_data_final.csv', index = False)

In [52]:
grouped_df.shape

(93615, 11)

In [53]:
# # Tạo một DataFrame tạm thời để lưu trữ các dòng cuối cùng có giá trị Recovered khác 0 cho mỗi 'Country_Region'
# last_non_zero_recovered = pd.DataFrame()

# # Duyệt qua từng nhóm 'Country_Region'
# for name, group in grouped_df.groupby('Country_Region'):
#     # Lọc ra các dòng có giá trị khác 0 trong cột 'Recovered' và lấy dòng cuối cùng
#     last_non_zero_recovered_row = group[group['Recovered'] != 0].iloc[-1:]
#     # Thêm dòng này vào DataFrame tạm thời
#     last_non_zero_recovered = pd.concat([last_non_zero_recovered, last_non_zero_recovered_row])
# last_non_zero_recovered.sort_values(by = 'Last_Update').to_csv('last_recovered.csv')

In [54]:
# Nhóm theo cột 'Country_Region' và đếm số lượng dòng cho mỗi nhóm
num_records = grouped_df.groupby('Country_Region').size().reset_index(name='Num_of_record')

# Tìm ngày mới nhất trong cột 'Last_Update' cho mỗi nhóm
latest_update = grouped_df.groupby('Country_Region')['Last_Update'].max().reset_index()

# Kết hợp kết quả vào một DataFrame mới
result_df = pd.merge(num_records, latest_update, on='Country_Region')

# Đổi tên cột 'Last_Update' thành 'Latest_Update'
result_df.rename(columns={'Last_Update': 'Latest_Update'}, inplace=True)

result_df.sort_values(by = 'Num_of_record', inplace=True)
# Lọc các dòng trong DataFrame có Num_of_record ít hơn 100
#filtered_df = result_df.loc[result_df['Num_of_record'] < 100, ['Country_Region', 'Num_of_record', 'Latest_Update']]
filtered_df = result_df.loc[result_df['Num_of_record'] < 300, ['Country_Region', 'Num_of_record', 'Latest_Update']]
filtered_df



Unnamed: 0,Country_Region,Num_of_record,Latest_Update
34,Cape Verde,1,2020-03-21
102,Jersey,1,2020-03-14
76,Gibraltar,1,2020-03-04
194,The Bahamas,1,2020-03-16
164,Saint Barthelemy,1,2020-03-04
155,Puerto Rico,1,2020-03-16
57,East Timor,1,2020-03-21
195,The Gambia,1,2020-03-17
157,Republic of Ireland,1,2020-03-08
72,"Gambia, The",1,2020-03-18


drop các nước có records <100

In [55]:
values_to_drop = ['Cape Verde', 'Jersey', 'Gibraltar', 'The Bahamas', 'Saint Barthelemy', 'Puerto Rico', 'East Timor', 'The Gambia', 'Republic of Ireland', 'Gambia, The', 'Saint Martin', 'Bahamas, The', 'Ivory Coast', 'Channel Islands', 'Guernsey', 'Guam', 'Vatican City', 'Curacao', 'Republic of Moldova', 'Faroe Islands', 'Republic of the Congo', 'Cayman Islands', 'Palestine', 'Mayotte', 'Martinique', 'Reunion', 'Guadeloupe', 'Czech Republic', 'Macau', 'Hong Kong']

# Loại bỏ các dòng có giá trị trong cột 'Country_Region' thuộc list values_to_drop
grouped_df = grouped_df[~grouped_df['Country_Region'].isin(values_to_drop)]
grouped_df.shape

(93489, 11)

Xuất fulldata

In [56]:
grouped_df.to_csv('./Data/full_data_final.csv', index = False)

In [57]:
grouped_df.shape

(93489, 11)

In [58]:
# Tính tổng số location riêng biệt trong dữ liệu gốc
total_unique_locations = grouped_df['Country_Region'].nunique()
total_unique_locations

185

In [59]:
grouped_df.sample()

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,New cases,New deaths,New recovered,Population
24200,Denmark,2021-04-18,242393.0,2453.0,231298.0,8642.0,Europe,724.0,0.0,554.0,5794279.0


In [60]:
latest_update = grouped_df['Last_Update'].max()
print(latest_update)

newest_update = grouped_df['Last_Update'].min()
print(newest_update)

2021-07-31
2020-01-22
