In [55]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.float_format', '{:.2f}'.format)

## 1. Nhập dữ liệu vào notebook

In [56]:
# path to file
data_folder = "./Data"
os.makedirs(data_folder, exist_ok=True)
# Path to the "grouped_grouped.csv" file in the "Data" folder
file_full_path = os.path.join(data_folder, "raw_full_merged_df.csv")

# read file
raw_df = pd.read_csv(file_full_path)

In [57]:
raw_df.shape

(4287473, 14)

In [58]:
raw_df.sample()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
1231086,21075.0,Fulton,Kentucky,US,2021-04-20 04:20:41,36.55,-89.2,505.0,14.0,,,"Fulton, Kentucky, US",8460.38,2.77


## 2. Drop các cột không dùng
(`Combined_Key`, `Incident_Rate`, `Case_Fatality_Ratio`, `Lat`, `Long_`)

In [59]:
drop_cols = ['Combined_Key','Incident_Rate','Case_Fatality_Ratio','Lat','Long_']
country_df = raw_df.copy()
country_df.drop(columns = drop_cols, inplace = True)
country_df.shape

(4287473, 9)

In [60]:
country_df.sample(10)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active
2818818,12023.0,Columbia,Florida,US,2022-09-02 04:21:07,23982.0,428.0,,
2362108,1011.0,Bullock,Alabama,US,2022-07-26 04:20:58,2605.0,54.0,,
1984231,48439.0,Tarrant,Texas,US,2022-06-24 04:20:58,581989.0,5953.0,,
2661463,55109.0,St. Croix,Wisconsin,US,2021-08-20 04:21:33,10235.0,72.0,,
1629297,,,,Kuwait,2022-05-25 04:20:56,632781.0,2555.0,,
1589160,5107.0,Phillips,Arkansas,US,2020-05-22 02:36:51,6.0,1.0,0.0,5.0
2902706,2240.0,Southeast Fairbanks,Alaska,US,2022-09-09 04:21:11,2337.0,20.0,,
2606758,16053.0,Jerome,Idaho,US,2020-08-16 04:27:42,521.0,6.0,0.0,515.0
2508550,72107.0,Orocovis,Puerto Rico,US,2022-08-07 04:20:54,5125.0,0.0,,
3049453,47047.0,Fayette,Tennessee,US,2022-09-21 04:23:22,13034.0,196.0,,


## 3. Xử lý date-time

In [61]:
# Tạo một bản sao của cột 'Last_Update' để giữ nguyên giá trị ban đầu (backup cho các dòng ko convert được)
country_df['Last_Update_original'] = country_df['Last_Update'].copy()

try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
156818,,,Andaman and Nicobar Islands,India,NaT,4976.00,62.00,4891.00,23.00,2021-01-15 17:22
156819,,,Andhra Pradesh,India,NaT,885616.00,7138.00,876140.00,2338.00,2021-01-15 17:22
156820,,,Arunachal Pradesh,India,NaT,16798.00,56.00,16674.00,68.00,2021-01-15 17:22
156821,,,Assam,India,NaT,216762.00,1065.00,212706.00,2991.00,2021-01-15 17:22
156822,,,Bihar,India,NaT,256895.00,1447.00,251278.00,4170.00,2021-01-15 17:22
...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,371.00,1.00,0.00,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,282.00,4.00,0.00,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,1.00,0.00,0.00,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,1.00,0.00,0.00,,3/21/2020 23:43


Lọc ra các dòng có định dạng mm/dd/yy hh:mm

In [62]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{1,2}/\d{1,2}/\d{2}\s\d{1,2}:\d{2}')]
filtered_df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
915452,36061.00,New York City,New York,US,NaT,9654.00,63.00,0.00,9591.00,3/22/20 23:45
915453,36059.00,Nassau,New York,US,NaT,1900.00,4.00,0.00,1896.00,3/22/20 23:45
915454,36119.00,Westchester,New York,US,NaT,1873.00,0.00,0.00,1873.00,3/22/20 23:45
915455,36103.00,Suffolk,New York,US,NaT,1034.00,9.00,0.00,1025.00,3/22/20 23:45
915456,36087.00,Rockland,New York,US,NaT,455.00,1.00,0.00,454.00,3/22/20 23:45
...,...,...,...,...,...,...,...,...,...,...
1085697,82604.00,,Wales,United Kingdom,NaT,0.00,430.00,0.00,0.00,4/6/20 23:22
1085698,,,,Nauru,NaT,0.00,0.00,0.00,0.00,4/6/20 23:22
1085699,,,Niue,New Zealand,NaT,0.00,0.00,0.00,0.00,4/6/20 23:22
1085700,,,,Tuvalu,NaT,0.00,0.00,0.00,0.00,4/6/20 23:22


Chuyển sang datetime 

In [63]:
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%y %H:%M')
#filtered_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%y %H:%M')


Gán lại vào country_df

In [64]:
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']


Lọc lại các dòng không chuyển được

In [65]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
156818,,,Andaman and Nicobar Islands,India,NaT,4976.00,62.00,4891.00,23.00,2021-01-15 17:22
156819,,,Andhra Pradesh,India,NaT,885616.00,7138.00,876140.00,2338.00,2021-01-15 17:22
156820,,,Arunachal Pradesh,India,NaT,16798.00,56.00,16674.00,68.00,2021-01-15 17:22
156821,,,Assam,India,NaT,216762.00,1065.00,212706.00,2991.00,2021-01-15 17:22
156822,,,Bihar,India,NaT,256895.00,1447.00,251278.00,4170.00,2021-01-15 17:22
...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,371.00,1.00,0.00,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,282.00,4.00,0.00,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,1.00,0.00,0.00,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,1.00,0.00,0.00,,3/21/2020 23:43


Lọc ra các dòng có định dạng yyyy-mm-dd hh:mm

In [66]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].apply(lambda x: len(str(x)) == 16)]
#filtered_df

Chuyển sang datetime 

In [67]:
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')
#filtered_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')


Gán lại vào country_df

In [68]:
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']


Lọc lại các dòng không chuyển được

In [69]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
4277676,,,Anhui,Mainland China,NaT,1.00,,,,1/22/2020 17:00
4277677,,,Beijing,Mainland China,NaT,14.00,,,,1/22/2020 17:00
4277678,,,Chongqing,Mainland China,NaT,6.00,,,,1/22/2020 17:00
4277679,,,Cook Islands,New Zealand,NaT,0.00,0.00,0.00,,1/22/2020 17:00
4277680,,,England,United Kingdom,NaT,0.00,0.00,0.00,,1/22/2020 17:00
...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,371.00,1.00,0.00,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,282.00,4.00,0.00,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,1.00,0.00,0.00,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,1.00,0.00,0.00,,3/21/2020 23:43


Lọc ra các dòng có định dạng yyyy-mm-dd hh:mm

In [70]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}')]
filtered_df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original


Chuyển sang datetime 

In [71]:
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%Y-%m-%d %H:%M')
#filtered_df


Gán lại vào country_df

In [72]:
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']


Lọc lại các dòng không chuyển được

In [73]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
4277676,,,Anhui,Mainland China,NaT,1.00,,,,1/22/2020 17:00
4277677,,,Beijing,Mainland China,NaT,14.00,,,,1/22/2020 17:00
4277678,,,Chongqing,Mainland China,NaT,6.00,,,,1/22/2020 17:00
4277679,,,Cook Islands,New Zealand,NaT,0.00,0.00,0.00,,1/22/2020 17:00
4277680,,,England,United Kingdom,NaT,0.00,0.00,0.00,,1/22/2020 17:00
...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,371.00,1.00,0.00,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,282.00,4.00,0.00,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,1.00,0.00,0.00,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,1.00,0.00,0.00,,3/21/2020 23:43


Lọc ra các dòng có định dạng mm/dd/yyyy hh:mm

In [74]:
filtered_df = invalid_rows[invalid_rows['Last_Update_original'].str.contains(r'\d{1,2}/\d{1,2}/\d{4}\s\d{1,2}:\d{2}')]
filtered_df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original
4277676,,,Anhui,Mainland China,NaT,1.00,,,,1/22/2020 17:00
4277677,,,Beijing,Mainland China,NaT,14.00,,,,1/22/2020 17:00
4277678,,,Chongqing,Mainland China,NaT,6.00,,,,1/22/2020 17:00
4277679,,,Cook Islands,New Zealand,NaT,0.00,0.00,0.00,,1/22/2020 17:00
4277680,,,England,United Kingdom,NaT,0.00,0.00,0.00,,1/22/2020 17:00
...,...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,NaT,371.00,1.00,0.00,,3/21/2020 23:13
4287469,,,Wisconsin,US,NaT,282.00,4.00,0.00,,3/21/2020 23:13
4287470,,,,Cape Verde,NaT,1.00,0.00,0.00,,3/21/2020 23:43
4287471,,,,Papua New Guinea,NaT,1.00,0.00,0.00,,3/21/2020 23:43


Chuyển sang datetime 

In [75]:
filtered_df['Last_Update'] = pd.to_datetime(filtered_df['Last_Update_original'], format='%m/%d/%Y %H:%M')
#filtered_df


Gán lại vào country_df

In [76]:
country_df.loc[filtered_df.index, 'Last_Update'] = filtered_df['Last_Update']


Lọc lại các dòng không chuyển được

In [77]:
try:
    country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'], errors='coerce')
except:
    print("Can't convert!")

# Lọc và in ra các dòng không thể chuyển đổi
invalid_rows = country_df[country_df['Last_Update'].isna()]
invalid_rows


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,Last_Update_original


> Đã chuyển xong <3

In [78]:
country_df.dtypes

FIPS                           float64
Admin2                          object
Province_State                  object
Country_Region                  object
Last_Update             datetime64[ns]
Confirmed                      float64
Deaths                         float64
Recovered                      float64
Active                         float64
Last_Update_original            object
dtype: object

Xoá cột `Last_Update_original` và chuyển datetime về date

In [79]:
country_df.drop(columns=['Last_Update_original'], inplace=True)
country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update']).dt.date
country_df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active
0,,,,Afghanistan,2021-01-02,52513.00,2201.00,41727.00,8585.00
1,,,,Albania,2021-01-02,58316.00,1181.00,33634.00,23501.00
2,,,,Algeria,2021-01-02,99897.00,2762.00,67395.00,29740.00
3,,,,Andorra,2021-01-02,8117.00,84.00,7463.00,570.00
4,,,,Angola,2021-01-02,17568.00,405.00,11146.00,6017.00
...,...,...,...,...,...,...,...,...,...
4287468,,,Tennessee,US,2020-03-21,371.00,1.00,0.00,
4287469,,,Wisconsin,US,2020-03-21,282.00,4.00,0.00,
4287470,,,,Cape Verde,2020-03-21,1.00,0.00,0.00,
4287471,,,,Papua New Guinea,2020-03-21,1.00,0.00,0.00,


In [80]:
country_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Confirmed         float64
Deaths            float64
Recovered         float64
Active            float64
dtype: object

In [81]:
country_df.shape

(4287473, 9)

In [82]:
# Chuyển đổi cột 'Last_Update' sang định dạng datetime
country_df['Last_Update'] = pd.to_datetime(country_df['Last_Update'])

# Lọc ra các dòng có Last_Update sau ngày 5/8/2021
drop_indices = country_df[country_df['Last_Update'] > '2021-08-05'].index

# Drop các dòng có index trong drop_indices
country_df = country_df.drop(drop_indices)

In [83]:
#fixing Country names
# ====================

# renaming countries, regions, provinces
country_df['Country_Region'] = country_df['Country_Region'].replace('Korea, South', 'South Korea')
country_df['Country_Region'] = country_df['Country_Region'].replace('Korea, North', 'North Korea')

# Greenland
country_df.loc[country_df['Province_State']=='Greenland', 'Country_Region'] = 'Greenland'

# Mainland china to China
country_df['Country_Region'] = country_df['Country_Region'].replace('Mainland China', 'China')

# Vietnam
country_df['Country_Region'] = country_df['Country_Region'].replace('Viet Nam', 'Vietnam')


In [84]:
# Active Case = confirmed - deaths - recovered
country_df['Active'] = country_df['Confirmed'] - country_df['Deaths'] - country_df['Recovered']


In [85]:
grouped_df = country_df.groupby(['Country_Region', 'Last_Update'])[['Confirmed','Deaths','Recovered','Active']].sum().reset_index()
grouped_df.shape

(102986, 6)

In [86]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active
0,Azerbaijan,2020-02-28,1.00,0.00,0.00,1.00
1,Afghanistan,2020-02-24,68.00,0.00,0.00,68.00
2,Afghanistan,2020-03-08,16.00,0.00,0.00,16.00
3,Afghanistan,2020-03-10,8.00,0.00,0.00,8.00
4,Afghanistan,2020-03-11,33.00,0.00,0.00,33.00
...,...,...,...,...,...,...
102981,Zimbabwe,2021-08-03,110855.00,3635.00,79420.00,27800.00
102982,Zimbabwe,2021-08-04,112435.00,3676.00,81570.00,27189.00
102983,Zimbabwe,2021-08-05,113526.00,3711.00,82994.00,26821.00
102984,occupied Palestinian territory,2020-03-10,25.00,0.00,0.00,25.00


In [87]:
#grouped_df.to_csv("grouped.csv")

In [88]:
# Active Case = confirmed - deaths - recovered
grouped_df['Active'] = grouped_df['Confirmed'] - grouped_df['Deaths'] - grouped_df['Recovered']

# WHO Region

https://en.wikipedia.org/wiki/WHO_regions

In [89]:
who_region = {}

# African Region AFRO
afro = "Algeria, Angola, Cabo Verde, Eswatini, Sao Tome and Principe, Benin, South Sudan, Western Sahara, Congo (Brazzaville), Congo (Kinshasa), Cote d'Ivoire, Botswana, Burkina Faso, Burundi, Cameroon, Cape Verde, Central African Republic, Chad, Comoros, Ivory Coast, Democratic Republic of the Congo, Equatorial Guinea, Eritrea, Ethiopia, Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Kenya, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Mauritius, Mozambique, Namibia, Niger, Nigeria, Republic of the Congo, Rwanda, São Tomé and Príncipe, Senegal, Seychelles, Sierra Leone, Somalia, South Africa, Swaziland, Togo, Uganda, Tanzania, Zambia, Zimbabwe"
afro = [i.strip() for i in afro.split(',')]
for i in afro:
    who_region[i] = 'Africa'
    
# Region of the Americas PAHO
paho = 'Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Canada, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, United States, US, Uruguay, Venezuela'
paho = [i.strip() for i in paho.split(',')]
for i in paho:
    who_region[i] = 'Americas'

# South-East Asia Region SEARO
searo = 'Bangladesh, Bhutan, North Korea, India, Indonesia, Maldives, Myanmar, Burma, Nepal, Sri Lanka, Thailand, Timor-Leste'
searo = [i.strip() for i in searo.split(',')]
for i in searo:
    who_region[i] = 'South-East Asia'

# European Region EURO
euro = 'Albania, Andorra, Greenland, Kosovo, Holy See, Liechtenstein, Armenia, Czechia, Austria, Azerbaijan, Belarus, Belgium, Bosnia and Herzegovina, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Luxembourg, Malta, Monaco, Montenegro, Netherlands, North Macedonia, Norway, Poland, Portugal, Moldova, Romania, Russia, San Marino, Serbia, Slovakia, Slovenia, Spain, Sweden, Switzerland, Tajikistan, Turkey, Turkmenistan, Ukraine, United Kingdom, Uzbekistan'
euro = [i.strip() for i in euro.split(',')]
for i in euro:
    who_region[i] = 'Europe'

# Eastern Mediterranean Region EMRO
emro = 'Afghanistan, Bahrain, Djibouti, Egypt, Iran, Iraq, Jordan, Kuwait, Lebanon, Libya, Morocco, Oman, Pakistan, Palestine, West Bank and Gaza, Qatar, Saudi Arabia, Somalia, Sudan, Syria, Tunisia, United Arab Emirates, Yemen'
emro = [i.strip() for i in emro.split(',')]
for i in emro:
    who_region[i] = 'Eastern Mediterranean'

# Western Pacific Region WPRO
wpro = 'Australia, Brunei, Cambodia, China, Cook Islands, Fiji, Japan, Kiribati, Laos, Malaysia, Marshall Islands, Micronesia, Mongolia, Nauru, New Zealand, Niue, Palau, Papua New Guinea, Philippines, South Korea, Samoa, Singapore, Solomon Islands, Taiwan, Taiwan*, Tonga, Tuvalu, Vanuatu, Vietnam'
wpro = [i.strip() for i in wpro.split(',')]
for i in wpro:
    who_region[i] = 'Western Pacific'

In [90]:
# add 'WHO Region' column
grouped_df['WHO Region'] = grouped_df['Country_Region'].map(who_region)

# find missing values
grouped_df[grouped_df['WHO Region'].isna()]['Country_Region'].unique()

array([' Azerbaijan', 'Antarctica', 'Aruba', 'Bahamas, The',
       'Cayman Islands', 'Channel Islands', 'Cruise Ship', 'Curacao',
       'Diamond Princess', 'East Timor', 'Faroe Islands', 'French Guiana',
       'Gambia, The', 'Gibraltar', 'Guadeloupe', 'Guam', 'Guernsey',
       'Hong Kong', 'Hong Kong SAR', 'Iran (Islamic Republic of)',
       'Jersey', 'MS Zaandam', 'Macao SAR', 'Macau', 'Martinique',
       'Mayotte', 'North Ireland', 'Others', 'Puerto Rico',
       'Republic of Ireland', 'Republic of Korea', 'Republic of Moldova',
       'Reunion', 'Russian Federation', 'Saint Barthelemy',
       'Saint Martin', 'St. Martin', 'Summer Olympics 2020',
       'Taipei and environs', 'The Bahamas', 'The Gambia', 'Vatican City',
       'Winter Olympics 2022', 'occupied Palestinian territory'],
      dtype=object)

In [91]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region
0,Azerbaijan,2020-02-28,1.00,0.00,0.00,1.00,
1,Afghanistan,2020-02-24,68.00,0.00,0.00,68.00,Eastern Mediterranean
2,Afghanistan,2020-03-08,16.00,0.00,0.00,16.00,Eastern Mediterranean
3,Afghanistan,2020-03-10,8.00,0.00,0.00,8.00,Eastern Mediterranean
4,Afghanistan,2020-03-11,33.00,0.00,0.00,33.00,Eastern Mediterranean
...,...,...,...,...,...,...,...
102981,Zimbabwe,2021-08-03,110855.00,3635.00,79420.00,27800.00,Africa
102982,Zimbabwe,2021-08-04,112435.00,3676.00,81570.00,27189.00,Africa
102983,Zimbabwe,2021-08-05,113526.00,3711.00,82994.00,26821.00,Africa
102984,occupied Palestinian territory,2020-03-10,25.00,0.00,0.00,25.00,


In [92]:
grouped_df['Last_Update'] = pd.to_datetime(grouped_df['Last_Update'])
# Tạo một cột mới cho 'Previous Day'
grouped_df['Previous Day'] = grouped_df.groupby(['Country_Region'])['Last_Update'].shift(1)
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,Previous Day
0,Azerbaijan,2020-02-28,1.00,0.00,0.00,1.00,,NaT
1,Afghanistan,2020-02-24,68.00,0.00,0.00,68.00,Eastern Mediterranean,NaT
2,Afghanistan,2020-03-08,16.00,0.00,0.00,16.00,Eastern Mediterranean,2020-02-24
3,Afghanistan,2020-03-10,8.00,0.00,0.00,8.00,Eastern Mediterranean,2020-03-08
4,Afghanistan,2020-03-11,33.00,0.00,0.00,33.00,Eastern Mediterranean,2020-03-10
...,...,...,...,...,...,...,...,...
102981,Zimbabwe,2021-08-03,110855.00,3635.00,79420.00,27800.00,Africa,2021-08-02
102982,Zimbabwe,2021-08-04,112435.00,3676.00,81570.00,27189.00,Africa,2021-08-03
102983,Zimbabwe,2021-08-05,113526.00,3711.00,82994.00,26821.00,Africa,2021-08-04
102984,occupied Palestinian territory,2020-03-10,25.00,0.00,0.00,25.00,,NaT


In [93]:
grouped_df.to_csv("grouped_preD0.csv")

In [94]:

# Tính toán New cases, New deaths và New recovered cho mỗi quốc gia
grouped_df['New cases'] = grouped_df.groupby('Country_Region')['Confirmed'].diff()
grouped_df['New deaths'] = grouped_df.groupby('Country_Region')['Deaths'].diff()
grouped_df['New recovered'] = grouped_df.groupby('Country_Region')['Recovered'].diff()

# Điền giá trị rỗng của hàng đầu tiên bằng giá trị của ngày hiện tại
grouped_df['New cases'].fillna(grouped_df['Confirmed'], inplace=True)
grouped_df['New deaths'].fillna(grouped_df['Deaths'], inplace=True)
grouped_df['New recovered'].fillna(grouped_df['Recovered'], inplace=True)

In [95]:
grouped_df

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Active,WHO Region,Previous Day,New cases,New deaths,New recovered
0,Azerbaijan,2020-02-28,1.00,0.00,0.00,1.00,,NaT,1.00,0.00,0.00
1,Afghanistan,2020-02-24,68.00,0.00,0.00,68.00,Eastern Mediterranean,NaT,68.00,0.00,0.00
2,Afghanistan,2020-03-08,16.00,0.00,0.00,16.00,Eastern Mediterranean,2020-02-24,-52.00,0.00,0.00
3,Afghanistan,2020-03-10,8.00,0.00,0.00,8.00,Eastern Mediterranean,2020-03-08,-8.00,0.00,0.00
4,Afghanistan,2020-03-11,33.00,0.00,0.00,33.00,Eastern Mediterranean,2020-03-10,25.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
102981,Zimbabwe,2021-08-03,110855.00,3635.00,79420.00,27800.00,Africa,2021-08-02,1309.00,52.00,2755.00
102982,Zimbabwe,2021-08-04,112435.00,3676.00,81570.00,27189.00,Africa,2021-08-03,1580.00,41.00,2150.00
102983,Zimbabwe,2021-08-05,113526.00,3711.00,82994.00,26821.00,Africa,2021-08-04,1091.00,35.00,1424.00
102984,occupied Palestinian territory,2020-03-10,25.00,0.00,0.00,25.00,,NaT,25.00,0.00,0.00


In [96]:
grouped_df.describe()

Unnamed: 0,Last_Update,Confirmed,Deaths,Recovered,Active,Previous Day,New cases,New deaths,New recovered
count,102986,102986.0,102986.0,102986.0,102986.0,102739,102986.0,102986.0,102986.0
mean,2020-11-21 18:44:13.176159744,375539.56,8946.99,228149.44,138443.13,2020-11-21 09:19:30.519471360,1949.59,41.59,1271.08
min,2020-01-22 00:00:00,0.0,0.0,0.0,-163828.0,2020-01-22 00:00:00,-21887422.0,-200626.0,-21360928.0
25%,2020-07-17 00:00:00,771.0,11.0,265.0,90.0,2020-07-17 00:00:00,0.0,0.0,0.0
50%,2020-11-23 00:00:00,11444.0,179.0,6452.5,1530.0,2020-11-23 00:00:00,52.0,0.0,14.0
75%,2021-03-31 00:00:00,124890.5,2274.0,81618.0,14812.5,2021-03-30 00:00:00,627.0,10.0,348.0
max,2021-08-05 00:00:00,35909045.0,632648.0,30974748.0,35276397.0,2021-08-04 00:00:00,21920743.0,200902.0,21392193.0
std,,1990171.94,39650.55,1215686.38,1461087.95,,99398.6,997.29,96824.55


Recheck data

In [97]:
grouped_df.to_csv('danie.csv')
near_full = grouped_df.copy()

# Country wise latest

In [98]:
# Country wise
# ============
full_grouped = grouped_df.copy()

full_grouped['Last_Update'] = pd.to_datetime(full_grouped['Last_Update'])

# getting latest values
country_wise = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])] \
                    .reset_index(drop=True) \
                    .drop('Last_Update', axis=1)

print(country_wise.shape)

# group by country
country_wise = country_wise.groupby('Country_Region')[['Confirmed', 'Deaths',
                                                      'Recovered', 'Active',
                                                      'New cases', 'New deaths', 'New recovered']].sum().reset_index()
print(country_wise.shape)


# per 100 cases
country_wise['Deaths / 100 Cases'] = round((country_wise['Deaths']/country_wise['Confirmed'])*100, 2)
country_wise['Recovered / 100 Cases'] = round((country_wise['Recovered']/country_wise['Confirmed'])*100, 2)
country_wise['Deaths / 100 Recovered'] = round((country_wise['Deaths']/country_wise['Recovered'])*100, 2)

cols = ['Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered']
country_wise[cols] = country_wise[cols].fillna(0)


# 1 week increase and % change
# ============================

today = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])] \
            .reset_index(drop=True) \
            .drop('Last_Update', axis=1)[['Country_Region', 'Confirmed']]

last_week = full_grouped[full_grouped['Last_Update']==max(full_grouped['Last_Update'])-timedelta(days=7)] \
                .reset_index(drop=True) \
                .drop('Last_Update', axis=1)[['Country_Region', 'Confirmed']]

temp = pd.merge(today, last_week, on='Country_Region', suffixes=(' today', ' last week'))
temp['1 week change'] = temp['Confirmed today'] - temp['Confirmed last week']
temp = temp[['Country_Region', 'Confirmed last week', '1 week change']]

country_wise = pd.merge(country_wise, temp, on='Country_Region')
country_wise['1 week % increase'] = round(country_wise['1 week change']/country_wise['Confirmed last week']*100, 2)
country_wise.head()

country_wise['WHO Region'] = country_wise['Country_Region'].map(who_region)
country_wise[country_wise['WHO Region'].isna()]['Country_Region'].unique()

country_wise.head()

(202, 10)
(202, 8)


Unnamed: 0,Country_Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,148933.0,6836.0,82586.0,59511.0,361.0,32.0,0.0,4.59,55.45,8.28,145552.0,3381.0,2.32,Eastern Mediterranean
1,Albania,133310.0,2457.0,130314.0,539.0,99.0,0.0,23.0,1.84,97.75,1.89,132952.0,358.0,0.27,Europe
2,Algeria,176724.0,4404.0,118409.0,53911.0,1495.0,34.0,852.0,2.49,67.0,3.72,167131.0,9593.0,5.74,Africa
3,Andorra,14797.0,128.0,14380.0,289.0,31.0,0.0,32.0,0.87,97.18,0.89,14586.0,211.0,1.45,Europe
4,Angola,43158.0,1026.0,39582.0,2550.0,88.0,4.0,193.0,2.38,91.71,2.59,42288.0,870.0,2.06,Africa


In [99]:
# save as .csv file
country_wise.to_csv('./Data/country_wise_latest_2021.csv', index=False)

Kiểm tra các giá trị âm và fix

In [46]:
neg_cases = grouped_df[grouped_df['New cases'] < 0]
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
print(neg_cases.shape)
print(neg_deaths.shape)
print(neg_recovered.shape)

(515, 11)
(201, 11)
(363, 11)


Lặp 6 lần để fix - update - fix 

In [47]:
i=0
while (i<8):
    # CASES
    for index, row in neg_cases.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Confirmed'] = grouped_df.loc[mask, 'Confirmed'] + row['New cases']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New cases']= row['Confirmed'] - grouped_df.loc[mask, 'Confirmed']
        grouped_df['New cases'] = grouped_df.groupby('Country_Region')['Confirmed'].diff()
        grouped_df['New cases'].fillna(grouped_df['Confirmed'], inplace=True)
    neg_cases = grouped_df[grouped_df['New cases'] < 0]

    # DEATHS
    for index, row in neg_deaths.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Deaths'] = grouped_df.loc[mask, 'Deaths'] + row['New deaths']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New deaths']=row['Deaths'] - grouped_df.loc[mask, 'Deaths']
        grouped_df['New deaths'] = grouped_df.groupby('Country_Region')['Deaths'].diff()
        grouped_df['New deaths'].fillna(grouped_df['Deaths'], inplace=True)
    neg_deaths = grouped_df[grouped_df['New deaths'] < 0]

    # RECOVERED
    for index, row in neg_recovered.iterrows():
        # Lấy dòng từ grouped_df dựa trên "Country_Region", "WHO Region", và "Last_Update" là ngày liền trước
        mask = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Previous Day'])
        grouped_df.loc[mask, 'Recovered'] = grouped_df.loc[mask, 'Recovered'] + row['New recovered']

        mask2 = (grouped_df['Country_Region'] == row['Country_Region']) & \
            (grouped_df['WHO Region'] == row['WHO Region']) & \
            (grouped_df['Last_Update'] == row['Last_Update'])
        grouped_df.loc[mask2, 'New recovered']= row['Recovered'] - grouped_df.loc[mask, 'Recovered']
        grouped_df['New recovered'] = grouped_df.groupby('Country_Region')['Recovered'].diff()
        grouped_df['New recovered'].fillna(grouped_df['Recovered'], inplace=True)
    neg_recovered = grouped_df[grouped_df['New recovered'] < 0]

    i+=1


In [48]:
neg_cases = grouped_df[grouped_df['New cases'] < 0]
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
print(neg_cases.shape)
print(neg_deaths.shape)
print(neg_recovered.shape)

(55, 11)
(21, 11)
(40, 11)


Số dòng âm rất nhỏ so với số dòng dữ liệu => drop

In [49]:
grouped_df.drop(neg_cases.index, inplace = True)
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
grouped_df.drop(neg_deaths.index, inplace = True)
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
grouped_df.drop(neg_recovered.index, inplace = True)


In [50]:
neg_cases = grouped_df[grouped_df['New cases'] < 0]
neg_deaths = grouped_df[grouped_df['New deaths'] < 0]
neg_recovered = grouped_df[grouped_df['New recovered'] < 0]
print(neg_cases.shape)
print(neg_deaths.shape)
print(neg_recovered.shape)

(0, 11)
(0, 11)
(0, 11)


Xuất fulldata

In [54]:
grouped_df.drop(columns= "Previous Day", inplace = True)
grouped_df.to_csv('./Data/full_data_final.csv', index = False)