In [20]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
# Render floats with two decimal places in displayed output (pandas display option)
pd.set_option('display.float_format', '{:.2f}'.format)

## 1. Nhập dữ liệu vào notebook

In [21]:
# Folder that holds the input data
data_folder = "./Data"
# Create the folder if it is missing (no error when it already exists)
os.makedirs(data_folder, exist_ok=True)
# Paths to the "raw.csv" and "bef.csv" files inside the data folder
file_raw_path = os.path.join(data_folder, "raw.csv")
file_bef_path = os.path.join(data_folder, "bef.csv")


In [22]:
# Load raw.csv into raw_df
raw_df = pd.read_csv(file_raw_path)
# Load bef.csv into bef_df
bef_df = pd.read_csv(file_bef_path)


In [23]:
# (rows, columns) of raw_df as loaded
raw_df.shape

(4278759, 14)

In [24]:
# (rows, columns) of bef_df as loaded
bef_df.shape

(9797, 8)

In [25]:
# Preview the first rows of raw_df
raw_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2021-01-02 05:22:33,33.93911,67.709953,52513,2201,41727,8585,Afghanistan,134.89657830525067,4.191343095995277
1,,,,Albania,2021-01-02 05:22:33,41.1533,20.1683,58316,1181,33634,23501,Albania,2026.409062478282,2.025173194320598
2,,,,Algeria,2021-01-02 05:22:33,28.0339,1.6596,99897,2762,67395,29740,Algeria,227.80986075975437,2.764847793227024
3,,,,Andorra,2021-01-02 05:22:33,42.5063,1.5218,8117,84,7463,570,Andorra,10505.403481524623,1.0348650979425895
4,,,,Angola,2021-01-02 05:22:33,-11.2027,17.8739,17568,405,11146,6017,Angola,53.45298103210258,2.305327868852459


In [26]:
# Preview the first rows of bef_df
bef_df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Latitude,Longitude,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,,,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,,,14.0,,
2,Chongqing,Mainland China,1/22/2020 17:00,,,6.0,,
3,Cook Islands,New Zealand,1/22/2020 17:00,,,0.0,0.0,0.0
4,England,United Kingdom,1/22/2020 17:00,,,0.0,0.0,0.0


In [27]:
# Inspect the last rows of raw_df to check how the file ends
raw_df.tail()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
4278754,,,,Winter Olympics 2022,2023-01-01 04:21:00,39.9042,116.4074,535.0,0.0,,,Winter Olympics 2022,,0.0
4278755,,,,Yemen,2023-01-01 04:21:00,15.552727,48.516388,11945.0,2159.0,,,Yemen,40.04899354817252,18.07450816241105
4278756,,,,Zambia,2023-01-01 04:21:00,-13.133897,27.849332,334425.0,4024.0,,,Zambia,1819.1133616725367,1.2032593257083053
4278757,,,,Zimbabwe,2023-01-01 04:21:00,-19.015438,29.154857,259981.0,5637.0,,,Zimbabwe,1749.1911250051892,2.168235371046346
4278758,,,,,,,,,,,,,,


In [28]:
# Drop the trailing row when every field in it is missing (the last row of
# the loaded file is completely empty). Testing the row itself instead of
# dropping a hard-coded index label keeps this cell safe to re-run: once the
# empty row is gone, the condition is simply False.
if not raw_df.empty and raw_df.iloc[-1].isna().all():
    raw_df = raw_df.iloc[:-1]

# Renumber the index from 0 and discard the old index values
raw_df = raw_df.reset_index(drop=True)
raw_df.tail()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
4278753,,,,West Bank and Gaza,2023-01-01 04:21:00,31.9522,35.2332,703228,5708,,,West Bank and Gaza,13784.956960969268,0.8116855415313383
4278754,,,,Winter Olympics 2022,2023-01-01 04:21:00,39.9042,116.4074,535,0,,,Winter Olympics 2022,,0.0
4278755,,,,Yemen,2023-01-01 04:21:00,15.552727,48.516388,11945,2159,,,Yemen,40.04899354817252,18.07450816241105
4278756,,,,Zambia,2023-01-01 04:21:00,-13.133897,27.849332,334425,4024,,,Zambia,1819.1133616725367,1.2032593257083053
4278757,,,,Zimbabwe,2023-01-01 04:21:00,-19.015438,29.154857,259981,5637,,,Zimbabwe,1749.1911250051892,2.168235371046346


In [29]:
# Inspect the last rows of bef_df
bef_df.tail()

Unnamed: 0,Province/State,Country/Region,Last Update,Latitude,Longitude,Confirmed,Deaths,Recovered
9792,Tennessee,US,3/21/2020 23:13,35.75,-86.69,371.0,1.0,0.0
9793,Wisconsin,US,3/21/2020 23:13,44.27,-89.62,282.0,4.0,0.0
9794,,Cape Verde,3/21/2020 23:43,15.11,-23.62,1.0,0.0,0.0
9795,,Papua New Guinea,3/21/2020 23:43,-6.32,143.96,1.0,0.0,0.0
9796,,Uganda,3/21/2020 23:43,1.0,32.0,1.0,0.0,0.0


In [30]:
# Check the dtype pandas inferred for each raw_df column
raw_df.dtypes

FIPS                   object
Admin2                 object
Province_State         object
Country_Region         object
Last_Update            object
Lat                    object
Long_                  object
Confirmed              object
Deaths                 object
Recovered              object
Active                 object
Combined_Key           object
Incident_Rate          object
Case_Fatality_Ratio    object
dtype: object

In [31]:
# Helper that tells whether a single value can be parsed as a float
def can_convert_to_float(value):
    """Report whether ``value`` can be turned into a ``np.float64``.

    Returns True when ``np.float64(value)`` succeeds and False when the
    conversion raises ``ValueError`` or ``TypeError``. Any other exception
    is not handled here and propagates to the caller.
    """
    try:
        np.float64(value)
    except (ValueError, TypeError):
        return False
    else:
        return True

# Flag, row by row, whether the 'Lat' value can be parsed as a float
lat_is_numeric = raw_df['Lat'].apply(can_convert_to_float)

# Rows whose 'Lat' cannot be parsed; kept under this name for inspection
rows_with_unconvertible_lat = raw_df[~lat_is_numeric]

# Rebind raw_df to a frame without those rows instead of dropping in place,
# so the frame is not mutated behind the reader's back across cells
raw_df = raw_df.drop(index=rows_with_unconvertible_lat.index)


In [32]:
# Renumber the index from 0 after rows were dropped, then re-check the last rows
raw_df = raw_df.reset_index(drop=True)
raw_df.tail()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
4277671,,,,West Bank and Gaza,2023-01-01 04:21:00,31.9522,35.2332,703228,5708,,,West Bank and Gaza,13784.956960969268,0.8116855415313383
4277672,,,,Winter Olympics 2022,2023-01-01 04:21:00,39.9042,116.4074,535,0,,,Winter Olympics 2022,,0.0
4277673,,,,Yemen,2023-01-01 04:21:00,15.552727,48.516388,11945,2159,,,Yemen,40.04899354817252,18.07450816241105
4277674,,,,Zambia,2023-01-01 04:21:00,-13.133897,27.849332,334425,4024,,,Zambia,1819.1133616725367,1.2032593257083053
4277675,,,,Zimbabwe,2023-01-01 04:21:00,-19.015438,29.154857,259981,5637,,,Zimbabwe,1749.1911250051892,2.168235371046346


In [33]:
# Columns that hold numeric measurements; each one is cast to float64
num_cols = [
    'Lat', 'Long_',
    'Confirmed', 'Deaths', 'Recovered', 'Active',
    'Incident_Rate', 'Case_Fatality_Ratio',
]
raw_df = raw_df.astype({col: np.float64 for col in num_cols})
raw_df.dtypes

FIPS                    object
Admin2                  object
Province_State          object
Country_Region          object
Last_Update             object
Lat                    float64
Long_                  float64
Confirmed              float64
Deaths                 float64
Recovered              float64
Active                 float64
Combined_Key            object
Incident_Rate          float64
Case_Fatality_Ratio    float64
dtype: object

In [34]:
# Check the dtype pandas inferred for each bef_df column
bef_df.dtypes

Province/State     object
Country/Region     object
Last Update        object
Latitude          float64
Longitude         float64
Confirmed         float64
Deaths            float64
Recovered         float64
dtype: object

In [35]:
# Align bef_df's column names with raw_df's naming. An explicit old -> new
# mapping is used instead of positional assignment so that a change in the
# CSV's column order cannot silently attach a name to the wrong column.
# 'Confirmed', 'Deaths' and 'Recovered' already match and are left untouched.
bef_df = bef_df.rename(columns={
    'Province/State': 'Province_State',
    'Country/Region': 'Country_Region',
    'Last Update': 'Last_Update',
    'Latitude': 'Lat',
    'Longitude': 'Long_',
})


In [36]:

# Stack bef_df underneath raw_df. Columns that exist only in raw_df are
# filled with NaN for the bef_df rows. (An outer merge on the shared columns
# was tried earlier and replaced by this row-wise concat.)

# Guard the schema first: every bef_df column must already exist in raw_df,
# otherwise the concat would silently add extra, mostly-empty columns.
unexpected_cols = set(bef_df.columns) - set(raw_df.columns)
assert not unexpected_cols, f"bef_df has columns not present in raw_df: {unexpected_cols}"

merged_df = pd.concat([raw_df, bef_df], ignore_index=True, sort=False)

# Show a random sample of the result; the fixed seed makes the displayed
# rows reproducible across runs
merged_df.sample(20, random_state=0)


Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
3530576,48111.0,Dallam,Texas,US,2022-10-31 04:21:36,36.28,-102.6,2330.0,44.0,,,"Dallam, Texas, US",31974.75,1.89
1461060,41047.0,Marion,Oregon,US,2022-05-10 04:20:50,44.9,-122.58,70376.0,720.0,,,"Marion, Oregon, US",20233.57,1.02
1968480,51071.0,Giles,Virginia,US,2021-06-23 04:21:46,37.31,-80.71,1329.0,22.0,,,"Giles, Virginia, US",7948.56,1.66
2658664,13081.0,Crisp,Georgia,US,2021-08-20 04:21:33,31.92,-83.77,2446.0,94.0,,,"Crisp, Georgia, US",10933.31,3.84
1391134,2185.0,North Slope,Alaska,US,2022-05-04 04:20:59,69.31,-153.48,4459.0,11.0,,,"North Slope, Alaska, US",45351.91,0.25
4081773,5011.0,Bradley,Arkansas,US,2022-12-16 04:21:01,33.47,-92.16,3511.0,57.0,,,"Bradley, Arkansas, US",32621.02,1.62
360484,45045.0,Greenville,South Carolina,US,2023-01-31 04:20:32,34.9,-82.37,204588.0,2111.0,,,"Greenville, South Carolina, US",39077.67,1.03
3528751,19167.0,Sioux,Iowa,US,2022-10-31 04:21:36,43.08,-96.18,8589.0,91.0,,,"Sioux, Iowa, US",24642.09,1.06
1184485,22125.0,West Feliciana,Louisiana,US,2020-04-15 22:56:51,30.88,-91.41,47.0,0.0,0.0,47.0,"West Feliciana, Louisiana, US",,
3844350,,,Bremen,Germany,2020-11-27 05:26:53,53.08,8.8,9642.0,114.0,7812.0,1831.0,"Bremen, Germany",1434.73,1.18


In [37]:
# Show the last 20 rows of the combined frame
merged_df.tail(20)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
4287453,,,Alabama,US,3/21/2020 23:13,32.32,-86.9,131.0,0.0,0.0,,,,
4287454,,,Alaska,US,3/21/2020 23:13,61.37,-152.4,15.0,0.0,0.0,,,,
4287455,,,Arizona,US,3/21/2020 23:13,33.73,-111.43,118.0,1.0,0.0,,,,
4287456,,,Colorado,US,3/21/2020 23:13,39.06,-105.31,390.0,4.0,0.0,,,,
4287457,,,Delaware,US,3/21/2020 23:13,39.32,-75.51,45.0,0.0,0.0,,,,
4287458,,,Florida,US,3/21/2020 23:13,27.77,-81.69,659.0,13.0,0.0,,,,
4287459,,,Illinois,US,3/21/2020 23:13,40.35,-88.99,753.0,6.0,0.0,,,,
4287460,,,Indiana,US,3/21/2020 23:13,39.85,-86.26,128.0,4.0,0.0,,,,
4287461,,,Kansas,US,3/21/2020 23:13,38.53,-96.73,57.0,2.0,0.0,,,,
4287462,,,Maine,US,3/21/2020 23:13,44.69,-69.38,70.0,0.0,0.0,,,,


In [38]:
# (rows, columns) of the combined frame
merged_df.shape

(4287473, 14)