In [12]:
##################################################
## This work is about obtaining some insights from the data set of Johns Hopkins University COVID-19 tracking project.
## The data can be found on this link: https://data.world/associatedpress/johns-hopkins-coronavirus-case-tracker
## The data was accessed on the date: 6th of August, 2020.
## Credit to "Johns Hopkins University COVID-19 tracking project".
##################################################
## Author: Efe Buyuk
## Credits: Johns Hopkins University COVID-19 tracking project
## Version: 1.0.0
## Maintainer: Efe Buyuk
## Status: Dev
##################################################

In [1]:
import pandas as pd

# save filepaths to variables for easier access
county_level_confirmed_cases_file_path = "datasets/1_county_level_confirmed_cases.csv"
cases_and_deaths_by_county_file_path = "datasets/2_cases_and_deaths_by_county_timeseries.csv"
cases_and_deaths_by_state_file_path = "datasets/3_cases_and_deaths_by_state_timeseries.csv"


# read the data and store data in DataFrames
country_level_data = pd.read_csv(county_level_confirmed_cases_file_path)
cases_and_deaths_by_county_data = pd.read_csv(cases_and_deaths_by_county_file_path)
cases_and_deaths_by_state_data = pd.read_csv(cases_and_deaths_by_state_file_path)


# print a summary of the data in County Level Confirmed Cases
country_level_data.describe()

Unnamed: 0,fips_code,lat,lon,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000
count,3239.0,3188.0,3188.0,3180.0,3249.0,3180.0,3249.0,3180.0
mean,32352.570855,37.927155,-91.486168,102326.9,1484.399508,1034.252918,48.701754,24.322091
std,17845.649909,6.073878,13.191672,353053.1,7539.089999,1041.812282,459.310683,38.450354
min,1001.0,17.982429,-174.1596,102.0,0.0,0.0,0.0,0.0
25%,19054.0,34.312874,-97.831977,11428.25,54.0,358.83,0.0,0.0
50%,30067.0,38.160798,-89.900575,26267.0,196.0,704.335,3.0,10.15
75%,47038.0,41.688491,-82.961755,66754.0,689.0,1393.115,15.0,31.065
max,90056.0,69.314792,-65.28813,10098050.0,226581.0,16462.97,23563.0,398.36


In [2]:
# print a summary of the data in Cases and Deaths by County
cases_and_deaths_by_county_data.describe()

Unnamed: 0,uid,fips_code,total_population,cumulative_cases,cumulative_cases_per_100_000,cumulative_deaths,cumulative_deaths_per_100_000,new_cases,new_deaths,new_cases_per_100_000,new_deaths_per_100_000,new_cases_7_day_rolling_avg,new_deaths_7_day_rolling_avg
count,640644.0,638674.0,618186.0,640644.0,618186.0,640644.0,618186.0,637392.0,637392.0,615048.0,615048.0,622800.0,624135.0
mean,84032250.0,32135.425355,102900.9,413.652326,248.871296,19.519171,7.898393,7.535887,0.247868,5.280011,0.125749,7.596179,0.262687
std,17898.08,17802.186102,355290.5,3794.59283,593.420937,314.133793,23.031229,67.623934,5.614353,25.424309,1.149579,62.962349,4.581803
min,84001000.0,1001.0,75.0,0.0,0.0,0.0,0.0,-2028.0,-1868.0,-2202.64,-165.32,0.0,0.0
25%,84019050.0,19043.0,10888.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,84030060.0,30046.0,25715.0,6.0,35.53,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0
75%,84047030.0,47015.0,66944.0,78.0,229.22,2.0,4.71,1.0,0.0,3.73,0.0,1.857143,0.0
max,84100000.0,99999.0,10098050.0,226581.0,16462.97,23563.0,398.36,7837.0,814.0,9370.1,168.07,5214.142857,760.142857


In [3]:
# print a summary of the data in Cases and Deaths by State
cases_and_deaths_by_state_data.describe()

Unnamed: 0,total_population,cumulative_cases,cumulative_cases_per_100_000,cumulative_deaths,cumulative_deaths_per_100_000,new_cases,new_cases_7_day_rolling_avg,new_deaths,new_deaths_7_day_rolling_avg,new_deaths_per_100_000,new_cases_per_100_000
count,10244.0,11032.0,10244.0,11032.0,10244.0,10976.0,10752.0,10976.0,10750.0,10192.0,10192.0
mean,6353127.0,24090.179931,356.957177,1135.081762,15.463199,439.480412,434.212758,14.417547,14.582699,0.200701,6.382597
std,7214623.0,58634.754005,495.328196,3519.008307,29.54791,1180.467935,1134.552938,52.652624,50.75422,0.459596,9.332886
min,577737.0,0.0,0.0,0.0,0.0,-140.0,0.0,-131.0,0.0,-0.99,-4.64
25%,1792926.0,2.0,0.13,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0
50%,4329558.0,2252.0,128.86,57.0,3.78,62.0,71.5,1.0,1.571429,0.05,3.12
75%,7262632.0,20572.25,527.415,673.25,15.6625,411.25,410.428571,10.0,10.714286,0.21,8.3825
max,39557040.0,530606.0,2702.65,32754.0,178.0,15300.0,11870.0,1081.0,974.285714,7.13,82.4


In [4]:
# total population of all counties
total_population = country_level_data['total_population'].sum()
print(total_population)

325399523.0


In [8]:
# check NA values for all columns of country_level_data dataset
country_level_data.columns.isna().sum()

0

In [7]:
# check NA values for all columns of cases_and_deaths_by_county_data dataset
cases_and_deaths_by_county_data.isna().sum()

uid                                  0
location_type                        0
fips_code                         1970
location_name                      394
state                                0
date                                 0
total_population                 22458
cumulative_cases                     0
cumulative_cases_per_100_000     22458
cumulative_deaths                    0
cumulative_deaths_per_100_000    22458
new_cases                         3252
new_deaths                        3252
new_cases_per_100_000            25596
new_deaths_per_100_000           25596
new_cases_7_day_rolling_avg      17844
new_deaths_7_day_rolling_avg     16509
dtype: int64

In [6]:
# check NA values for all columns of cases_and_deaths_by_state_data dataset
cases_and_deaths_by_state_data.isna().sum()

state                              0
date                               0
total_population                 788
cumulative_cases                   0
cumulative_cases_per_100_000     788
cumulative_deaths                  0
cumulative_deaths_per_100_000    788
new_cases                         56
new_cases_7_day_rolling_avg      280
new_deaths                        56
new_deaths_7_day_rolling_avg     282
new_deaths_per_100_000           840
new_cases_per_100_000            840
dtype: int64