# Step 2 - Describe Data 

This notebook provides the Python code for opening the COVID total cases file and computing descriptive statistics.

Students will be developing a similar notebook for total deaths.  The corresponding notebook is included in the answer section.

#  		Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

# Set Input Folder

Depending on the operating system you are using, file access may differ.
Choose your environment by setting its flag to True and keeping the rest False.

In [2]:
# Environment selector flags. The cells below read them to decide whether to
# mount Google Drive and which input folder path to assign to dir_input.
# Set exactly one flag to True and leave the others False.
using_Google_colab = False
using_Anaconda_on_Mac_or_Linux_or_Azure = True
using_Anaconda_on_windows = False

In [3]:
# Only when running on Google Colab: mount Google Drive at /content/drive.
# The import sits inside the guard, so google.colab is only required when
# using_Google_colab is True.
if using_Google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

In [4]:
# Resolve the input folder for the selected environment.
#
# Exactly one environment flag must be True. With none set, dir_input would
# never be assigned and later cells would fail with a NameError; with several
# set, one branch would silently override the others. Fail early with a clear
# message instead.
if sum(map(bool, (using_Google_colab,
                  using_Anaconda_on_Mac_or_Linux_or_Azure,
                  using_Anaconda_on_windows))) != 1:
    raise ValueError(
        "Set exactly one of using_Google_colab, "
        "using_Anaconda_on_Mac_or_Linux_or_Azure, "
        "using_Anaconda_on_windows to True."
    )

if using_Google_colab:
    # Folder inside the Google Drive mounted at /content/drive.
    dir_input = "/content/drive/MyDrive/COVID_Project/input"
elif using_Anaconda_on_Mac_or_Linux_or_Azure:
    dir_input = "../input"
else:
    # Only using_Anaconda_on_windows can be True here; raw string keeps the
    # backslash literal.
    dir_input = r"..\input"

#  		Q 2-2 Open USA Facts Data Source and list data sets

In [5]:
# Build the path to the "USA_Facts" data-source folder inside the input directory.
path_USA_Facts = os.path.join(dir_input, "USA_Facts")
# Bare last expression so the notebook displays the resulting path.
path_USA_Facts

'../input/USA_Facts'

In [6]:
# List the entry names in the "USA_Facts" folder. The folder path is rebuilt
# from dir_input here rather than taken from path_USA_Facts.
# NOTE(review): os.listdir returns names in arbitrary order; wrap the call in
# sorted() if a stable order is needed.
files_USA_Facts = os.listdir(os.path.join(dir_input, "USA_Facts"))
# Bare last expression so the notebook displays the list.
files_USA_Facts

['covid_deaths_usafacts.csv',
 'covid_county_population_usafacts.csv',
 'COVID19_CDC_Vaccination_CSV_Download.csv',
 'covid_confirmed_usafacts.csv',
 'unemployment_rate_usafacts.csv']

# Q 2-2 Count the number of files included in this data source

In [7]:
# Number of entries returned by the directory listing of the USA_Facts folder.
len(files_USA_Facts)

5

#  Q 2-3 Open “covid_confirmed_usafacts” dataset file from USA Facts Data Source

In [8]:
# Build and display a path for the "covid_confirmed_usafacts" dataset.
# NOTE(review): this rebinds path_USA_Facts, which previously held the USA_Facts
# folder path, and the name built here has no ".csv" extension. The read_csv
# call in the next cell builds its own path (with the extension) and does not
# use this variable.
path_USA_Facts = os.path.join(dir_input, "USA_Facts", "covid_confirmed_usafacts")
path_USA_Facts

'../input/USA_Facts/covid_confirmed_usafacts'

In [9]:
# Load the confirmed-cases CSV from the USA_Facts folder into a DataFrame.
df_covid_confirmed = pd.read_csv(
    filepath_or_buffer=os.path.join(
        dir_input,
        "USA_Facts",
        "covid_confirmed_usafacts.csv",
    )
)

# Q 2-3	Review fields included in this  Data Set - “covid_confirmed_usafacts” 

In [10]:
# Column labels of the loaded frame: identifier fields (countyFIPS, County Name,
# State, StateFIPS) followed by one column per date.
df_covid_confirmed.columns

Index(['countyFIPS', 'County Name', 'State', 'StateFIPS', '2020-01-22',
       '2020-01-23', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27',
       ...
       '2022-01-18', '2022-01-19', '2022-01-20', '2022-01-21', '2022-01-22',
       '2022-01-23', '2022-01-24', '2022-01-25', '2022-01-26', '2022-01-27'],
      dtype='object', length=741)

# Q 2-3	Find number of rows and columns in this  Data Set - “covid_confirmed_usafacts” 

In [11]:
# (number of rows, number of columns) of the loaded frame.
df_covid_confirmed.shape


(3193, 741)

# Q 2-3 Sample data contents included in “covid_confirmed_usafacts” dataset

In [12]:
# Bare frame as the last expression: the notebook renders a truncated preview
# (leading and trailing rows/columns, with "..." for the elided middle).
df_covid_confirmed

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,12738,12833,12928,13019,13019,13019,13251,13251,13251,13251
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,47143,47662,48338,49168,49168,49168,50313,50313,50313,50313
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,4741,4800,4843,4902,4902,4902,5054,5054,5054,5054
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,5385,5486,5565,5663,5663,5663,5795,5795,5795,5795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,9082,9184,9241,9449,9449,9449,9609,9712,9810,10007
3189,56039,Teton County,WY,56,0,0,0,0,0,0,...,8531,8638,8741,8814,8814,8814,8960,9049,9121,9195
3190,56041,Uinta County,WY,56,0,0,0,0,0,0,...,4660,4751,4827,4927,4927,4927,5034,5081,5167,5222
3191,56043,Washakie County,WY,56,0,0,0,0,0,0,...,1994,2002,2023,2025,2025,2025,2041,2066,2093,2130


# Q 2-4 Determine basic statistics on “covid_confirmed_usafacts” dataset file from USA Facts Data Source

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): countyFIPS and StateFIPS are included in the result; they look
# like identifier codes, so their statistics are presumably not meaningful.
df_covid_confirmed.describe()

Unnamed: 0,countyFIPS,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,...,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27
count,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,...,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0
mean,29898.348262,30.259004,0.226433,0.229878,0.23207,0.235202,0.237707,0.240839,0.242718,0.243971,...,20829.03,21162.64,21356.0,21556.96,21640.71,21721.85,22034.08,22164.89,22376.6,22494.89
std,15515.884232,15.153897,6.899731,6.97775,7.031192,7.075284,7.096643,7.16,7.199078,7.220848,...,71330.25,72551.8,73149.86,73925.26,74255.59,74833.99,75601.79,75974.33,76325.3,76516.86
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18101.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013.0,2066.0,2081.0,2108.0,2108.0,2108.0,2141.0,2149.0,2199.0,2204.0
50%,29125.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5107.0,5227.0,5285.0,5339.0,5359.0,5363.0,5459.0,5492.0,5561.0,5604.0
75%,45055.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13467.0,13799.0,13915.0,14080.0,14172.0,14244.0,14371.0,14528.0,14711.0,14769.0
max,56045.0,56.0,375.0,379.0,382.0,384.0,385.0,388.0,390.0,391.0,...,2276388.0,2343261.0,2367401.0,2384427.0,2390482.0,2430653.0,2453693.0,2468026.0,2472960.0,2473095.0


In [14]:
# Summary statistics for three selected dates only.
# The columns are selected before calling describe(), so statistics are not
# computed for every date column just to discard all but three. For these
# numeric columns the result is the same as describing the whole frame and
# then selecting.
df_covid_confirmed[['2020-12-18', '2020-12-19', '2020-12-20']].describe()

Unnamed: 0,2020-12-18,2020-12-19,2020-12-20
count,3193.0,3193.0,3193.0
mean,5471.10523,5538.622612,5589.872534
std,19381.24075,19685.512575,19918.522676
min,0.0,0.0,0.0
25%,571.0,579.0,586.0
50%,1388.0,1406.0,1416.0
75%,3672.0,3708.0,3730.0
max,640412.0,655974.0,668041.0
