### STEP 1: DATA EXTRACTION:

In [98]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.width', 35)
pd.set_option('display.max_columns', 35)

In [101]:

from sqlalchemy import create_engine, MetaData, Table, select, text
import pandas as pd

# Connection string using Windows Authentication (no username/password needed)
# Replace 'your_server_name' and 'your_database_name' with actual values
server_name = r"DEBBY\MSSQLSERVER06"  # e.g., "localhost" or "SERVER01\SQLEXPRESS"
database_name = "datawarehouse"   # your database name

# Create connection string for Windows Authentication
connection_string = f"mssql+pyodbc://{server_name}/{database_name}?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"

# Alternative connection string format
# connection_string = f"mssql+pyodbc://@{server_name}/{database_name}?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"

# Create engine
engine = create_engine(connection_string)

# Method 1: Using pandas to execute SELECT * (Recommended for data analysis)
try:
    # Simple SELECT * from table
    query = "SELECT * from [datawarehouse].[dbo].[fact_unified_covid]"
    covid_df = pd.read_sql(query, engine)
    print(f"Retrieved {len(covid_df)} rows from table")
except Exception as e:
    print(f"Error: {e}")

Retrieved 537040 rows from table


In [103]:
covid_df.head()

Unnamed: 0,Date,PHU_ID,PHU_Name,Agegroup,At_least_one_dose_cumulative,Second_dose_cumulative,Fully_vaccinated_cumulative,Third_dose_cumulative,Total_Population,Percent_at_least_one_dose,Percent_fully_vaccinated,Percent_3doses,ACTIVE_CASES,RESOLVED_CASES,DEATHS,POSITIVE_CASES_AGEGROUP,POSITIVE_CASES_TOTAL,CASES_BY_CASE_REPORTED_AGEGRP,CASES_BY_TEST_REPORTED_AGEGRP,CASES_BY_SPECIMEN_AGEGRP,CASES_BY_CASE_REPORTED_TOTAL,CASES_BY_TEST_REPORTED_TOTAL,CASES_BY_SPECIMEN_TOTAL,Reporting_PHU_City,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830,855,0,0,6672,0.27,0.13,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622,6222,0,0,15684,0.61,0.4,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125,6667,0,0,12699,0.72,0.52,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273,7236,0,0,12186,0.76,0.59,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080,8936,0,0,16238,0.68,0.55,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31


In [105]:
# 35 unique PHUs. Investigate 9999 which is unknown PHU

covid_df["PHU_ID"].value_counts(dropna=False)

PHU_ID
2226    15344
2263    15344
2256    15344
2257    15344
2258    15344
2260    15344
2261    15344
2262    15344
2265    15344
2253    15344
2266    15344
2268    15344
2270    15344
3895    15344
4913    15344
5183    15344
2255    15344
2251    15344
2227    15344
2238    15344
2230    15344
2233    15344
2234    15344
2235    15344
2236    15344
2237    15344
2240    15344
2249    15344
2241    15344
2242    15344
2243    15344
2244    15344
2246    15344
2247    15344
9999    15344
Name: count, dtype: int64

### STEP 2: DATA CLEANING AND STANDARDIZATION

• Handle missing values using imputation or removal as appropriate. • Remove duplicates and check for consistency in identifiers (PHU_ID, Agegroup, Date). • Detect and treat outliers in case counts and vaccination data. • Standardize variable names and formats across datasets (snake_case format preferred). • Validate merged dataset ensuring PHU_ID, Date, and Agegroup align correctly.

In [107]:
covid_df.head()

Unnamed: 0,Date,PHU_ID,PHU_Name,Agegroup,At_least_one_dose_cumulative,Second_dose_cumulative,Fully_vaccinated_cumulative,Third_dose_cumulative,Total_Population,Percent_at_least_one_dose,Percent_fully_vaccinated,Percent_3doses,ACTIVE_CASES,RESOLVED_CASES,DEATHS,POSITIVE_CASES_AGEGROUP,POSITIVE_CASES_TOTAL,CASES_BY_CASE_REPORTED_AGEGRP,CASES_BY_TEST_REPORTED_AGEGRP,CASES_BY_SPECIMEN_AGEGRP,CASES_BY_CASE_REPORTED_TOTAL,CASES_BY_TEST_REPORTED_TOTAL,CASES_BY_SPECIMEN_TOTAL,Reporting_PHU_City,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830,855,0,0,6672,0.27,0.13,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622,6222,0,0,15684,0.61,0.4,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125,6667,0,0,12699,0.72,0.52,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273,7236,0,0,12186,0.76,0.59,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080,8936,0,0,16238,0.68,0.55,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31


### LOCATE SOLVABLE ISSUES

#### 1. Standardize variable names and formats across datasets (snake_case format preferred)

In [109]:
#####  Standardize variable names and formats across datasets (snake_case format preferred)

import pandas as pd
import re

def standardize_column_names(df):
   """
   Convert DataFrame column names to snake_case format
   """
   covid_df_copy = covid_df.copy()
   old_to_new = {}
   
   for col in covid_df_copy.columns:
       # Convert to snake_case
       new_col = col.strip()  # Remove leading/trailing spaces
       new_col = re.sub(r'[^\w\s]', '', new_col)  # Remove special characters except underscore
       new_col = re.sub(r'\s+', '_', new_col)     # Replace spaces with underscore
       new_col = re.sub(r'_{2,}', '_', new_col)   # Replace multiple underscores with single
       new_col = new_col.lower()                  # Convert to lowercase
       new_col = new_col.strip('_')               # Remove leading/trailing underscores
       
       old_to_new[col] = new_col
   
   # Rename columns
   covid_df_copy = covid_df_copy.rename(columns=old_to_new)
   
   # Print changes
   print("Column name changes:")
   for old, new in old_to_new.items():
       if old != new:
           print(f"  '{old}' → '{new}'")
   
   return covid_df_copy

# Apply to your DataFrame
covid_df = standardize_column_names(covid_df)

print(f"\n✅ Standardized {len(covid_df.columns)} column names to snake_case format")
print(f"New columns: {list(covid_df.columns)}")

Column name changes:
  'Date' → 'date'
  'PHU_ID' → 'phu_id'
  'PHU_Name' → 'phu_name'
  'Agegroup' → 'agegroup'
  'At_least_one_dose_cumulative' → 'at_least_one_dose_cumulative'
  'Second_dose_cumulative' → 'second_dose_cumulative'
  'Fully_vaccinated_cumulative' → 'fully_vaccinated_cumulative'
  'Third_dose_cumulative' → 'third_dose_cumulative'
  'Total_Population' → 'total_population'
  'Percent_at_least_one_dose' → 'percent_at_least_one_dose'
  'Percent_fully_vaccinated' → 'percent_fully_vaccinated'
  'Percent_3doses' → 'percent_3doses'
  'ACTIVE_CASES' → 'active_cases'
  'RESOLVED_CASES' → 'resolved_cases'
  'DEATHS' → 'deaths'
  'POSITIVE_CASES_AGEGROUP' → 'positive_cases_agegroup'
  'POSITIVE_CASES_TOTAL' → 'positive_cases_total'
  'CASES_BY_CASE_REPORTED_AGEGRP' → 'cases_by_case_reported_agegrp'
  'CASES_BY_TEST_REPORTED_AGEGRP' → 'cases_by_test_reported_agegrp'
  'CASES_BY_SPECIMEN_AGEGRP' → 'cases_by_specimen_agegrp'
  'CASES_BY_CASE_REPORTED_TOTAL' → 'cases_by_case_reported_

In [111]:
covid_df.head()

Unnamed: 0,date,phu_id,phu_name,agegroup,at_least_one_dose_cumulative,second_dose_cumulative,fully_vaccinated_cumulative,third_dose_cumulative,total_population,percent_at_least_one_dose,percent_fully_vaccinated,percent_3doses,active_cases,resolved_cases,deaths,positive_cases_agegroup,positive_cases_total,cases_by_case_reported_agegrp,cases_by_test_reported_agegrp,cases_by_specimen_agegrp,cases_by_case_reported_total,cases_by_test_reported_total,cases_by_specimen_total,reporting_phu_city,reporting_phu_latitude,reporting_phu_longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830,855,0,0,6672,0.27,0.13,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622,6222,0,0,15684,0.61,0.4,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125,6667,0,0,12699,0.72,0.52,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273,7236,0,0,12186,0.76,0.59,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080,8936,0,0,16238,0.68,0.55,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31


#### 2. Remove duplicates and check for consistency in identifiers (PHU_ID, Agegroup, Date)

In [113]:
# CHECK FOR DUPLICATES: NO DUPLICATES

covid_df[covid_df.duplicated()]

Unnamed: 0,date,phu_id,phu_name,agegroup,at_least_one_dose_cumulative,second_dose_cumulative,fully_vaccinated_cumulative,third_dose_cumulative,total_population,percent_at_least_one_dose,percent_fully_vaccinated,percent_3doses,active_cases,resolved_cases,deaths,positive_cases_agegroup,positive_cases_total,cases_by_case_reported_agegrp,cases_by_test_reported_agegrp,cases_by_specimen_agegrp,cases_by_case_reported_total,cases_by_test_reported_total,cases_by_specimen_total,reporting_phu_city,reporting_phu_latitude,reporting_phu_longitude


#### 3. Handle missing values using imputation or removal as appropriate

In [115]:
#  CHECK MISSING VALUES

covid_df.isna().sum()

date                                  0
phu_id                                0
phu_name                              0
agegroup                              0
at_least_one_dose_cumulative          0
second_dose_cumulative                0
fully_vaccinated_cumulative           0
third_dose_cumulative                 0
total_population                      0
percent_at_least_one_dose             0
percent_fully_vaccinated              0
percent_3doses                        0
active_cases                     328280
resolved_cases                   328280
deaths                           328280
positive_cases_agegroup          537040
positive_cases_total             182612
cases_by_case_reported_agegrp    537040
cases_by_test_reported_agegrp    537040
cases_by_specimen_agegrp         537040
cases_by_case_reported_total     185724
cases_by_test_reported_total     191393
cases_by_specimen_total          185803
reporting_phu_city                15344
reporting_phu_latitude            15344


#### 4. Detect and treat outliers in case counts and vaccination data

In [117]:
# DETECTING AND HANDLING OUTLIERS USING IQR METHOD:


import pandas as pd
import numpy as np
# Work on a copy to avoid modifying the original DataFrame
covid_df_copy= covid_df.copy()
# Outlier handling – cap using IQR
for col in covid_df_copy.select_dtypes(include=[np.number]).columns:
   Q1 = covid_df[col].quantile(0.25)
   Q3 = covid_df[col].quantile(0.75)
   IQR = Q3 - Q1
   lower_bound = Q1 - 1.5 * IQR
   upper_bound = Q3 + 1.5 * IQR
   
   # Convert nullable Int64 columns to float64 to handle negative values
   if covid_df_copy[col].dtype == 'Int64':
       covid_df_copy[col] = covid_df_copy[col].astype('float64')
   
   covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound
   covid_df_copy.loc[covid_df_copy[col] > upper_bound, col] = upper_bound

print("✅ Missing values imputed and outliers capped on a safe copy.")

  covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound
  covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound
  covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound
  covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound
  covid_df_copy.loc[covid_df_copy[col] < lower_bound, col] = lower_bound


✅ Missing values imputed and outliers capped on a safe copy.


In [119]:
covid_df.head()

Unnamed: 0,date,phu_id,phu_name,agegroup,at_least_one_dose_cumulative,second_dose_cumulative,fully_vaccinated_cumulative,third_dose_cumulative,total_population,percent_at_least_one_dose,percent_fully_vaccinated,percent_3doses,active_cases,resolved_cases,deaths,positive_cases_agegroup,positive_cases_total,cases_by_case_reported_agegrp,cases_by_test_reported_agegrp,cases_by_specimen_agegrp,cases_by_case_reported_total,cases_by_test_reported_total,cases_by_specimen_total,reporting_phu_city,reporting_phu_latitude,reporting_phu_longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830,855,0,0,6672,0.27,0.13,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622,6222,0,0,15684,0.61,0.4,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125,6667,0,0,12699,0.72,0.52,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273,7236,0,0,12186,0.76,0.59,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080,8936,0,0,16238,0.68,0.55,0.0,4.0,395.0,6.0,,1.0,,,,,,,Sault Ste. Marie,46.53,-84.31


In [121]:
covid_df.dtypes

date                              object
phu_id                             int64
phu_name                          object
agegroup                          object
at_least_one_dose_cumulative       int64
second_dose_cumulative             int64
fully_vaccinated_cumulative        int64
third_dose_cumulative              int64
total_population                   int64
percent_at_least_one_dose        float64
percent_fully_vaccinated         float64
percent_3doses                   float64
active_cases                     float64
resolved_cases                   float64
deaths                           float64
positive_cases_agegroup           object
positive_cases_total             float64
cases_by_case_reported_agegrp     object
cases_by_test_reported_agegrp     object
cases_by_specimen_agegrp          object
cases_by_case_reported_total     float64
cases_by_test_reported_total     float64
cases_by_specimen_total          float64
reporting_phu_city                object
reporting_phu_la

In [58]:
len(covid_df["phu_id"].value_counts(dropna=False))

35

In [123]:
# 3 FORMATTING:  NUMERICAL DATA FORMATTING

# Convert phu_id to int
covid_df['active_cases'] = pd.to_numeric(covid_df['active_cases'], errors='coerce').astype('Int64')
covid_df['resolved_cases'] = pd.to_numeric(covid_df['resolved_cases'], errors='coerce').astype('Int64')
covid_df['deaths'] = pd.to_numeric(covid_df['deaths'], errors='coerce').astype('Int64')
covid_df['positive_cases_agegroup'] = pd.to_numeric(covid_df['positive_cases_agegroup'], errors='coerce').astype('Int64')
covid_df['positive_cases_total'] = pd.to_numeric(covid_df['positive_cases_total'], errors='coerce').astype('Int64')
covid_df['cases_by_case_reported_agegrp'] = pd.to_numeric(covid_df['cases_by_case_reported_agegrp'], errors='coerce').astype('Int64')
covid_df['cases_by_test_reported_agegrp'] = pd.to_numeric(covid_df['cases_by_test_reported_agegrp'], errors='coerce').astype('Int64')
covid_df['cases_by_specimen_agegrp'] = pd.to_numeric(covid_df['cases_by_specimen_agegrp'], errors='coerce').astype('Int64')
covid_df['cases_by_case_reported_total'] = pd.to_numeric(covid_df['cases_by_case_reported_total'], errors='coerce').astype('Int64')
covid_df['cases_by_test_reported_total'] = pd.to_numeric(covid_df['cases_by_test_reported_total'], errors='coerce').astype('Int64')
covid_df['cases_by_specimen_total'] = pd.to_numeric(covid_df['cases_by_specimen_total'], errors='coerce').astype('Int64')

In [62]:
covid_df.dtypes

date                              object
phu_id                             int64
phu_name                          object
agegroup                          object
at_least_one_dose_cumulative       int64
second_dose_cumulative             int64
fully_vaccinated_cumulative        int64
third_dose_cumulative              int64
total_population                   int64
percent_at_least_one_dose        float64
percent_fully_vaccinated         float64
percent_3doses                   float64
active_cases                       Int64
resolved_cases                     Int64
deaths                             Int64
positive_cases_agegroup            Int64
positive_cases_total               Int64
cases_by_case_reported_agegrp      Int64
cases_by_test_reported_agegrp      Int64
cases_by_specimen_agegrp           Int64
cases_by_case_reported_total       Int64
cases_by_test_reported_total       Int64
cases_by_specimen_total            Int64
reporting_phu_city                object
reporting_phu_la

In [125]:
len(covid_df["phu_id"].value_counts(dropna=False))

35

#### 5. Validate merged dataset ensuring PHU_ID, Date, and Agegroup align correctly. Not including this

### VALIDATE CLEANED DATA

In [127]:
#  CHECK MISSING VALUES

covid_df.isna().sum()

date                                  0
phu_id                                0
phu_name                              0
agegroup                              0
at_least_one_dose_cumulative          0
second_dose_cumulative                0
fully_vaccinated_cumulative           0
third_dose_cumulative                 0
total_population                      0
percent_at_least_one_dose             0
percent_fully_vaccinated              0
percent_3doses                        0
active_cases                     328280
resolved_cases                   328280
deaths                           328280
positive_cases_agegroup          537040
positive_cases_total             182612
cases_by_case_reported_agegrp    537040
cases_by_test_reported_agegrp    537040
cases_by_specimen_agegrp         537040
cases_by_case_reported_total     185724
cases_by_test_reported_total     191393
cases_by_specimen_total          185803
reporting_phu_city                15344
reporting_phu_latitude            15344


### EVALUATE UNSOLVABLE ISSUE

In [129]:
covid_df["phu_id"].value_counts(dropna=False)

phu_id
2226    15344
2263    15344
2256    15344
2257    15344
2258    15344
2260    15344
2261    15344
2262    15344
2265    15344
2253    15344
2266    15344
2268    15344
2270    15344
3895    15344
4913    15344
5183    15344
2255    15344
2251    15344
2227    15344
2238    15344
2230    15344
2233    15344
2234    15344
2235    15344
2236    15344
2237    15344
2240    15344
2249    15344
2241    15344
2242    15344
2243    15344
2244    15344
2246    15344
2247    15344
9999    15344
Name: count, dtype: int64

In [131]:
# phu_id of 9999 that seem to be data error was spotted but was converted to null which was kept as is

covid_df[covid_df["phu_id"] == 9999] = 0

In [133]:
covid_df['phu_id'] = pd.to_numeric(covid_df['phu_id'], errors='coerce').astype('Int64')

In [135]:
covid_df["phu_id"].value_counts(dropna=False)

phu_id
2226    15344
2263    15344
2256    15344
2257    15344
2258    15344
2260    15344
2261    15344
2262    15344
2265    15344
2253    15344
2266    15344
2268    15344
2270    15344
3895    15344
4913    15344
5183    15344
2255    15344
2251    15344
2227    15344
2238    15344
2230    15344
2233    15344
2234    15344
2235    15344
2236    15344
2237    15344
2240    15344
2249    15344
2241    15344
2242    15344
2243    15344
2244    15344
2246    15344
2247    15344
0       15344
Name: count, dtype: Int64

In [137]:
#  CHECK MISSING VALUES: all columns have multiples nulls that were kept as is.Get the percent of null data for documentation
# other columns that was 0 null now contains nulls after making the phu_1d 9999 null. They are kept as is 
covid_df.isna().sum()

date                                  0
phu_id                                0
phu_name                              0
agegroup                              0
at_least_one_dose_cumulative          0
second_dose_cumulative                0
fully_vaccinated_cumulative           0
third_dose_cumulative                 0
total_population                      0
percent_at_least_one_dose             0
percent_fully_vaccinated              0
percent_3doses                        0
active_cases                     312936
resolved_cases                   312936
deaths                           312936
positive_cases_agegroup          521696
positive_cases_total             167268
cases_by_case_reported_agegrp    521696
cases_by_test_reported_agegrp    521696
cases_by_specimen_agegrp         521696
cases_by_case_reported_total     170380
cases_by_test_reported_total     176049
cases_by_specimen_total          170459
reporting_phu_city                    0
reporting_phu_latitude                0


In [146]:
len(covid_df["phu_id"].value_counts(dropna=False))

35

In [141]:
covid_df["reporting_phu_city"].value_counts(dropna=False)

reporting_phu_city
Sault Ste. Marie    15344
New Liskeard        15344
Timmins             15344
Pembroke            15344
Cornwall            15344
Barrie              15344
Sudbury             15344
Thunder Bay         15344
Waterloo            15344
Mississauga         15344
Guelph              15344
Windsor             15344
Newmarket           15344
Toronto             15344
St. Thomas          15344
Stratford           15344
Peterborough        15344
Ottawa              15344
Brantford           15344
Belleville          15344
Whitby              15344
Owen Sound          15344
Simcoe              15344
Port Hope           15344
Oakville            15344
Hamilton            15344
Chatham             15344
Kenora              15344
Kingston            15344
Point Edward        15344
Brockville          15344
London              15344
Thorold             15344
North Bay           15344
0                   15344
Name: count, dtype: int64

In [143]:
covid_df.head()

Unnamed: 0,date,phu_id,phu_name,agegroup,at_least_one_dose_cumulative,second_dose_cumulative,fully_vaccinated_cumulative,third_dose_cumulative,total_population,percent_at_least_one_dose,percent_fully_vaccinated,percent_3doses,active_cases,resolved_cases,deaths,positive_cases_agegroup,positive_cases_total,cases_by_case_reported_agegrp,cases_by_test_reported_agegrp,cases_by_specimen_agegrp,cases_by_case_reported_total,cases_by_test_reported_total,cases_by_specimen_total,reporting_phu_city,reporting_phu_latitude,reporting_phu_longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830,855,0,0,6672,0.27,0.13,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622,6222,0,0,15684,0.61,0.4,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125,6667,0,0,12699,0.72,0.52,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273,7236,0,0,12186,0.76,0.59,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080,8936,0,0,16238,0.68,0.55,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31


In [92]:
covid_df.head()

Unnamed: 0,date,phu_id,phu_name,agegroup,at_least_one_dose_cumulative,second_dose_cumulative,fully_vaccinated_cumulative,third_dose_cumulative,total_population,percent_at_least_one_dose,percent_fully_vaccinated,percent_3doses,active_cases,resolved_cases,deaths,positive_cases_agegroup,positive_cases_total,cases_by_case_reported_agegrp,cases_by_test_reported_agegrp,cases_by_specimen_agegrp,cases_by_case_reported_total,cases_by_test_reported_total,cases_by_specimen_total,reporting_phu_city,reporting_phu_latitude,reporting_phu_longitude
0,2021-07-26,2226,ALGOMA DISTRICT,12-17yrs,1830.0,855.0,0.0,0.0,6672.0,0.27,0.13,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
1,2021-07-26,2226,ALGOMA DISTRICT,18-29yrs,9622.0,6222.0,0.0,0.0,15684.0,0.61,0.4,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
2,2021-07-26,2226,ALGOMA DISTRICT,30-39yrs,9125.0,6667.0,0.0,0.0,12699.0,0.72,0.52,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
3,2021-07-26,2226,ALGOMA DISTRICT,40-49yrs,9273.0,7236.0,0.0,0.0,12186.0,0.76,0.59,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31
4,2021-07-26,2226,ALGOMA DISTRICT,50-59yrs,11080.0,8936.0,0.0,0.0,16238.0,0.68,0.55,0.0,4,395,6,,1,,,,,,,Sault Ste. Marie,46.53,-84.31


In [145]:
# Export the clean data to my local computer

covid_df.to_excel("covid_19_clean.xlsx", index=False)

### STEP 3: INITIAL DATA ANALYSIS(EDA)

• Generate descriptive statistics for key variables (mean, median, variance). 

• Visualize distributions of cases and vaccinations (histograms, boxplots). 

• Identify initial trends and patterns in COVID-19 spread and vaccination coverage. 

• Highlight potential outliers in regional or demographic data.