### ETL to Dashboard: Using Python, PostgreSQL and Power BI to create a Covid Dashboard from up-to-date data

This project showcases my skills by building a Covid Dashboard from data published by the ECDC (European Centre for Disease Prevention and Control).

In [None]:
#Import essential libraries
import pandas as pd
import requests
import psycopg2
from sqlalchemy import create_engine

In [None]:
# SQLAlchemy engine for the local PostgreSQL database "CaseDB" (psycopg2 driver).
# NOTE(review): user and password are hard-coded in the URL below.
DB_URL = 'postgresql+psycopg2://postgres:postgres@localhost:5432/CaseDB'
engine = create_engine(DB_URL)

In [None]:
################# Getting Data and converting to Dataframe

def JsonUrlToDf(url, timeout=60):
    """Download a JSON document from ``url`` and load it into a DataFrame.

    Args:
        url: Address of an endpoint that returns JSON in a shape
            ``pd.DataFrame`` can consume.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it the request could block forever on a stalled server.

    Returns:
        pandas.DataFrame built from the decoded JSON payload.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    return pd.DataFrame(response.json())

def CsvToDf(url):
    """Read a CSV file into a DataFrame.

    Args:
        url: Anything ``pd.read_csv`` accepts as its first argument
            (path, URL or file-like object).

    Returns:
        pandas.DataFrame with the parsed CSV content.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    return pd.read_csv(url)

def FromSqltoDf(data, arg):
    """Run a SQL query (or read a table) and return the result as a DataFrame.

    Args:
        data: SQL query string or table name, forwarded to ``pd.read_sql``.
        arg: Database connection / engine used as ``con``.

    Returns:
        pandas.DataFrame with the query result.
    """
    # read_sql already returns a DataFrame, so no extra wrapping is needed.
    return pd.read_sql(data, con=arg)

################# SQL related functions

#SQL database connection

def DBInfo(dbengine, TableName, SchemaName, IfExists):
    """Bundle the target-table settings into the list layout used by ``ToSql``.

    Returns:
        list: ``[dbengine, TableName, SchemaName, IfExists]`` in that order.
    """
    target = [dbengine, TableName]
    target.extend((SchemaName, IfExists))
    return target

def ToSql(data, args):
    """Write ``data`` to a SQL table described by ``args``.

    Args:
        data: DataFrame to store.
        args: Sequence laid out as ``[connection, table name, schema,
            if_exists]`` (the layout produced by ``DBInfo``).
    """
    connection = args[0]
    table_name = args[1]
    schema_name = args[2]
    on_conflict = args[3]
    # The DataFrame index is not written as a column.
    data.to_sql(
        table_name,
        con=connection,
        schema=schema_name,
        if_exists=on_conflict,
        index=False,
    )

def QuerySql(sql, dbengine):
    """Execute ``sql`` against ``dbengine`` and return the rows as a DataFrame."""
    result = pd.read_sql(sql=sql, con=dbengine)
    return result

################# Covid Data ETL and Cleaning Pipelines

def CleaningCovidData(data):
    """Clean the raw covid DataFrame without modifying the caller's frame.

    Steps:
        * ``year_week`` strings have every ``-`` removed and are cast to
          ``int`` (e.g. ``"2020-05"`` becomes ``202005``).
        * The columns ``source``, ``country_code``, ``note`` and
          ``rate_14_day`` are dropped.

    Args:
        data: DataFrame containing at least ``year_week`` (strings) and the
            four columns listed above.

    Returns:
        A new pandas.DataFrame; ``data`` itself is left untouched.

    Raises:
        KeyError: If ``year_week`` or one of the dropped columns is missing.
    """
    # regex=False: the dash is a literal character, not a pattern.
    year_week = data['year_week'].str.replace('-', '', regex=False).astype(int)
    # assign() returns a copy, so the input frame is not mutated.
    cleaned = data.assign(year_week=year_week)
    return cleaned.drop(['source', 'country_code', 'note', 'rate_14_day'], axis=1)

def CreateCovidDataSql(data, args=(engine, "CovidData", "CaseSchema", "replace")):
    """Send the covid DataFrame to a PostgreSQL table.

    Args:
        data: DataFrame to store.
        args: Sequence laid out as ``(connection, table name, schema,
            if_exists)``. Defaults to table "CovidData" in schema
            "CaseSchema" on the module-level ``engine``, replacing any
            existing table. The default is a tuple so it cannot be mutated
            between calls.
    """
    connection, table_name, schema_name, on_conflict = (
        args[0], args[1], args[2], args[3]
    )
    # The DataFrame index is not written as a column.
    data.to_sql(
        table_name,
        con=connection,
        schema=schema_name,
        if_exists=on_conflict,
        index=False,
    )

def PipelineCreateCovidDataSql(data):
    """Run the covid ETL: download, clean, then store in PostgreSQL.

    Args:
        data: URL of the JSON source, handed to ``JsonUrlToDf``.

    Returns:
        Whatever ``CreateCovidDataSql`` returns.
    """
    raw_frame = JsonUrlToDf(data)                   # extract
    cleaned_frame = CleaningCovidData(raw_frame)    # transform
    return CreateCovidDataSql(cleaned_frame)        # load

################# Countries Data ETL and Cleaning Pipelines
def CleaningCountriesData(data):
    """Clean the countries DataFrame and keep only countries present in the covid data.

    Steps:
        * ``Country`` is cast to ``str`` and trailing whitespace is stripped
          (instead of blindly cutting the last character, which would damage
          names that have no trailing space).
        * ``'Czech Republic'`` is renamed to ``'Czechia'``, because the
          country changed its name after the dataset was created.
        * Rows are kept only when ``Country`` appears in the ``Country``
          column of table ``"CaseSchema"."VisualData"``.

    Args:
        data: DataFrame with a ``Country`` column.

    Returns:
        A new, re-indexed pandas.DataFrame; ``data`` itself is left untouched.

    NOTE(review): this reads "CaseSchema"."VisualData" through the
    module-level ``engine``; that table is written by the last statement in
    this file, so it has to exist already when this function runs.
    """
    country = (
        data['Country']
        .astype(str)
        .str.rstrip()
        .replace('Czech Republic', 'Czechia')
    )
    # assign() returns a copy, so the input frame is not mutated.
    cleaned = data.assign(Country=country)
    # Country names known to the covid data, used as the filter.
    cvd = FromSqltoDf('select * from "CaseSchema"."VisualData"', engine)
    keep = cleaned['Country'].isin(cvd['Country'])
    return cleaned.loc[keep].reset_index(drop=True)

def CreateCountriesDataSql(data, args=(engine, "CountriesData", "CaseSchema", "replace")):
    """Send the countries DataFrame to a PostgreSQL table.

    Args:
        data: DataFrame to store.
        args: Sequence laid out as ``(connection, table name, schema,
            if_exists)``. Defaults to table "CountriesData" in schema
            "CaseSchema" on the module-level ``engine``, replacing any
            existing table. The default is a tuple so it cannot be mutated
            between calls.
    """
    connection, table_name, schema_name, on_conflict = (
        args[0], args[1], args[2], args[3]
    )
    # The DataFrame index is not written as a column.
    data.to_sql(
        table_name,
        con=connection,
        schema=schema_name,
        if_exists=on_conflict,
        index=False,
    )

def PipelineCreateCountriesDataSql(data):
    """Run the countries ETL: read the CSV, clean it, then store in PostgreSQL.

    Args:
        data: Path or URL of the CSV source, handed to ``CsvToDf``.

    Returns:
        Whatever ``CreateCountriesDataSql`` returns.
    """
    raw_frame = CsvToDf(data)                          # extract
    cleaned_frame = CleaningCountriesData(raw_frame)   # transform
    return CreateCountriesDataSql(cleaned_frame)       # load

## Modeling the data to be used in the Power BI dashboard
def VisualData(data, arg):
    """Query the database and reshape the result into one row per country.

    The query result is pivoted on ``indicator`` with ``country`` and
    ``population`` as the index and ``IndicatorCountPer100k`` and
    ``cumulative_count`` as values; the resulting columns are then renamed
    positionally.

    Args:
        data: SQL query (or table name) passed to ``FromSqltoDf``.
        arg: Database connection / engine passed to ``FromSqltoDf``.

    Returns:
        pandas.DataFrame with columns ``Country``, ``Population``,
        ``CasesPer100k``, ``DeathsPer100k``, ``TotalCases``, ``TotalDeaths``.
    """
    long_frame = FromSqltoDf(data, arg)
    wide_frame = long_frame.pivot_table(
        index=['country', 'population'],
        columns='indicator',
        values=['IndicatorCountPer100k', 'cumulative_count'],
    )
    flat_frame = wide_frame.reset_index()
    # Positional rename: the pivot must yield exactly these six columns.
    output_columns = [
        'Country',
        'Population',
        'CasesPer100k',
        'DeathsPer100k',
        'TotalCases',
        'TotalDeaths',
    ]
    return flat_frame.set_axis(output_columns, axis=1)

In [None]:
# Load the ECDC covid data (JSON endpoint) into the Postgres table via the covid pipeline
COVID_JSON_URL = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/json"
PipelineCreateCovidDataSql(COVID_JSON_URL)

In [None]:
# Run the ETL pipeline on the "Countries of the World" dataset, available here:
# https://www.kaggle.com/datasets/fernandol/countries-of-the-world?select=countries+of+the+world.csv
COUNTRIES_CSV_PATH = 'datasource2.csv'
PipelineCreateCountriesDataSql(COUNTRIES_CSV_PATH)

In [None]:
##clean more of the countries data
## insert log in pipelines?
#FromSqltoDf('select * from "CaseSchema"."CountriesData"',engine)

In [None]:
## Enrich the data by comparing it with HDI / IHDI
#https://hdr.undp.org/data-center/documentation-and-downloads

In [None]:
## Build the table used by the Power BI Covid Dashboard:
## for each country, take the CovidData rows of the latest year_week that has both
## weekly_count and cumulative_count, add the cumulative count per 100k population,
## reshape them with VisualData and store the result as "CaseSchema"."VisualData".
SQL = 'select CD.*, (CD."cumulative_count"/CD."population"*100000) as "IndicatorCountPer100k" from "CaseSchema"."CovidData" CD INNER JOIN(SELECT "country", max("year_week") "maxdate" FROM "CaseSchema"."CovidData" WHERE "weekly_count" NOTNULL AND "cumulative_count" NOTNULL group by "country")latest ON latest."country" = CD."country" WHERE CD."year_week" = latest."maxdate" ORDER BY CD."country"'
visual_frame = VisualData(SQL, engine)
visual_target = DBInfo(engine, "VisualData", "CaseSchema", "replace")
ToSql(visual_frame, visual_target)