### ETL to Dashboard: Using Python, PostgreSQL and Power BI to create a Covid Dashboard using up-to-date data.

This project aims to showcase my skills by building a Covid Dashboard using data from the ECDC (European Centre for Disease Prevention and Control).

In [45]:
#Import essential libraries
import pandas as pd
import requests
import psycopg2
from sqlalchemy import create_engine

In [46]:
#Connection URL of the Postgres database (CaseDB on localhost) that stores the project tables
DATABASE_URL = 'postgresql+psycopg2://postgres:postgres@localhost:5432/CaseDB'
#Engine shared by the SQL helper functions defined below
engine = create_engine(DATABASE_URL)

In [47]:
################# Getting Data and converting to Dataframe

def JsonUrlToDf(url, timeout=60):
    """Download a JSON document over HTTP and load it into a DataFrame.

    Args:
        url: Address of the JSON resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``pandas.DataFrame`` built from the decoded JSON payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error body as data.
    response.raise_for_status()
    return pd.DataFrame(response.json())

def CsvToDf(url):
    """Read a CSV source into a DataFrame.

    Args:
        url: Path, URL or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` holding the parsed CSV content.
    """
    parsed_csv = pd.read_csv(url)
    return pd.DataFrame(parsed_csv)

def FromSqltoDf(data, arg):
    """Run a SQL query (or read a table) and return the rows as a DataFrame.

    Args:
        data: SQL text or table name passed to ``pandas.read_sql``.
        arg: Database connection or engine used to execute it.

    Returns:
        A ``pandas.DataFrame`` with the result set.
    """
    query_result = pd.read_sql(data, con=arg)
    return pd.DataFrame(query_result)

################# SQL related functions

#SQL database connection

def DBInfo(dbengine, TableName, SchemaName, IfExists):
    """Bundle the settings of a table write into one list.

    Returns:
        ``[dbengine, TableName, SchemaName, IfExists]`` -- the positional
        layout that ``ToSql`` reads by index.
    """
    settings = [dbengine]
    settings.extend((TableName, SchemaName, IfExists))
    return settings

def ToSql(data, args):
    """Write a DataFrame to a SQL table without its index.

    Args:
        data: DataFrame to persist.
        args: Sequence laid out as ``[connection, table name, schema,
            if_exists]``, e.g. the list produced by ``DBInfo``.
    """
    table_name = args[1]
    write_options = {
        "con": args[0],
        "schema": args[2],
        "if_exists": args[3],
        "index": False,
    }
    data.to_sql(table_name, **write_options)

def QuerySql(sql, dbengine):
    """Execute ``sql`` on ``dbengine`` and return what ``pandas.read_sql`` yields.

    Args:
        sql: SQL text or table name to read.
        dbengine: Database connection or engine to read from.
    """
    result = pd.read_sql(sql, con=dbengine)
    return result

################# Covid Data ETL and Cleaning Pipelines

def CleaningCovidData(data):
    """Return a cleaned copy of the Covid frame, leaving ``data`` untouched.

    The ``source``, ``country_code``, ``note`` and ``rate_14_day`` columns are
    removed, and ``year_week`` is converted from text to an integer by deleting
    every ``-`` (a value like ``'2020-01'`` becomes ``202001``).

    Args:
        data: DataFrame containing at least ``year_week`` (strings) and the
            four columns listed above.

    Returns:
        A new DataFrame; the caller's frame is not modified.

    Raises:
        KeyError: If ``year_week`` or one of the dropped columns is missing.
    """
    unused_columns = ['source', 'country_code', 'note', 'rate_14_day']
    # regex=False: '-' is meant literally, independent of the pandas default.
    year_week_as_int = (
        data['year_week'].str.replace('-', '', regex=False).astype(int)
    )
    # drop() returns a new frame, so assigning onto it never touches ``data``.
    return data.drop(unused_columns, axis=1).assign(year_week=year_week_as_int)

def CreateCovidDataSql(data, args=None):
    """Write ``data`` to a SQL table, by default ``"CaseSchema"."CovidData"``.

    Args:
        data: DataFrame to persist (written without its index).
        args: Optional sequence ``[connection, table name, schema,
            if_exists]``. When omitted, the module-level ``engine`` is used
            and the ``CovidData`` table in ``CaseSchema`` is replaced.
    """
    if args is None:
        # Built per call: avoids a shared mutable default and uses whatever
        # ``engine`` currently is, not the one bound at definition time.
        args = (engine, "CovidData", "CaseSchema", "replace")
    data.to_sql(args[1], con=args[0], schema=args[2], if_exists=args[3], index=False)

def PipelineCreateCovidDataSql(data):
    """Run the Covid ETL: fetch JSON, clean it, store it in the database.

    Args:
        data: URL of the JSON source handed to ``JsonUrlToDf``.

    Returns:
        Whatever ``CreateCovidDataSql`` returns for the cleaned frame.
    """
    raw_frame = JsonUrlToDf(data)
    cleaned_frame = CleaningCovidData(raw_frame)
    return CreateCovidDataSql(cleaned_frame)

################# Countries Data ETL and Cleaning Pipelines
def CleaningCountriesData(data):
    """Normalise the country column of the countries frame, in place.

    Renames ``Country`` to ``country``, converts the values to strings and
    removes the last character of each one. ``data`` itself is modified and
    returned.

    NOTE(review): the last character is dropped unconditionally -- presumably
    the source file has one trailing character (e.g. a space) after every
    name; confirm against the CSV.
    """
    data.rename(columns={"Country": "country"}, inplace=True)
    country_text = data['country'].astype(str)
    data['country'] = country_text.str.slice(0, -1)
    return data

def CreateCountriesDataSql(data, args=None):
    """Write ``data`` to a SQL table, by default ``"CaseSchema"."CountriesData"``.

    Args:
        data: DataFrame to persist (written without its index).
        args: Optional sequence ``[connection, table name, schema,
            if_exists]``. When omitted, the module-level ``engine`` is used
            and the ``CountriesData`` table in ``CaseSchema`` is replaced.
    """
    if args is None:
        # Built per call: avoids a shared mutable default and uses whatever
        # ``engine`` currently is, not the one bound at definition time.
        args = (engine, "CountriesData", "CaseSchema", "replace")
    data.to_sql(args[1], con=args[0], schema=args[2], if_exists=args[3], index=False)

def PipelineCreateCountriesDataSql(data):
    """Run the countries ETL: read CSV, clean it, store it in the database.

    Args:
        data: Path or URL of the CSV source handed to ``CsvToDf``.

    Returns:
        Whatever ``CreateCountriesDataSql`` returns for the cleaned frame.
    """
    raw_frame = CsvToDf(data)
    cleaned_frame = CleaningCountriesData(raw_frame)
    return CreateCountriesDataSql(cleaned_frame)

## Preparing data to be used in Power BI
def VisualData(data, arg):
    """Query the database and reshape the result into one row per country.

    The query result is pivoted on ``indicator`` with ``country`` and
    ``population`` as the row keys and ``IndicatorCountPer100k`` and
    ``cumulative_count`` as values; the index is then flattened and the
    columns are renamed positionally.

    Args:
        data: SQL text handed to ``FromSqltoDf``.
        arg: Database connection or engine handed to ``FromSqltoDf``.

    Returns:
        A DataFrame with the columns Country, Population, CasesPer100k,
        DeathsPer100k, TotalCases and TotalDeaths.

    NOTE(review): the positional rename assumes the pivot yields exactly four
    value columns, ordered cases-then-deaths within each value -- confirm
    against the indicator values present in the data.
    """
    output_columns = ['Country','Population','CasesPer100k','DeathsPer100k','TotalCases','TotalDeaths']
    queried_rows = FromSqltoDf(data, arg)
    per_indicator = queried_rows.pivot_table(
        index=['country', 'population'],
        columns='indicator',
        values=['IndicatorCountPer100k', 'cumulative_count'],
    )
    flattened = per_indicator.reset_index()
    return flattened.set_axis(output_columns, axis=1)

In [48]:
#Create Table in Postgres with Covid Data from url with json format
CovidDataUrl = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/json"
PipelineCreateCovidDataSql(CovidDataUrl)

In [49]:
#Create table in Postgres with countries data from local file with csv format
#TODO: get data directly from kaggle using API
CountriesCsvPath = "../DashboardCovid/datasource2.csv"
PipelineCreateCountriesDataSql(CountriesCsvPath)

In [50]:
## Enrich data comparing it with HDI / IHDI
#https://hdr.undp.org/data-center/documentation-and-downloads

CovidData will be used to make two visuals:

    1. Visual showing latest data and total cases (PBI Covid Dashboard)
    
    2. Visual showing evolution of cases and deaths and comparison with "Countries of The World" and "HDI" data, showing interesting patterns. (PBI Covid x Indexes)

In [51]:
## Query data from database and modeling it using python to create table data to be used in PBI Covid Dashboard
## For every country, keep the rows of its latest year_week that has both weekly and cumulative counts,
## and add the cumulative count per 100k inhabitants.
## The CAST forces floating point division: if both columns are stored as integers,
## PostgreSQL would truncate cumulative_count/population and the rate would be wrong.
SQL = (
    'select CD.*, '
    '(CAST(CD."cumulative_count" AS double precision)/CD."population"*100000) as "IndicatorCountPer100k" '
    'from "CaseSchema"."CovidData" CD '
    'INNER JOIN(SELECT "country", max("year_week") "maxdate" '
    'FROM "CaseSchema"."CovidData" '
    'WHERE "weekly_count" NOTNULL AND "cumulative_count" NOTNULL '
    'group by "country")latest '
    'ON latest."country" = CD."country" '
    'WHERE CD."year_week" = latest."maxdate" '
    'ORDER BY CD."country"'
)
## Store the reshaped result as "CaseSchema"."VisualData", replacing any previous version
ToSql(VisualData(SQL,engine),DBInfo(engine,"VisualData","CaseSchema","replace"))