C19 - SQLite

In [1]:
import numpy as np
import pandas as pd
import sqlite3 as sql

In [2]:
database = "c19.db"
connection = sql.connect(database)

In [7]:
query1 = '''SELECT * FROM covidDeaths'''
df = pd.read_sql_query(query1, connection)
df.head()

Unnamed: 0,iso_code,continent,location,date,population,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,...,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million
0,AFG,Asia,Afghanistan,24/02/2020,398354280.0,50.0,50.0,,,,...,,,,,,,,,,
1,AFG,Asia,Afghanistan,25/02/2020,398354280.0,50.0,0.0,,,,...,,,,,,,,,,
2,AFG,Asia,Afghanistan,26/02/2020,398354280.0,50.0,0.0,,,,...,,,,,,,,,,
3,AFG,Asia,Afghanistan,27/02/2020,398354280.0,50.0,0.0,,,,...,,,,,,,,,,
4,AFG,Asia,Afghanistan,28/02/2020,398354280.0,50.0,0.0,,,,...,,,,,,,,,,


In [8]:
query2 = '''SELECT * FROM covidVaccinations'''
df = pd.read_sql_query(query2, connection)
df.head()

Unnamed: 0,iso_code,continent,location,date,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,24/02/2020,,,,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
1,AFG,Asia,Afghanistan,25/02/2020,,,,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
2,AFG,Asia,Afghanistan,26/02/2020,,,,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
3,AFG,Asia,Afghanistan,27/02/2020,,,,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
4,AFG,Asia,Afghanistan,28/02/2020,,,,,,,...,,,37746.0,5.0,6483.0,511.0,,,,


In [25]:
# total cases vs total deaths

query3 = '''SELECT location, date, total_cases, total_deaths, ((total_deaths * 1.0) / total_cases)*100 as deathPercentage
FROM covidDeaths
WHERE location = 'Italy'
ORDER BY 1, 2'''
df = pd.read_sql_query(query3, connection)
df.head()

Unnamed: 0,location,date,total_cases,total_deaths,deathPercentage
0,Italy,01/01/2021,21293760,746210.0,3.50436
1,Italy,01/01/2022,62669390,1375130.0,2.194261
2,Italy,01/02/2020,20,,
3,Italy,01/02/2021,25609570,888450.0,3.469211
4,Italy,01/02/2022,111164220,1469250.0,1.321693


In [26]:
# total cases vs population

query4 = '''SELECT location, date, total_cases, population, ((total_cases * 1.0) / population)*100 as casesPopulation
FROM covidDeaths
WHERE location = 'Italy'
ORDER BY 1, 2'''
df = pd.read_sql_query(query4, connection)
df.head()

Unnamed: 0,location,date,total_cases,population,casesPopulation
0,Italy,01/01/2021,21293760,603674710,3.527357
1,Italy,01/01/2022,62669390,603674710,10.381318
2,Italy,01/02/2020,20,603674710,3e-06
3,Italy,01/02/2021,25609570,603674710,4.24228
4,Italy,01/02/2022,111164220,603674710,18.41459


In [27]:
# countries with highest infection rate compared to population

query5 = '''SELECT location, population, MAX(total_cases) AS highestInfection, MAX((total_cases * 1.0 / population))*100 AS percentagePopInfected
FROM covidDeaths
GROUP BY location, population
ORDER BY percentagePopInfected DESC'''
df = pd.read_sql_query(query5, connection)
df.head()

Unnamed: 0,location,population,highestInfection,percentagePopInfected
0,Faeroe Islands,490530.0,256250.0,52.239415
1,Andorra,773540.0,369890.0,47.817825
2,Gibraltar,336910.0,140390.0,41.669882
3,San Marino,340100.0,137880.0,40.541017
4,Slovenia,20787230.0,8162990.0,39.269253


In [28]:
# countries with highest death count per population

query6 = '''SELECT location, population, MAX(total_deaths) AS highestDeaths, MAX((total_deaths * 1.0 / population))*100 AS percentagePopDeath
FROM covidDeaths
GROUP BY location, population
ORDER BY percentagePopDeath DESC'''
df = pd.read_sql_query(query6, connection)
df.head()

Unnamed: 0,location,population,highestDeaths,percentagePopDeath
0,Peru,333594150.0,2075360.0,0.622121
1,Bulgaria,68966550.0,341520.0,0.495197
2,Bosnia and Herzegovina,32634590.0,148970.0,0.456479
3,Hungary,96341620.0,421700.0,0.437713
4,Montenegro,6280510.0,26220.0,0.417482


In [29]:
# countries with highest death count

query7 = '''SELECT location, MAX(total_deaths)
FROM covidDeaths
GROUP BY location
ORDER BY MAX(total_deaths) DESC'''
df = pd.read_sql_query(query7, connection)
df.head()

Unnamed: 0,location,MAX(total_deaths)
0,World,57779520.0
1,Upper middle income,23709270.0
2,High income,21170150.0
3,Europe,16521240.0
4,North America,13279330.0


In [30]:
# remove continents from result

query8 = '''SELECT location, MAX(total_deaths)
FROM covidDeaths
WHERE continent IS NOT NULL
GROUP BY location
ORDER BY MAX(total_deaths) DESC'''
df = pd.read_sql_query(query8, connection)
df.head()

Unnamed: 0,location,MAX(total_deaths)
0,United States,9122550.0
1,Brazil,6354210.0
2,India,5065200.0
3,Russia,3306090.0
4,Mexico,3106270.0


In [31]:
# continents with highest death count per population

query9 = '''SELECT continent, MAX(total_deaths) as totalDeathsCount
FROM covidDeaths
WHERE continent IS NOT NULL
GROUP BY continent
ORDER BY totalDeathsCount DESC'''
df = pd.read_sql_query(query9, connection)
df.head()

Unnamed: 0,continent,totalDeathsCount
0,North America,9122550
1,South America,6354210
2,Asia,5065200
3,Europe,3306090
4,Africa,965020


In [36]:
# global numbers

query10 = '''SELECT date, SUM(new_cases), SUM(new_deaths), SUM(new_deaths * 1.0)/SUM(new_cases) AS deathPercentage
FROM covidDeaths
WHERE continent IS NOT NULL
GROUP BY date
ORDER BY 2 DESC'''
df = pd.read_sql_query(query10, connection)
df.head()

Unnamed: 0,date,SUM(new_cases),SUM(new_deaths),deathPercentage
0,19/01/2022,42353180.0,104980.0,0.002479
1,21/01/2022,38238510.0,97750.0,0.002556
2,26/01/2022,37463160.0,108930.0,0.002908
3,25/01/2022,37302960.0,105300.0,0.002823
4,18/01/2022,37192450.0,91670.0,0.002465


In [33]:
# total

query11 = '''SELECT SUM(new_cases), SUM(new_deaths), SUM(new_deaths * 1.0)/SUM(new_cases) AS deathPercentage
FROM covidDeaths
WHERE continent IS NOT NULL'''
df = pd.read_sql_query(query11, connection)
df.head()

Unnamed: 0,SUM(new_cases),SUM(new_deaths),deathPercentage
0,4018437120,57513170,0.014312


In [34]:
# join the two tables

query12 = '''SELECT *
FROM covidDeaths AS cD
JOIN covidVaccinations AS cV
ON cD.location = cV.location AND cD.date = cV.date'''
df = pd.read_sql_query(query12, connection)
df.head()

Unnamed: 0,iso_code,continent,location,date,population,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,24/02/2020,398354280.0,50.0,50.0,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
1,AFG,Asia,Afghanistan,25/02/2020,398354280.0,50.0,0.0,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
2,AFG,Asia,Afghanistan,26/02/2020,398354280.0,50.0,0.0,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
3,AFG,Asia,Afghanistan,27/02/2020,398354280.0,50.0,0.0,,,,...,,,37746.0,5.0,6483.0,511.0,,,,
4,AFG,Asia,Afghanistan,28/02/2020,398354280.0,50.0,0.0,,,,...,,,37746.0,5.0,6483.0,511.0,,,,


In [39]:
# total population vs vaccinations

query13 = '''SELECT cD.continent, cD.location, cD.date, cD.population, cV.new_vaccinations
FROM covidDeaths AS cD
JOIN covidVaccinations AS cV
ON cD.location = cV.location AND cD.date = cV.date
WHERE cD.continent IS NOT NULL
ORDER BY 2'''
df = pd.read_sql_query(query13, connection)
df.head()

Unnamed: 0,continent,location,date,population,new_vaccinations
0,Asia,Afghanistan,24/02/2020,398354280.0,
1,Asia,Afghanistan,25/02/2020,398354280.0,
2,Asia,Afghanistan,26/02/2020,398354280.0,
3,Asia,Afghanistan,27/02/2020,398354280.0,
4,Asia,Afghanistan,28/02/2020,398354280.0,


In [40]:
# total population vs vaccinations

query13bis = '''SELECT cD.continent, cD.location, cD.date, cD.population, cV.new_vaccinations,
SUM(cV.new_vaccinations) OVER (PARTITION BY cD.location ORDER BY cD.location) AS rollingPeopleVaccinated
FROM covidDeaths AS cD
JOIN covidVaccinations AS cV
ON cD.location = cV.location AND cD.date = cV.date
WHERE cD.continent IS NOT NULL
ORDER BY 2'''
df = pd.read_sql_query(query13bis, connection)
df.head()

Unnamed: 0,continent,location,date,population,new_vaccinations,rollingPeopleVaccinated
0,Asia,Afghanistan,24/02/2020,398354280.0,,13742.0
1,Asia,Afghanistan,25/02/2020,398354280.0,,13742.0
2,Asia,Afghanistan,26/02/2020,398354280.0,,13742.0
3,Asia,Afghanistan,27/02/2020,398354280.0,,13742.0
4,Asia,Afghanistan,28/02/2020,398354280.0,,13742.0


In [41]:
# common table expression

query14 = '''WITH popVSvac (continent, location, date, population, new_vaccinations, rollingPeopleVaccinated)
AS
(
SELECT cD.continent, cD.location, cD.date, cD.population, cV.new_vaccinations,
SUM(cV.new_vaccinations) OVER (PARTITION BY cD.location ORDER BY cD.location, cD.date) AS rollingPeopleVaccinated
FROM covidDeaths AS cD
JOIN covidVaccinations AS cV
ON cD.location = cV.location AND cD.date = cV.date
WHERE cD.continent IS NOT NULL
)
SELECT *, (rollingPeopleVaccinated * 1.0 / population) * 100
FROM popVSvac'''
df = pd.read_sql_query(query14, connection)
df.head()

Unnamed: 0,continent,location,date,population,new_vaccinations,rollingPeopleVaccinated,(rollingPeopleVaccinated * 1.0 / population) * 100
0,Asia,Afghanistan,01/01/2021,398354280.0,,,
1,Asia,Afghanistan,01/01/2022,398354280.0,,,
2,Asia,Afghanistan,01/02/2021,398354280.0,,,
3,Asia,Afghanistan,01/02/2022,398354280.0,,,
4,Asia,Afghanistan,01/03/2020,398354280.0,,,
