In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

conn = sqlite3.connect('example.db')


def rsq(query: str) -> pd.DataFrame:
    return pd.read_sql_query(query, conn)


In [34]:
df = pd.read_csv('./ChronicKidneyDisease.csv')
df.to_sql('Kidney', conn, if_exists='replace', index=False)

400

In [4]:
rsq("""-- Calculate the average, minimum and maximum
SELECT AVG(DurationSeconds) AS Average, 
       MIN(DurationSeconds) AS Minimum, 
       MAX(DurationSeconds) AS Maximum
FROM Incidents;""")

Unnamed: 0,Average,Minimum,Maximum
0,5592.875783,0.08,10526400.0


In [6]:
rsq("""-- Calculate the aggregations by Shape
SELECT Shape,
       AVG(DurationSeconds) AS Average, 
       MIN(DurationSeconds) AS Minimum, 
       MAX(DurationSeconds) AS Maximum
FROM Incidents
GROUP BY Shape
-- Return records where minimum of DurationSeconds is greater than 1
having MIN(DurationSeconds) > 1""")

Unnamed: 0,Shape,Average,Minimum,Maximum
0,changing,3191.674419,2.0,172800.0
1,chevron,1100.59375,2.0,21600.0
2,crescent,10.0,10.0,10.0
3,cross,848.133333,2.0,7200.0
4,cylinder,795.241758,3.0,37800.0
5,egg,558.95614,1.5,7200.0
6,rectangle,969.613208,4.0,28800.0
7,teardrop,3501.685185,2.0,172800.0


In [11]:
rsq("""-- Return the specified columns
SELECT IncidentDateTime, IncidentState,count(*)
FROM Incidents
WHERE IncidentState IS NOT NULL
group by 2;
-- Exclude all the missing values from IncidentState;  """)

Unnamed: 0,IncidentDateTime,IncidentState,count(*)
0,1982-10-08 22:50:00.000,ab,29
1,2013-11-20 18:00:00.000,ak,30
2,2005-01-12 03:00:00.000,al,57
3,2013-10-08 02:00:00.000,ar,46
4,2005-10-31 19:00:00.000,az,213
...,...,...,...
60,2006-10-31 10:30:00.000,wa,342
61,2006-10-31 17:40:00.000,wi,122
62,2008-10-31 00:00:00.000,wv,44
63,2000-10-09 18:45:00.000,wy,14


In [14]:
rsq("""-- Check the IncidentState column for missing values and replace them with the City column
SELECT IncidentState, ISNULL(IncidentState, City) AS Location
FROM Incidents
-- Filter to only return missing values from IncidentState
WHERE IncidentState IS NULL""")

DatabaseError: Execution failed on sql '-- Check the IncidentState column for missing values and replace them with the City column
SELECT IncidentState, ISNULL(IncidentState, City) AS Location
FROM Incidents
-- Filter to only return missing values from IncidentState
WHERE IncidentState IS NULL': near "ISNULL": syntax error

In [17]:
rsq("""-- Check the IncidentState column for missing values and replace them with the City column
SELECT City, CASE WHEN IncidentState IS NULL THEN City ELSE IncidentState END AS Location, count(*)
FROM Incidents group by 2; """)

Unnamed: 0,City,Location,count(*)
0,fort mcmurray (canada),ab,29
1,abu dhabi (uae),abu dhabi (uae),1
2,accra (ghana),accra (ghana),1
3,ahmedabad-vadodra express highway. gujarat (in...,ahmedabad-vadodra express highway. gujarat (in...,1
4,kuparuk,ak,30
...,...,...,...
290,yokohama (japan),yokohama (japan),1
291,yukon territory (location undisclosed) (canada),yt,2
292,zalka (lebanon),zalka (lebanon),1
293,zama (japan),zama (japan),1


In [19]:
rsq("""
-- Replace missing values 
SELECT Country, COALESCE(IncidentState, City) AS Location , count(*)
FROM Incidents 
WHERE Country IS NULL group by 2; """)

Unnamed: 0,Country,Location,count(*)
0,,ab,4
1,,abu dhabi (uae),1
2,,accra (ghana),1
3,,ahmedabad-vadodra express highway. gujarat (in...,1
4,,ak,2
...,...,...,...
285,,yokohama (japan),1
286,,yt,1
287,,zalka (lebanon),1
288,,zama (japan),1


In [24]:
rsq("""SELECT Country, 
    CASE 
        WHEN Country = 'us' THEN 'USA'
        ELSE 'International'
     end AS SourceCountry,
    count(*)
FROM Incidents group by 2;""")

Unnamed: 0,Country,SourceCountry,count(*)
0,,International,1000
1,us,USA,5452


In [27]:
rsq("""-- -- Complete the syntax for cutting the duration into different cases
-- SELECT DurationSeconds, 
-- -- Start with the 2 TSQL keywords, and after the condition a TSQL word and a value
--       ___ ___ (DurationSeconds <= 120) ___ ___
-- -- The pattern repeats with the same keyword and after the condition the same word and next value          
--        ___ (DurationSeconds > 120 AND DurationSeconds <= 600) ___ ___
-- -- Use the same syntax here             
--        ___ (DurationSeconds > 601 AND DurationSeconds <= 1200) ___ ___
-- -- Use the same syntax here               
--        ___ (DurationSeconds > 1201 AND DurationSeconds <= 5000) ___ ___
-- -- Specify a value      
--        ELSE ___ 
--        END AS SecondGroup   
-- FROM Incidents

SELECT DurationSeconds, 
CASE 
      WHEN DurationSeconds <= 120 THEN '0-2 minutes'
      WHEN DurationSeconds > 120 AND DurationSeconds <= 600 THEN '2-10 minutes'
      WHEN DurationSeconds > 601 AND DurationSeconds <= 1200 THEN '10-20 minutes'
      WHEN DurationSeconds > 1201 AND DurationSeconds <= 5000 THEN '20-83 minutes'
      ELSE 'More than 83 minutes'
END AS SecondGroup,
count(*) 
FROM Incidents group by 2;""")

Unnamed: 0,DurationSeconds,SecondGroup,count(*)
0,5.0,0-2 minutes,3008
1,1200.0,10-20 minutes,588
2,600.0,2-10 minutes,1948
3,3600.0,20-83 minutes,633
4,37800.0,More than 83 minutes,275


In [30]:
rsq("""-- Complete the syntax for cutting the duration into different cases
SELECT DurationSeconds, 
-- Start with the 2 TSQL keywords, and after the condition a TSQL word and a value
       CASE WHEN (DurationSeconds <= 120) THEN 1
-- The pattern repeats with the same keyword and after the condition the same word and next value    
	   WHEN (DurationSeconds > 120 AND DurationSeconds <= 600) THEN 2
-- Use the same syntax here  
	   WHEN (DurationSeconds > 601 AND DurationSeconds <= 1200) THEN 3
-- Use the same syntax here 
	   WHEN (DurationSeconds > 1201 AND DurationSeconds <= 5000) THEN 4
-- Specify a value
       ELSE 5 
	   END AS SecondGroup,
    count(*)
FROM Incidents group by 2;""")

Unnamed: 0,DurationSeconds,SecondGroup,count(*)
0,5.0,1,3008
1,600.0,2,1948
2,1200.0,3,588
3,3600.0,4,633
4,37800.0,5,275
