In [63]:
%run 02-connect.ipynb

## SQL Date Format

| Function                                                            | Description                                                               |
| ------------------------------------------------------------------- | ------------------------------------------------------------------------- |
| [CURDATE](https://www.mysqltutorial.org/mysql-curdate/)             | Returns the current date.                                                 |
| [DATEDIFF](https://www.mysqltutorial.org/mysql-datediff.aspx)       | Calculates the number of days between two DATE values.                    |
| [DAY](https://www.mysqltutorial.org/mysql-day/)                     | Gets the day of the month of a specified date.                            |
| [DATE_ADD](https://www.mysqltutorial.org/mysql-date_add/)           | Adds a time value to date value.                                          |
| [DATE_SUB](https://www.mysqltutorial.org/mysql-date_sub/)           | Subtracts a time value from a date value.                                 |
| [DATE_FORMAT](https://www.mysqltutorial.org/mysql-date_format/)     | Formats a date value based on a specified date format.                    |
| [DAYNAME](https://www.mysqltutorial.org/mysql-dayname/)             | Gets the name of a weekday for a specified date.                          |
| [DAYOFWEEK](https://www.mysqltutorial.org/mysql-dayofweek/)         | Returns the weekday index for a date.                                     |
| [EXTRACT](https://www.mysqltutorial.org/mysql-extract/)             | Extracts a part of a date.                                                |
| [LAST_DAY](https://www.mysqltutorial.org/mysql-last_day/)           | Returns the last day of the month of a specified date                     |
| [NOW](https://www.mysqltutorial.org/mysql-now/)                     | Returns the current date and time at which the statement executed.        |
| [MONTH](https://www.mysqltutorial.org/mysql-month/)                 | Returns an integer that represents a month of a specified date.           |
| [STR_TO_DATE](https://www.mysqltutorial.org/mysql-str_to_date/)     | Converts a string into a date and time value based on a specified format. |
| [SYSDATE](https://www.mysqltutorial.org/mysql-sysdate/)             | Returns the current date.                                                 |
| [TIMEDIFF](https://www.mysqltutorial.org/mysql-timediff/)           | Calculates the difference between two TIME or DATETIME values.            |
| [TIMESTAMPDIFF](https://www.mysqltutorial.org/mysql-timestampdiff/) | Calculates the difference between two DATE or DATETIME values.            |
| [WEEK](https://www.mysqltutorial.org/mysql-week/)                   | Returns a week number of a date.                                          |
| [WEEKDAY](https://www.mysqltutorial.org/mysql-weekday/)             | Returns a weekday index for a date.                                       |
| [YEAR](https://www.mysqltutorial.org/mysql-year/)                   | Return the year for a specified date                                      |

In [18]:
%%sql

SELECT companies.permalink,
    companies.founded_at_clean,
    acquisitions.acquired_at_cleaned,
    DATEDIFF(acquisitions.acquired_at_cleaned, companies.founded_at_clean) AS time_to_acquisition
FROM crunchbase_companies_clean_date_preprocessed companies
    JOIN crunchbase_acquisitions_clean_date_preprocessed acquisitions 
        ON acquisitions.company_permalink = companies.permalink
WHERE founded_at_clean IS NOT NULL
LIMIT 10

Unnamed: 0,permalink,founded_at_clean,acquired_at_cleaned,time_to_acquisition
0,/company/waywire,2012-06-01,2013-10-17,10416000000
1,/company/1000memories,2010-07-01,2012-10-03,20302000000
2,/company/12society,2012-01-01,2013-07-03,10602000000
3,/company/280-north,NaT,2010-07-01,20100701000000
4,/company/280-north,NaT,2010-07-01,20100701000000
5,/company/2web-technologies,NaT,2006-06-01,20060601000000
6,/company/3leaf,2004-06-01,2011-02-19,69618000000
7,/company/3x-systems,2007-11-01,2012-10-25,49924000000
8,/company/4home,2006-01-01,2010-12-01,41100000000
9,/company/5to1,2009-01-01,2011-05-01,20400000000


In [None]:
%%sql

SELECT companies.permalink,
    companies.founded_at_clean,
    acquisitions.acquired_at_cleaned,
    DATEDIFF(acquisitions.acquired_at_cleaned, companies.founded_at_clean) AS time_to_acquisition,
    DATE_SUB(acquisitions.acquired_at_cleaned, INTERVAL 1 YEAR) AS year_before_acquired,
    DATE_ADD(acquisitions.acquired_at_cleaned, INTERVAL 1 YEAR) AS year_after_acquired
FROM crunchbase_companies_clean_date_preprocessed companies
    JOIN crunchbase_acquisitions_clean_date_preprocessed acquisitions 
        ON acquisitions.company_permalink = companies.permalink
WHERE founded_at_clean IS NOT NULL
LIMIT 10

In [23]:
%%sql

SELECT companies.permalink,
    companies.founded_at_clean,
    NOW() - companies.founded_at_clean AS founded_time_ago
FROM crunchbase_companies_clean_date_preprocessed companies
WHERE founded_at_clean IS NOT NULL
LIMIT 10

Unnamed: 0,permalink,founded_at_clean,founded_time_ago
0,/company/8868,NaT,20221231085611
1,/company/21e6,2013-01-01,91130085611
2,/company/club-domains,2011-10-10,110221085611
3,/company/fox-networks,NaT,20221231085611
4,/company/a-list-games,NaT,20221231085611
5,/company/pay-mobile-checkout,2011-05-01,110730085611
6,/company/tv-communications,NaT,20221231085611
7,/company/waywire,2012-06-01,100630085611
8,/company/0-6-com,2007-01-01,151130085611
9,/company/0xdata,2011-01-01,111130085611


## Using SQL String Functions to Clean Data

| Name                                                                                                    | Description                                                                              |
| ------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
| [CONCAT](https://www.mysqltutorial.org/sql-concat-in-mysql.aspx)                                        | Concatenate two or more strings into a single string                                     |
| [INSTR](https://www.mysqltutorial.org/mysql-instr/)                                                     | Return the position of the first occurrence of a substring in a string                   |
| [LENGTH](https://www.mysqltutorial.org/mysql-string-length/)                                            | Get the length of a string in bytes and in characters                                    |
| [LEFT](https://www.mysqltutorial.org/mysql-left-function/)                                              | Get a specified number of leftmost characters from a string                              |
| [LOWER](https://www.mysqltutorial.org/mysql-string-functions/mysql-lower/)                              | Convert a string to lowercase                                                            |
| [LTRIM](https://www.mysqltutorial.org/mysql-string-functions/mysql-ltrim-function/)                     | Remove all leading spaces from a string                                                  |
| [REPLACE](https://www.mysqltutorial.org/mysql-string-replace-function.aspx)                             | Search and replace a substring in a string                                               |
| [RIGHT](https://www.mysqltutorial.org/mysql-string-functions/mysql-right-function/)                     | Get a specified number of rightmost characters from a string                             |
| [RTRIM](https://www.mysqltutorial.org/mysql-string-functions/mysql-rtrim-function/)                     | Remove all trailing spaces from a string                                                 |
| [SUBSTRING](https://www.mysqltutorial.org/mysql-substring.aspx)                                         | Extract a substring starting from a position with a specific length.                     |
| [SUBSTRING_INDEX](https://www.mysqltutorial.org/mysql-string-functions/mysql-substring_index-function/) | Return a substring from a string before a specified number of occurrences of a delimiter |
| [TRIM](https://www.mysqltutorial.org/mysql-trim/)                                                       | Remove unwanted characters from a string.                                                |
| [FIND_IN_SET](https://www.mysqltutorial.org/mysql-find_in_set/)                                         | Find a string within a comma-separated list of strings                                   |
| [FORMAT](https://www.mysqltutorial.org/mysql-format-function/)                                          | Format a number with a specific locale, rounded to the number of decimals                |
| [UPPER](https://www.mysqltutorial.org/mysql-string-functions/mysql-upper/)                              | Convert a string to uppercase                                                            |

In [26]:
%%sql

SELECT incidnt_num,
    date,
    LEFT(date, 10) AS cleaned_date
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,date,cleaned_date
0,140099416.0,2014-01-31 08:00:00,2014-01-31
1,140092426.0,2014-01-31 08:00:00,2014-01-31
2,140092410.0,2014-01-31 08:00:00,2014-01-31
3,140092341.0,NaT,0000-00-00
4,140092573.0,2014-01-31 08:00:00,2014-01-31
5,146027306.0,2014-01-31 08:00:00,2014-01-31
6,140092288.0,2014-01-31 08:00:00,2014-01-31
7,140092727.0,2014-01-31 08:00:00,2014-01-31
8,140092874.0,2014-01-31 08:00:00,2014-01-31
9,140092830.0,NaT,0000-00-00


In [28]:
%%sql

SELECT incidnt_num,
    date,
    LEFT(date, 10) AS cleaned_date,
    RIGHT(date, 8) AS cleaned_time
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,date,cleaned_date,cleaned_time
0,140099416.0,2014-01-31 08:00:00,2014-01-31,08:00:00
1,140092426.0,2014-01-31 08:00:00,2014-01-31,08:00:00
2,140092410.0,2014-01-31 08:00:00,2014-01-31,08:00:00
3,140092341.0,NaT,0000-00-00,00:00:00
4,140092573.0,2014-01-31 08:00:00,2014-01-31,08:00:00
5,146027306.0,2014-01-31 08:00:00,2014-01-31,08:00:00
6,140092288.0,2014-01-31 08:00:00,2014-01-31,08:00:00
7,140092727.0,2014-01-31 08:00:00,2014-01-31,08:00:00
8,140092874.0,2014-01-31 08:00:00,2014-01-31,08:00:00
9,140092830.0,NaT,0000-00-00,00:00:00


In [29]:
%%sql

SELECT incidnt_num,
    date,
    LEFT(date, 10) AS cleaned_date,
    RIGHT(date, LENGTH(date) - 11) AS cleaned_time
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,date,cleaned_date,cleaned_time
0,140099416.0,2014-01-31 08:00:00,2014-01-31,08:00:00
1,140092426.0,2014-01-31 08:00:00,2014-01-31,08:00:00
2,140092410.0,2014-01-31 08:00:00,2014-01-31,08:00:00
3,140092341.0,NaT,0000-00-00,00:00:00
4,140092573.0,2014-01-31 08:00:00,2014-01-31,08:00:00
5,146027306.0,2014-01-31 08:00:00,2014-01-31,08:00:00
6,140092288.0,2014-01-31 08:00:00,2014-01-31,08:00:00
7,140092727.0,2014-01-31 08:00:00,2014-01-31,08:00:00
8,140092874.0,2014-01-31 08:00:00,2014-01-31,08:00:00
9,140092830.0,NaT,0000-00-00,00:00:00


In [37]:
%%sql

SELECT location,
    TRIM(
        both '"('
        FROM location
    )
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,location,"TRIM(\n both '""('\n FROM location\n )"
0,"""(37.709725805163",37.709725805163
1,37.7154876086057,37.7154876086057
2,"""(37.7686887134351",37.7686887134351
3,-122.412527239682,-122.412527239682
4,37.7750814399634,37.7750814399634
5,"""(37.7716335058168",37.7716335058168
6,"""(37.7798376142327",37.7798376142327
7,"""(37.7940182573369",37.7940182573369
8,37.7850491022697,37.7850491022697
9,-122.403595293514,-122.403595293514


In [38]:
%%sql

SELECT incidnt_num,
    descript,
    POSITION('A' IN descript) AS a_position
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,descript,a_position
0,140099416.0,STOLEN AND RECOVERED VEHICLE,8
1,140092426.0,BATTERY,2
2,140092410.0,SUSPICIOUS OCCURRENCE,0
3,140092341.0,"""DRIVERS LICENSE",0
4,140092573.0,POSSESSION OF NARCOTICS PARAPHERNALIA,16
5,146027306.0,GRAND THEFT FROM LOCKED AUTO,3
6,140092288.0,GRAND THEFT FROM LOCKED AUTO,3
7,140092727.0,BATTERY,2
8,140092874.0,PETTY THEFT SHOPLIFTING,0
9,140092830.0,"""DRIVERS LICENSE",0


In [41]:
%%sql

SELECT incidnt_num,
    date,
    SUBSTR(date, 9, 2) AS day
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,date,day
0,140099416.0,2014-01-31 08:00:00,31
1,140092426.0,2014-01-31 08:00:00,31
2,140092410.0,2014-01-31 08:00:00,31
3,140092341.0,NaT,0
4,140092573.0,2014-01-31 08:00:00,31
5,146027306.0,2014-01-31 08:00:00,31
6,140092288.0,2014-01-31 08:00:00,31
7,140092727.0,2014-01-31 08:00:00,31
8,140092874.0,2014-01-31 08:00:00,31
9,140092830.0,NaT,0


In [42]:
%%sql

SELECT incidnt_num,
    day_of_week,
    LEFT(date, 10) AS cleaned_date,
    CONCAT(day_of_week, ', ', LEFT(date, 10)) AS day_and_date
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,day_of_week,cleaned_date,day_and_date
0,140099416.0,Friday,2014-01-31,"Friday, 2014-01-31"
1,140092426.0,Friday,2014-01-31,"Friday, 2014-01-31"
2,140092410.0,Friday,2014-01-31,"Friday, 2014-01-31"
3,140092341.0,"SUSPENDED OR REVOKED""",0000-00-00,"SUSPENDED OR REVOKED"", 0000-00-00"
4,140092573.0,Friday,2014-01-31,"Friday, 2014-01-31"
5,146027306.0,Friday,2014-01-31,"Friday, 2014-01-31"
6,140092288.0,Friday,2014-01-31,"Friday, 2014-01-31"
7,140092727.0,Friday,2014-01-31,"Friday, 2014-01-31"
8,140092874.0,Friday,2014-01-31,"Friday, 2014-01-31"
9,140092830.0,"SUSPENDED OR REVOKED""",0000-00-00,"SUSPENDED OR REVOKED"", 0000-00-00"


In [44]:
%%sql

SELECT incidnt_num,
    address,
    UPPER(address) AS address_upper,
    LOWER(address) AS address_lower
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,incidnt_num,address,address_upper,address_lower
0,140099416.0,0 Block of GARRISON AV,0 BLOCK OF GARRISON AV,0 block of garrison av
1,140092426.0,"CITED""","CITED""","cited"""
2,140092410.0,0 Block of CASTRO ST,0 BLOCK OF CASTRO ST,0 block of castro st
3,140092341.0,"""ARREST","""ARREST","""arrest"
4,140092573.0,"BOOKED""","BOOKED""","booked"""
5,146027306.0,0 Block of MCCOPPIN ST,0 BLOCK OF MCCOPPIN ST,0 block of mccoppin st
6,140092288.0,400 Block of 6TH AV,400 BLOCK OF 6TH AV,400 block of 6th av
7,140092727.0,500 Block of SACRAMENTO ST,500 BLOCK OF SACRAMENTO ST,500 block of sacramento st
8,140092874.0,"CITED""","CITED""","cited"""
9,140092830.0,"""ARREST","""ARREST","""arrest"


In [47]:
%%sql

SELECT date,
       EXTRACT(YEAR FROM date) AS year,
       EXTRACT(MONTH  FROM date) AS month,
       EXTRACT(DAY  FROM date) AS day,
       EXTRACT(HOUR  FROM date) AS hour,
       EXTRACT(MINUTE  FROM date) AS minute,
       EXTRACT(SECOND  FROM date) AS second
FROM sf_crime_incidents_2014_01_preprocessed
LIMIT 10

Unnamed: 0,date,year,month,day,hour,minute,second
0,2014-01-31 08:00:00,2014,1,31,8,0,0
1,2014-01-31 08:00:00,2014,1,31,8,0,0
2,2014-01-31 08:00:00,2014,1,31,8,0,0
3,NaT,0,0,0,0,0,0
4,2014-01-31 08:00:00,2014,1,31,8,0,0
5,2014-01-31 08:00:00,2014,1,31,8,0,0
6,2014-01-31 08:00:00,2014,1,31,8,0,0
7,2014-01-31 08:00:00,2014,1,31,8,0,0
8,2014-01-31 08:00:00,2014,1,31,8,0,0
9,NaT,0,0,0,0,0,0


In [50]:
%%sql

SELECT incidnt_num,
    descript,
    COALESCE(descript, 'No Description')
FROM sf_crime_incidents_2014_01_preprocessed
ORDER BY descript DESC
LIMIT 10

Unnamed: 0,incidnt_num,descript,"COALESCE(descript, 'No Description')"
0,130966035.0,WILLFUL CRUELTY TO CHILD,WILLFUL CRUELTY TO CHILD
1,130973109.0,WILLFUL CRUELTY TO CHILD,WILLFUL CRUELTY TO CHILD
2,131009814.0,WILLFUL CRUELTY TO CHILD,WILLFUL CRUELTY TO CHILD
3,131015247.0,WILLFUL CRUELTY TO CHILD,WILLFUL CRUELTY TO CHILD
4,131004290.0,WARRANT ARREST,WARRANT ARREST
5,130994014.0,WARRANT ARREST,WARRANT ARREST
6,131008032.0,WARRANT ARREST,WARRANT ARREST
7,131013235.0,WARRANT ARREST,WARRANT ARREST
8,130995197.0,WARRANT ARREST,WARRANT ARREST
9,130997256.0,WARRANT ARREST,WARRANT ARREST


## SQL Subqueries

In [57]:
%%sql

SELECT sub.*
FROM (
        SELECT *
        FROM sf_crime_incidents_2014_01_preprocessed
        WHERE day_of_week = 'Friday'
    ) sub
WHERE sub.resolution = 'NONE'
LIMIT 10

Unnamed: 0,incidnt_num,category,descript,day_of_week,date,time,pd_district,resolution,address,lon,lat,location,id
0,140099416.0,VEHICLE THEFT,STOLEN AND RECOVERED VEHICLE,Friday,2014-01-31 08:00:00,17:00,INGLESIDE,NONE,0 Block of GARRISON AV,-122.413624,37.709726,"""(37.709725805163",-122
1,140092410.0,SUSPICIOUS OCC,SUSPICIOUS OCCURRENCE,Friday,2014-01-31 08:00:00,15:30,PARK,NONE,0 Block of CASTRO ST,-122.435719,37.768689,"""(37.7686887134351",-122
2,146027306.0,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,2014-01-31 08:00:00,17:25,SOUTHERN,NONE,0 Block of MCCOPPIN ST,-122.421325,37.771634,"""(37.7716335058168",-122
3,140092288.0,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,2014-01-31 08:00:00,14:00,RICHMOND,NONE,400 Block of 6TH AV,-122.464338,37.779838,"""(37.7798376142327",-122
4,140092727.0,ASSAULT,BATTERY,Friday,2014-01-31 08:00:00,20:00,CENTRAL,NONE,500 Block of SACRAMENTO ST,-122.401338,37.794018,"""(37.7940182573369",-122
5,140092818.0,ASSAULT,BATTERY,Friday,2014-01-31 08:00:00,21:06,INGLESIDE,NONE,0 Block of AMETHYST WY,-122.446067,37.746115,"""(37.7461152780528",-122
6,140092200.0,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,2014-01-31 08:00:00,09:00,RICHMOND,NONE,500 Block of JOHNFKENNEDY DR,-122.465838,37.772497,"""(37.7724965522266",-122
7,140092125.0,NON-CRIMINAL,FOUND PROPERTY,Friday,2014-01-31 08:00:00,14:30,PARK,NONE,1800 Block of WALLER ST,-122.454878,37.768147,"""(37.7681470009312",-122
8,140092040.0,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,2014-01-31 08:00:00,11:00,RICHMOND,NONE,300 Block of MARTIN LUTHER KING JR DR,-122.464414,37.766456,"""(37.7664562802319",-122
9,140091785.0,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Friday,2014-01-31 08:00:00,14:00,CENTRAL,NONE,BROADWAY ST / CORDELIA ST,-122.409044,37.797573,"""(37.7975730481122",-122


What if you wanted to figure out how many incidents get reported on each day of the week? Better yet, what if you wanted to know how many incidents happen, on average, on a Friday in December? In January? There are two steps to this process: counting the number of incidents each day (inner query), then determining the monthly average (outer query):

In [58]:
%%sql

SELECT LEFT(sub.date, 2) AS cleaned_month,
    sub.day_of_week,
    AVG(sub.incidents) AS average_incidents
FROM (
        SELECT day_of_week,
            date,
            COUNT(incidnt_num) AS incidents
        FROM sf_crime_incidents_2014_01_preprocessed
        GROUP BY 1,2
    ) sub
GROUP BY 1,2
ORDER BY 1,2
LIMIT 10

Unnamed: 0,cleaned_month,day_of_week,average_incidents
0,0,"ADULT VICTIM""",26.0
1,0,"ARMED WITH A DANGEROUS WEAPON""",17.0
2,0,"ARMED WITH A GUN""",55.0
3,0,"ARMED WITH A KNIFE""",27.0
4,0,"ATT FORCIBLE ENTRY""",13.0
5,0,"ATTEMPTED FORCIBLE ENTRY""",53.0
6,0,"ATTEMPTED""",2.0
7,0,"BODILY FORCE""",192.0
8,0,"BREAKING WINDOWS WITH BB GUN""",5.0
9,0,"BREAKING WINDOWS""",186.0


If you're having trouble figuring out what's happening, try running the inner query individually to get a sense of what its results look like. In general, it's easiest to write inner queries first and revise them until the results make sense to you, then to move on to the outer query.

In [59]:
%%sql

SELECT *
FROM sf_crime_incidents_2014_01_preprocessed
WHERE Date = (
        SELECT MIN(date)
        FROM sf_crime_incidents_2014_01_preprocessed
    )
LIMIT 10

Unnamed: 0,incidnt_num,category,descript,day_of_week,date,time,pd_district,resolution,address,lon,lat,location,id
0,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38
1,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38
2,140092777.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:10,TARAVAL,"""ARREST",0.0,0.0,-122.50022040363,38
3,140091848.0,NON-CRIMINAL,"""DEATH REPORT","CAUSE UNKNOWN""",,2014-01-31 08:00:00+00:00,14:48,NORTHERN,NONE,100.0,-122.420676,37.7762540028806,0
4,140091503.0,ROBBERY,"""ROBBERY","BODILY FORCE""",,2014-01-31 08:00:00+00:00,12:20,SOUTHERN,NONE,800.0,-122.403743,37.7752316978114,0
5,140091177.0,NON-CRIMINAL,"""DEATH REPORT","CAUSE UNKNOWN""",,2014-01-31 08:00:00+00:00,00:01,TARAVAL,NONE,1400.0,-122.50801,37.7589334708659,0
6,140090884.0,NON-CRIMINAL,"""AIDED CASE","MENTAL DISTURBED""",,2014-01-31 08:00:00+00:00,07:47,SOUTHERN,PSYCHOPATHIC CASE,0.0,-122.409574,37.7816102568731,0
7,140090812.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,06:25,CENTRAL,"""ARREST",0.0,0.0,-122.401319002979,38
8,140090743.0,NON-CRIMINAL,"""AIDED CASE","MENTAL DISTURBED""",,2014-01-31 08:00:00+00:00,03:52,MISSION,PSYCHOPATHIC CASE,0.0,-122.434089,37.7513050185724,0
9,140090680.0,ROBBERY,"""ROBBERY","ARMED WITH A GUN""",,2014-01-31 08:00:00+00:00,02:30,MISSION,NONE,18.0,-122.41936,37.7618358012376,0


The above query works because the result of the subquery is only one cell. Most conditional logic will work with subqueries containing one-cell results. However, IN is the only type of conditional logic that will work when the inner query contains multiple results:

In [61]:
%%sql

SELECT *
FROM sf_crime_incidents_2014_01_preprocessed incidents
    JOIN (
        SELECT date
        FROM sf_crime_incidents_2014_01_preprocessed
        ORDER BY date
        LIMIT 5
    ) sub ON incidents.date = sub.date
LIMIT 10

Unnamed: 0,incidnt_num,category,descript,day_of_week,date,time,pd_district,resolution,address,lon,lat,location,id,date.1
0,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38,
1,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38,
2,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38,
3,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38,
4,140092341.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,17:50,CENTRAL,"""ARREST",0.0,0.0,-122.412527239682,38,
5,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38,
6,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38,
7,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38,
8,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38,
9,140092830.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2014-01-31 08:00:00+00:00,21:35,BAYVIEW,"""ARREST",0.0,0.0,-122.403595293514,38,


In [62]:
%%sql

SELECT incidents.*,
    sub.incidents AS incidents_that_day
FROM sf_crime_incidents_2014_01_preprocessed incidents
    JOIN (
        SELECT date,
            COUNT(incidnt_num) AS incidents
        FROM sf_crime_incidents_2014_01_preprocessed
        GROUP BY 1
    ) sub ON incidents.date = sub.date
ORDER BY sub.incidents DESC,
    time
LIMIT 10

Unnamed: 0,incidnt_num,category,descript,day_of_week,date,time,pd_district,resolution,address,lon,lat,location,id,incidents_that_day
0,130927758.0,VANDALISM,"""MALICIOUS MISCHIEF","VANDALISM""",,2013-11-01 07:00:00+00:00,11:00,CENTRAL,NONE,0.0,-122.394557,37.7956312827128,0,7173
1,130928900.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2013-11-01 07:00:00+00:00,18:30,PARK,"""ARREST",0.0,0.0,-122.438928631199,38,7173
2,130986209.0,SUSPICIOUS OCC,"""SUSPICIOUS AUTO","POSSIBLY SEX""",,2013-11-01 07:00:00+00:00,12:00,RICHMOND,EXCEPTIONAL CLEARANCE,5300.0,-122.476545,37.780351537521,0,7173
3,130928994.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2013-11-01 07:00:00+00:00,19:10,TARAVAL,"""ARREST",0.0,0.0,-122.467092240422,38,7173
4,130926556.0,ROBBERY,"""ROBBERY","BODILY FORCE""",,2013-11-01 07:00:00+00:00,01:34,MISSION,NONE,16.0,-122.420902,37.7649778918035,0,7173
5,130929475.0,NON-CRIMINAL,"""AIDED CASE","MENTAL DISTURBED""",,2013-11-01 07:00:00+00:00,21:55,SOUTHERN,PSYCHOPATHIC CASE,0.0,-122.407938,37.7826463997332,0,7173
6,130926738.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2013-11-01 07:00:00+00:00,02:00,CENTRAL,"""ARREST",0.0,0.0,-122.402042584193,38,7173
7,130927861.0,NON-CRIMINAL,"""AIDED CASE","INJURED PERSON""",,2013-11-01 07:00:00+00:00,12:28,MISSION,COMPLAINANT REFUSES TO PROSECUTE,0.0,-122.436362,37.7619303468065,0,7173
8,130929102.0,ROBBERY,"""ROBBERY","BODILY FORCE""",,2013-11-01 07:00:00+00:00,19:35,INGLESIDE,NONE,0.0,-122.441087,37.7182929112555,0,7173
9,130926459.0,OTHER OFFENSES,"""DRIVERS LICENSE","SUSPENDED OR REVOKED""",,2013-11-01 07:00:00+00:00,01:10,CENTRAL,"""ARREST",0.0,600.0,-122.413955505125,38,7173


## SQL Window Functions

| Name                                                                                              | Description                                                                                                                                                                                                                                   |
| ------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [CUME_DIST](https://www.mysqltutorial.org/mysql-window-functions/mysql-cume_dist-function/)       | Calculates the cumulative distribution of a value in a set of values.                                                                                                                                                                         |
| [DENSE_RANK](https://www.mysqltutorial.org/mysql-window-functions/mysql-dense_rank-function/)     | Assigns a rank to every row within its partition based on the `ORDER BY` clause. It assigns the same rank to the rows with equal values. If two or more rows have the same rank, then there will be no gaps in the sequence of ranked values. |
| [FIRST_VALUE](https://www.mysqltutorial.org/mysql-window-functions/mysql-first_value-function/)   | Returns the value of the specified expression with respect to the first row in the window frame.                                                                                                                                              |
| [LAG](https://www.mysqltutorial.org/mysql-window-functions/mysql-lag-function/)                   | Returns the value of the Nth row before the current row in a partition. It returns NULL if no preceding row exists.                                                                                                                           |
| [LAST_VALUE](https://www.mysqltutorial.org/mysql-window-functions/mysql-last_value-function/)     | Returns the value of the specified expression with respect to the last row in the window frame.                                                                                                                                               |
| [LEAD](https://www.mysqltutorial.org/mysql-window-functions/mysql-lead-function/)                 | Returns the value of the Nth row after the current row in a partition. It returns NULL if no subsequent row exists.                                                                                                                           |
| [NTH_VALUE](https://www.mysqltutorial.org/mysql-window-functions/mysql-nth_value-function/)       | Returns value of argument from Nth row of the window frame                                                                                                                                                                                    |
| [NTILE](https://www.mysqltutorial.org/mysql-window-functions/mysql-ntile-function/)               | Distributes the rows for each window partition into a specified number of ranked groups.                                                                                                                                                      |
| [PERCENT_RANK](https://www.mysqltutorial.org/mysql-window-functions/mysql-percent_rank-function/) | Calculates the percentile rank of a row in a partition or result set                                                                                                                                                                          |
| [RANK](https://www.mysqltutorial.org/mysql-window-functions/mysql-rank-function/)                 | Similar to the `DENSE_RANK()` function except that there are gaps in the sequence of ranked values when two or more rows have the same rank.                                                                                                  |
| [ROW_NUMBER](https://www.mysqltutorial.org/mysql-window-functions/mysql-row_number-function/)     | Assigns a sequential integer to every row within its partition                                                                                                                                                                                |

The most practical example of this is a running total:

In [73]:
%%sql

SELECT duration_seconds,
    SUM(duration_seconds) OVER (
        ORDER BY start_time
    ) AS running_total
FROM dc_bikeshare_q1_2012_preprocessed
LIMIT 10

Unnamed: 0,duration_seconds,running_total
0,1162.0,2307.0
1,1145.0,2307.0
2,1754.0,4061.0
3,1206.0,5267.0
4,1124.0,6391.0
5,1512.0,7903.0
6,1212.0,10300.0
7,1185.0,10300.0
8,41427.0,52929.0
9,1202.0,52929.0


You can see that the above query creates an aggregation (running_total) without using GROUP BY.

The first part of the above aggregation, SUM(duration_seconds), looks a lot like any other aggregation. Adding OVER designates it as a window function. You could read the above aggregation as "take the sum of duration_seconds over the entire result set, in order by start_time."

In [66]:
%%sql

SELECT start_terminal,
    duration_seconds,
    SUM(duration_seconds) OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS running_total
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,running_total
0,31000.0,1422.0,1422.0
1,31000.0,3340.0,4762.0
2,31000.0,2661.0,7423.0
3,31001.0,2876.0,2876.0
4,31001.0,2804.0,5680.0
5,31001.0,2686.0,8366.0
6,31001.0,3624.0,15588.0
7,31001.0,3598.0,15588.0
8,31001.0,1426.0,17014.0
9,31001.0,1103.0,18117.0


The above query groups and orders the query by start_terminal. Within each value of start_terminal, it is ordered by start_time, and the running total sums across the current row and all previous rows of duration_seconds. That's what happens when you group using PARTITION BY. In case you're still stumped by ORDER BY, it simply orders by the designated column(s) the same way the ORDER BY clause would, except that it treats every partition as separate. It also creates the running total—without ORDER BY, each value will simply be a sum of all the duration_seconds values in its respective start_terminal.

In [67]:
%%sql

SELECT start_terminal,
    duration_seconds,
    SUM(duration_seconds) OVER (PARTITION BY start_terminal) AS running_total,
    COUNT(duration_seconds) OVER (PARTITION BY start_terminal) AS running_count,
    AVG(duration_seconds) OVER (PARTITION BY start_terminal) AS running_avg
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,running_total,running_count,running_avg
0,31000.0,1422.0,7423.0,3,2474.333333
1,31000.0,3340.0,7423.0,3,2474.333333
2,31000.0,2661.0,7423.0,3,2474.333333
3,31001.0,2876.0,25193.0,12,2099.416667
4,31001.0,2804.0,25193.0,12,2099.416667
5,31001.0,2686.0,25193.0,12,2099.416667
6,31001.0,3624.0,25193.0,12,2099.416667
7,31001.0,3598.0,25193.0,12,2099.416667
8,31001.0,1426.0,25193.0,12,2099.416667
9,31001.0,1103.0,25193.0,12,2099.416667


Alternatively, the same functions with ORDER BY:

In [68]:
%%sql

SELECT start_terminal,
    duration_seconds,
    SUM(duration_seconds) OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS running_total,
    COUNT(duration_seconds) OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS running_count,
    AVG(duration_seconds) OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS running_avg
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,running_total,running_count,running_avg
0,31000.0,1422.0,1422.0,1,1422.0
1,31000.0,3340.0,4762.0,2,2381.0
2,31000.0,2661.0,7423.0,3,2474.333333
3,31001.0,2876.0,2876.0,1,2876.0
4,31001.0,2804.0,5680.0,2,2840.0
5,31001.0,2686.0,8366.0,3,2788.666667
6,31001.0,3624.0,15588.0,5,3117.6
7,31001.0,3598.0,15588.0,5,3117.6
8,31001.0,1426.0,17014.0,6,2835.666667
9,31001.0,1103.0,18117.0,7,2588.142857


In [75]:
%%sql

SELECT start_terminal,
    start_time,
    duration_seconds,
    ROW_NUMBER() OVER (
        ORDER BY start_time
    ) AS 'row_number'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,start_time,duration_seconds,row_number
0,31400.0,2012-01-01 00:10:00,1162.0,1
1,31400.0,2012-01-01 00:10:00,1145.0,2
2,31236.0,2012-01-01 00:18:00,1754.0,3
3,31214.0,2012-01-01 00:58:00,1206.0,4
4,31604.0,2012-01-01 01:04:00,1124.0,5
5,31300.0,2012-01-01 01:09:00,1512.0,6
6,31201.0,2012-01-01 01:14:00,1212.0,7
7,31201.0,2012-01-01 01:14:00,1185.0,8
8,31002.0,2012-01-01 01:16:00,41427.0,9
9,31002.0,2012-01-01 01:16:00,1202.0,10


In [77]:
%%sql

SELECT start_terminal,
    start_time,
    duration_seconds,
    ROW_NUMBER() OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS 'row_number'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,start_time,duration_seconds,row_number
0,31000.0,2012-01-03 12:32:00,1422.0,1
1,31000.0,2012-01-05 17:25:00,3340.0,2
2,31000.0,2012-01-06 17:29:00,2661.0,3
3,31001.0,2012-01-01 13:35:00,2876.0,1
4,31001.0,2012-01-01 13:36:00,2804.0,2
5,31001.0,2012-01-01 13:38:00,2686.0,3
6,31001.0,2012-01-01 15:09:00,3624.0,4
7,31001.0,2012-01-01 15:09:00,3598.0,5
8,31001.0,2012-01-01 16:49:00,1426.0,6
9,31001.0,2012-01-04 09:03:00,1103.0,7


In [74]:
%%sql

SELECT start_terminal,
    duration_seconds,
    RANK() OVER (
        PARTITION BY start_terminal
        ORDER BY start_time
    ) AS 'rank'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,rank
0,31000.0,1422.0,1
1,31000.0,3340.0,2
2,31000.0,2661.0,3
3,31001.0,2876.0,1
4,31001.0,2804.0,2
5,31001.0,2686.0,3
6,31001.0,3624.0,4
7,31001.0,3598.0,4
8,31001.0,1426.0,6
9,31001.0,1103.0,7


In [78]:
%%sql

SELECT start_terminal,
    duration_seconds,
    NTILE(4) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'quartile',
    NTILE(5) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'quintile',
    NTILE(100) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'percentile'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
ORDER BY start_terminal,
    duration_seconds
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,quartile,quintile,percentile
0,31000.0,1422.0,1,1,1
1,31000.0,2661.0,2,2,2
2,31000.0,3340.0,3,3,3
3,31001.0,1103.0,1,1,1
4,31001.0,1157.0,1,1,2
5,31001.0,1168.0,1,1,3
6,31001.0,1264.0,2,2,4
7,31001.0,1426.0,2,2,5
8,31001.0,1740.0,2,2,6
9,31001.0,1747.0,3,3,7


Looking at the results from the query above, you can see that the percentile column doesn't calculate exactly as you might expect. If you only had two records and you were measuring percentiles, you'd expect one record to define the 1st percentile, and the other record to define the 100th percentile. Using the NTILE function, what you'd actually see is one record in the 1st percentile, and one in the 2nd percentile. If you're working with very small windows, keep this in mind and consider using quartiles or similarly small bands.

In [79]:
%%sql

SELECT start_terminal,
    duration_seconds,
    LAG(duration_seconds, 1) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'lag',
    LEAD(duration_seconds, 1) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'lead'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
ORDER BY start_terminal,
    duration_seconds
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,lag,lead
0,31000.0,1422.0,,2661.0
1,31000.0,2661.0,1422.0,3340.0
2,31000.0,3340.0,2661.0,
3,31001.0,1103.0,,1157.0
4,31001.0,1157.0,1103.0,1168.0
5,31001.0,1168.0,1157.0,1264.0
6,31001.0,1264.0,1168.0,1426.0
7,31001.0,1426.0,1264.0,1740.0
8,31001.0,1740.0,1426.0,1747.0
9,31001.0,1747.0,1740.0,2686.0


In [80]:
%%sql

SELECT start_terminal,
    duration_seconds,
    duration_seconds - LAG(duration_seconds, 1) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'difference'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
ORDER BY start_terminal,
    duration_seconds
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,difference
0,31000.0,1422.0,
1,31000.0,2661.0,1239.0
2,31000.0,3340.0,679.0
3,31001.0,1103.0,
4,31001.0,1157.0,54.0
5,31001.0,1168.0,11.0
6,31001.0,1264.0,96.0
7,31001.0,1426.0,162.0
8,31001.0,1740.0,314.0
9,31001.0,1747.0,7.0


The first row of the difference column is null because there is no previous row from which to pull. Similarly, using LEAD will create nulls at the end of the dataset. If you'd like to make the results a bit cleaner, you can wrap it in an outer query to remove nulls:

In [81]:
%%sql

SELECT *
FROM (
        SELECT start_terminal,
            duration_seconds,
            duration_seconds - LAG(duration_seconds, 1) OVER (
                PARTITION BY start_terminal
                ORDER BY duration_seconds
            ) AS 'difference'
        FROM dc_bikeshare_q1_2012_preprocessed
        WHERE start_time < '2012-01-08'
        ORDER BY start_terminal,
            duration_seconds
    ) sub
WHERE sub.difference IS NOT NULL
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,difference
0,31000.0,2661.0,1239.0
1,31000.0,3340.0,679.0
2,31001.0,1157.0,54.0
3,31001.0,1168.0,11.0
4,31001.0,1264.0,96.0
5,31001.0,1426.0,162.0
6,31001.0,1740.0,314.0
7,31001.0,1747.0,7.0
8,31001.0,2686.0,939.0
9,31001.0,2804.0,118.0


In [82]:
%%sql

SELECT start_terminal,
    duration_seconds,
    NTILE(4) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'quartile',
    NTILE(5) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'quintile',
    NTILE(100) OVER (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    ) AS 'percentile'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08'
ORDER BY start_terminal,
    duration_seconds
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,quartile,quintile,percentile
0,31000.0,1422.0,1,1,1
1,31000.0,2661.0,2,2,2
2,31000.0,3340.0,3,3,3
3,31001.0,1103.0,1,1,1
4,31001.0,1157.0,1,1,2
5,31001.0,1168.0,1,1,3
6,31001.0,1264.0,2,2,4
7,31001.0,1426.0,2,2,5
8,31001.0,1740.0,2,2,6
9,31001.0,1747.0,3,3,7


This can be rewritten as:

In [83]:
%%sql

SELECT start_terminal,
    duration_seconds,
    NTILE(4) OVER ntile_window AS 'quartile',
    NTILE(5) OVER ntile_window AS 'quintile',
    NTILE(100) OVER ntile_window AS 'percentile'
FROM dc_bikeshare_q1_2012_preprocessed
WHERE start_time < '2012-01-08' WINDOW ntile_window AS (
        PARTITION BY start_terminal
        ORDER BY duration_seconds
    )
ORDER BY start_terminal,
    duration_seconds
LIMIT 10

Unnamed: 0,start_terminal,duration_seconds,quartile,quintile,percentile
0,31000.0,1422.0,1,1,1
1,31000.0,2661.0,2,2,2
2,31000.0,3340.0,3,3,3
3,31001.0,1103.0,1,1,1
4,31001.0,1157.0,1,1,2
5,31001.0,1168.0,1,1,3
6,31001.0,1264.0,2,2,4
7,31001.0,1426.0,2,2,5
8,31001.0,1740.0,2,2,6
9,31001.0,1747.0,3,3,7


## EXPLAIN

In [84]:
%sql select * from dc_bikeshare_q1_2012_preprocessed limit 10

Unnamed: 0,duration,duration_seconds,start_time,start_station,start_terminal,end_time,end_station,end_terminal,bike_number,rider_type,id
0,0h 19m 22sec.,1162.0,2012-01-01 00:10:00,Georgia & New Hampshire Ave NW,31400.0,2012-01-01 00:29:00,16th & Harvard St NW,31103.0,W00524,Casual,2
1,0h 19m 5sec.,1145.0,2012-01-01 00:10:00,Georgia & New Hampshire Ave NW,31400.0,2012-01-01 00:29:00,16th & Harvard St NW,31103.0,W00235,Registered,3
2,0h 29m 14sec.,1754.0,2012-01-01 00:18:00,37th & O St NW / Georgetown University,31236.0,2012-01-01 00:47:00,9th & Upshur St NW,31404.0,W00525,Registered,7
3,0h 20m 6sec.,1206.0,2012-01-01 00:58:00,17th & Corcoran St NW,31214.0,2012-01-01 01:18:00,4th & M St SW,31108.0,W00174,Registered,46
4,0h 18m 44sec.,1124.0,2012-01-01 01:04:00,4th St & Massachusetts Ave NW,31604.0,2012-01-01 01:23:00,15th St & Massachusetts Ave SE,31626.0,W01213,Registered,50
5,0h 25m 12sec.,1512.0,2012-01-01 01:09:00,Van Ness Metro / UDC,31300.0,2012-01-01 01:34:00,Thomas Circle,31241.0,W00186,Registered,55
6,0h 20m 12sec.,1212.0,2012-01-01 01:14:00,15th & P St NW,31201.0,2012-01-01 01:34:00,7th & Water St SW / SW Waterfront,31609.0,W00340,Casual,61
7,0h 19m 45sec.,1185.0,2012-01-01 01:14:00,15th & P St NW,31201.0,2012-01-01 01:34:00,7th & Water St SW / SW Waterfront,31609.0,W00929,Registered,63
8,11h 30m 27sec.,41427.0,2012-01-01 01:16:00,20th & Crystal Dr,31002.0,2012-01-01 12:46:00,20th & Crystal Dr,31002.0,W00551,Casual,67
9,0h 20m 2sec.,1202.0,2012-01-01 01:16:00,20th & Crystal Dr,31002.0,2012-01-01 01:36:00,20th & Crystal Dr,31002.0,W00333,Casual,69


In [None]:

%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%config SqlMagic.displaylimit=5
%reload_ext sql
%sql {conn_str}

In [98]:
%%sql

EXPLAIN
FORMAT=JSON
SELECT *
  FROM dc_bikeshare_q1_2012_preprocessed
 WHERE start_time >= '2012-03-01'
   AND start_time < '2012-04-01'
 LIMIT 10

EXPLAIN
"{  ""query_block"": {  ""select_id"": 1,  ""cost_info"": {  ""query_cost"": ""7590.05""  },  ""table"": {  ""table_name"": ""dc_bikeshare_q1_2012_preprocessed"",  ""access_type"": ""ALL"",  ""rows_examined_per_scan"": 73898,  ""rows_produced_per_join"": 8209,  ""filtered"": ""11.11"",  ""cost_info"": {  ""read_cost"": ""6769.13"",  ""eval_cost"": ""820.92"",  ""prefix_cost"": ""7590.05"",  ""data_read_per_join"": ""769K""  },  ""used_columns"": [  ""duration"",  ""duration_seconds"",  ""start_time"",  ""start_station"",  ""start_terminal"",  ""end_time"",  ""end_station"",  ""end_terminal"",  ""bike_number"",  ""rider_type"",  ""id""  ],  ""attached_condition"": ""((`sparsh`.`dc_bikeshare_q1_2012_preprocessed`.`start_time` >= TIMESTAMP'2012-03-01 00:00:00') and (`sparsh`.`dc_bikeshare_q1_2012_preprocessed`.`start_time` < TIMESTAMP'2012-04-01 00:00:00'))""  }  } }"
