# Data exploration
* Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Have a look at the data

In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.5.1'

In [3]:
df = pd.read_csv("../../data/green_tripdata_2019-01.csv", nrows=100)
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.0,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1,
1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1,
2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.5,0.5,0.5,0.0,0.0,,0.3,5.8,1,1,
3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.5,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2,1,


In [113]:
df_zones = pd.read_csv("../../data/taxi+_zone_lookup.csv")
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


* Put this data to our postgres
* For that we need to create a schema (what type of columns do we have?)

In [4]:
# convert dtaframe to ddl (data definition language)
print(pd.io.sql.get_schema(df, name="green_taxi_data_2019"))

CREATE TABLE "green_taxi_data_2019" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TEXT,
  "lpep_dropoff_datetime" TEXT,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" INTEGER,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" INTEGER,
  "trip_type" INTEGER,
  "congestion_surcharge" REAL
)


* pickup time and dropoff time is "TEXT", this needs to be changed to datetime

In [5]:
df.tpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

  df.tpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
  df.tpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)


In [6]:
print(pd.io.sql.get_schema(df, name="green_taxi_data_2019"))

CREATE TABLE "green_taxi_data_2019" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TEXT,
  "lpep_dropoff_datetime" TEXT,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" INTEGER,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" INTEGER,
  "trip_type" INTEGER,
  "congestion_surcharge" REAL
)


In [114]:
print(pd.io.sql.get_schema(df, name="zones"))

CREATE TABLE "zones" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TIMESTAMP,
  "lpep_dropoff_datetime" TIMESTAMP,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" INTEGER,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" INTEGER,
  "trip_type" INTEGER,
  "congestion_surcharge" REAL
)


* create a connection to postgres
* pandas uses sqlalchemy

In [7]:
from sqlalchemy import create_engine

In [8]:
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

In [9]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f98d7313dc0>

In [10]:
print(pd.io.sql.get_schema(df, name="green_taxi_data_2019", con=engine))


CREATE TABLE green_taxi_data_2019 (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TEXT, 
	lpep_dropoff_datetime TEXT, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [115]:
print(pd.io.sql.get_schema(df, name="zones", con=engine))


CREATE TABLE zones (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




* Read data in batches, all at the same time would be too much

In [12]:
df_iter = pd.read_csv("../../data/green_tripdata_2019-01.csv", iterator=True, chunksize=100000)
df_iter

<pandas.io.parsers.readers.TextFileReader at 0x7f98d43c83d0>

In [13]:
df = next(df_iter)
len(df)

100000

In [14]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

* First create a table (using ```df.head(n=0)```)
* The fill it with chunks of the data
* With ```to_sql``` method the rows are inserted to the database
* If a table with this name alreay exists, a new one will replace the old one (```if_exists="replace"```)

In [15]:
# create table
df.head(n=0).to_sql(name="green_taxi_data_2019", con=engine, if_exists="replace")

0

In [16]:
# add data
%time df.to_sql(name="green_taxi_data_2019", con=engine, if_exists="append")

CPU times: user 4.58 s, sys: 59.3 ms, total: 4.64 s
Wall time: 8.42 s


1000

In [17]:
from time import time

In [18]:
# load all data until ther e is no chunk left
while True:
    t_start = time()
    
    df = next(df_iter)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.to_sql(name="green_taxi_data_2019", con=engine, if_exists="append")
    
    t_end = time()
    
    print(f"inserted another chunk..., took {t_end - t_start:.3f} seconds")

inserted another chunk..., took 9.512 seconds
inserted another chunk..., took 9.802 seconds
inserted another chunk..., took 8.272 seconds
inserted another chunk..., took 7.876 seconds
inserted another chunk..., took 8.929 seconds
inserted another chunk..., took 2.717 seconds


StopIteration: 

In [120]:
# create table
df.head(n=0).to_sql(name="zones", con=engine, if_exists="replace")
# add data
df.to_sql(name="zones", con=engine, if_exists="append")

265

## Test SQL

In [20]:
query = """
SELECT 1;
"""
pd.read_sql(query, con=engine)

Unnamed: 0,?column?
0,1


In [30]:
query = """
SELECT *
FROM green_taxi_data_2019
LIMIT 5;
"""
pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.0,...,0.5,0.5,0.0,0.0,,0.3,4.3,2,1,
1,1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,...,0.5,0.5,0.0,0.0,,0.3,7.3,2,1,
2,2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,...,0.5,0.5,0.0,0.0,,0.3,5.8,1,1,
3,3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,...,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,...,0.5,0.5,0.0,0.0,,0.3,19.3,2,1,


## Question 3. Count records
How many taxi trips were totally made on January 15?

In [82]:
query = """
SELECT *, TO_CHAR(lpep_pickup_datetime,'YYYY-MM-DD') AS lpep_pickup_day, 
TO_CHAR(lpep_dropoff_datetime,'YYYY-MM-DD') AS lpep_dropoff_day
FROM green_taxi_data_2019
WHERE TO_CHAR(lpep_pickup_datetime,'YYYY-MM-DD')='2019-01-15' AND TO_CHAR(lpep_dropoff_datetime,'YYYY-MM-DD')='2019-01-15';
"""
pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,lpep_pickup_day,lpep_dropoff_day
0,278806,2,2019-01-15 20:02:24,2019-01-15 20:17:52,N,1,195,228,1,3.23,...,0.00,0.0,,0.3,14.30,2,1,,2019-01-15,2019-01-15
1,275421,2,2019-01-15 16:50:12,2019-01-15 17:23:03,N,1,195,17,1,6.37,...,5.46,0.0,,0.3,32.76,1,1,,2019-01-15,2019-01-15
2,276753,2,2019-01-15 17:44:00,2019-01-15 18:03:10,N,1,34,52,1,2.08,...,3.70,0.0,,0.3,18.50,1,1,,2019-01-15,2019-01-15
3,272684,2,2019-01-15 15:33:43,2019-01-15 15:33:50,N,5,195,195,1,0.00,...,0.00,0.0,,0.0,35.00,1,2,,2019-01-15,2019-01-15
4,280062,2,2019-01-15 00:01:38,2019-01-15 00:11:09,N,1,41,161,1,3.77,...,4.00,0.0,,0.3,17.30,1,1,,2019-01-15,2019-01-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20525,280398,2,2019-01-15 00:41:40,2019-01-15 00:51:26,N,1,129,129,1,1.15,...,0.00,0.0,,0.3,8.80,2,1,,2019-01-15,2019-01-15
20526,280399,2,2019-01-15 00:19:12,2019-01-15 00:23:42,N,1,106,54,4,1.29,...,4.00,0.0,,0.3,11.30,1,1,,2019-01-15,2019-01-15
20527,280400,1,2019-01-15 00:10:02,2019-01-15 00:14:32,N,1,41,42,1,1.20,...,0.00,0.0,,0.3,7.30,2,1,,2019-01-15,2019-01-15
20528,280401,2,2019-01-15 00:04:20,2019-01-15 00:45:49,N,1,97,229,1,9.73,...,0.00,0.0,,0.3,37.30,2,1,,2019-01-15,2019-01-15


## Question 4. Largest trip for each day
Which was the day with the largest trip distance Use the pick up time for your calculations.


In [110]:
query = """
SELECT lpep_pickup_datetime, trip_distance
FROM green_taxi_data_2019 t1
ORDER BY trip_distance DESC
"""
pd.read_sql(query, con=engine)

Unnamed: 0,lpep_pickup_datetime,trip_distance
0,2019-01-15 19:27:58,117.99
1,2019-01-18 07:06:27,80.96
2,2019-01-28 21:01:59,64.27
3,2019-01-10 18:58:25,64.20
4,2019-01-06 17:31:27,60.91
...,...,...
630913,2019-01-01 05:35:30,0.00
630914,2019-01-01 05:37:09,0.00
630915,2019-01-01 05:41:18,0.00
630916,2019-01-01 05:44:38,0.00


## Question 5. The number of passengers
In 2019-01-01 how many trips had 2 and 3 passengers?

In [111]:
query = """
SELECT count(lpep_pickup_datetime)
FROM green_taxi_data_2019 
WHERE TO_CHAR(lpep_pickup_datetime,'YYYY-MM-DD')='2019-01-01'
GROUP BY passenger_count
"""
pd.read_sql(query, con=engine)

Unnamed: 0,count
0,21
1,12415
2,1282
3,254
4,129
5,616
6,273


## Question 6. Largest tip
For the passengers picked up in the Astoria Zone which was the drop off zone that had the largest tip? We want the name of the zone, not the id.

In [121]:
query = """
SELECT *
FROM zones 
"""
pd.read_sql(query, con=engine)

Unnamed: 0,index,LocationID,Borough,Zone,service_zone
0,0,1,EWR,Newark Airport,EWR
1,1,2,Queens,Jamaica Bay,Boro Zone
2,2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,3,4,Manhattan,Alphabet City,Yellow Zone
4,4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...,...
260,260,261,Manhattan,World Trade Center,Yellow Zone
261,261,262,Manhattan,Yorkville East,Yellow Zone
262,262,263,Manhattan,Yorkville West,Yellow Zone
263,263,264,Unknown,NV,


In [135]:
query = """
SELECT *
FROM green_taxi_data_2019 t 
JOIN zones z ON t."PULocationID" = z."LocationID"
WHERE z."Zone"='Astoria' 
ORDER BY total_amount DESC;
"""
pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,index.1,LocationID,Borough,Zone,service_zone
0,243896,2,2019-01-13 01:23:25,2019-01-13 02:04:45,N,4,7,265,6,34.89,...,0.3,144.80,2,1,,6,7,Queens,Astoria,Boro Zone
1,119430,2,2019-01-07 10:52:38,2019-01-07 11:36:29,N,4,7,265,1,26.05,...,0.3,131.56,1,1,,6,7,Queens,Astoria,Boro Zone
2,15617,2,2019-01-02 04:17:39,2019-01-02 05:09:07,N,4,7,265,5,17.66,...,0.3,114.96,1,1,,6,7,Queens,Astoria,Boro Zone
3,4014,2,2019-01-01 03:21:12,2019-01-01 04:23:07,N,1,7,7,2,32.72,...,0.3,108.96,1,1,,6,7,Queens,Astoria,Boro Zone
4,44495,2,2019-01-03 15:47:05,2019-01-03 17:50:45,N,1,7,237,1,31.14,...,0.3,105.80,2,1,,6,7,Queens,Astoria,Boro Zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26041,577697,2,2019-01-29 14:11:06,2019-01-29 14:27:38,N,1,7,7,5,0.81,...,-0.3,-11.80,4,1,0.0,6,7,Queens,Astoria,Boro Zone
26042,567550,2,2019-01-29 05:34:06,2019-01-29 05:51:36,N,1,7,7,5,0.36,...,-0.3,-12.30,3,1,0.0,6,7,Queens,Astoria,Boro Zone
26043,399701,2,2019-01-20 18:43:05,2019-01-20 18:47:57,N,5,7,7,1,0.28,...,0.0,-30.00,3,2,,6,7,Queens,Astoria,Boro Zone
26044,243769,2,2019-01-13 01:19:39,2019-01-13 01:19:43,N,5,7,7,1,0.00,...,0.0,-33.00,3,2,,6,7,Queens,Astoria,Boro Zone
