**Python analytics of NY taxi data**

In [3]:
# Import packages
import pandas as pd
import os

In [7]:
# Load a 100-row sample of the trip data and parse its two timestamp columns.
df = pd.read_csv(
    "week_1/1_Docker/yellow_tripdata_2021-01.csv",
    nrows=100,
)
df[["tpep_dropoff_datetime", "tpep_pickup_datetime"]] = df[
    ["tpep_dropoff_datetime", "tpep_pickup_datetime"]
].apply(pd.to_datetime)

In [12]:
# pandas uses SQLAlchemy for its SQL support, so it has to be installed first.
# psycopg2 is listed as well -- presumably as the driver behind the
# postgresql:// URL passed to create_engine below.
# Install them with pip if they are missing:
#pip install sqlalchemy
#pip install psycopg2
from sqlalchemy import create_engine

In [14]:
# Build the connection URL from environment variables so credentials do not
# have to be written into the notebook. Every setting falls back to the local
# development value (root/root on localhost:5432, database ny_taxi).
# The values are inserted verbatim, so special characters in the password must
# already be URL-encoded.
pg_user = os.environ.get('PG_USER', 'root')
pg_password = os.environ.get('PG_PASSWORD', 'root')
pg_host = os.environ.get('PG_HOST', 'localhost')
pg_port = os.environ.get('PG_PORT', '5432')
pg_db = os.environ.get('PG_DB', 'ny_taxi')
engine = create_engine(f'postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}')

In [15]:
# Ask pandas for the CREATE TABLE statement (DDL) it would generate for this
# frame on the connected database, then print it.
ddl = pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine)
print(ddl)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [28]:
# The NY taxi file has over 1M rows, so read it in 100000-row chunks that can
# be uploaded to the database one batch at a time.
df_iter = pd.read_csv(
    'week_1/1_Docker/yellow_tripdata_2021-01.csv',
    chunksize=100000,
    iterator=True,
)

In [19]:
# read_csv did not return a DataFrame here: df_iter is a reader object that is
# consumed like a Python iterator (with next() and a for loop further down).
type(df_iter)

pandas.io.parsers.readers.TextFileReader

In [20]:
# Use next() to pull the first chunk off the iterator.
df_0 = next(df_iter)
# Show only the row count instead of rendering the whole chunk.
len(df_0)

100000

In [22]:
# Parse both timestamp columns of the first chunk into datetimes.
df_0[["tpep_dropoff_datetime", "tpep_pickup_datetime"]] = df_0[
    ["tpep_dropoff_datetime", "tpep_pickup_datetime"]
].apply(pd.to_datetime)

In [24]:
# Create (or replace) the table from a zero-row slice of the first chunk, so
# only the column definitions are written and no data is inserted yet.
df_head = df_0.head(0)
df_head.to_sql(con=engine, name='yellow_taxi_data', if_exists='replace')

0

**Check in a terminal that the table was created in Postgres**

pgcli -h localhost -p 5432 -u root -d ny_taxi

Then list the tables at the pgcli prompt:
\dt

In [25]:
# Append the first chunk (100000 rows) to the table created above.
# The %time line magic reports the CPU and wall time of this single statement.
%time df_0.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

CPU times: user 6.03 s, sys: 107 ms, total: 6.13 s
Wall time: 8.93 s


1000

In [29]:
# Upload every chunk still left in df_iter (the first one was already taken
# with next() above) and report how long each batch takes.
from time import time

for df in df_iter:
    start_ts = time()

    # Parse both timestamp columns before writing the chunk.
    datetime_cols = ["tpep_dropoff_datetime", "tpep_pickup_datetime"]
    df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime)

    df.to_sql(con=engine, name='yellow_taxi_data', if_exists='append')

    elapsed = time() - start_ts
    print('inserted another chunk in %.3f seconds' % elapsed)


inserted another chunk in 9.851 seconds
inserted another chunk in 9.456 seconds
inserted another chunk in 8.494 seconds
inserted another chunk in 8.770 seconds
inserted another chunk in 8.892 seconds
inserted another chunk in 8.466 seconds
inserted another chunk in 8.528 seconds
inserted another chunk in 8.861 seconds
inserted another chunk in 8.463 seconds
inserted another chunk in 8.316 seconds
inserted another chunk in 8.438 seconds
inserted another chunk in 8.560 seconds


  for df in df_iter:


inserted another chunk in 8.324 seconds
inserted another chunk in 5.291 seconds
