In [1]:
# Import the libraries needed
import pandas as pd
import sqlalchemy as db
from time import time

In [2]:
# Load the first few rows from the csv file to facilitate table creation
df = pd.read_csv('green_tripdata_2019-10.csv', nrows=3)

In [3]:
# Transform the date columns from text to timestamp
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [4]:
# Create a database engine using PostgreSQL and specify the name of the database
engine = db.create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [5]:
# Establish a connectin to the database
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fd0d315fe50>

In [6]:
# Create the table schema from the headers in csv file
pd.io.sql.get_schema(df, con=engine, name='green_taxi_data')

'\nCREATE TABLE green_taxi_data (\n\t"VendorID" BIGINT, \n\tlpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, \n\tlpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, \n\tstore_and_fwd_flag TEXT, \n\t"RatecodeID" BIGINT, \n\t"PULocationID" BIGINT, \n\t"DOLocationID" BIGINT, \n\tpassenger_count BIGINT, \n\ttrip_distance FLOAT(53), \n\tfare_amount FLOAT(53), \n\textra FLOAT(53), \n\tmta_tax FLOAT(53), \n\ttip_amount BIGINT, \n\ttolls_amount BIGINT, \n\tehail_fee FLOAT(53), \n\timprovement_surcharge FLOAT(53), \n\ttotal_amount FLOAT(53), \n\tpayment_type BIGINT, \n\ttrip_type BIGINT, \n\tcongestion_surcharge BIGINT\n)\n\n'

In [7]:
# Load the data from the csv file in chunks
df_iter = pd.read_csv('green_tripdata_2019-10.csv', iterator=True, chunksize=100000, low_memory=False)

In [8]:
index_offset = 1
for df in df_iter:
    t_start = time()

    df.index += index_offset

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    
    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    index_offset += 1

    t_end = time()

    print(f'inserted another chunk ... took {(t_end - t_start):0,.2f} seconds')

inserted another chunk ... took 8.55 seconds
inserted another chunk ... took 8.83 seconds
inserted another chunk ... took 9.63 seconds
inserted another chunk ... took 9.49 seconds
inserted another chunk ... took 6.09 seconds
