In [19]:
import pandas as pd

In [20]:
pd.__version__

'2.1.4'

This notebook is not exactly as the video _1.2.2 - Ingesting NY Taxi Data to Postgres_, specially in the loading chunked csv section.

### Load data

In [21]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100) # for video lesson

In [22]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [23]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.1,1,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.2,1,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.7,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.6,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


### Create schema for SQL Table

### DataBase connection

In [3]:
from sqlalchemy import create_engine

In [4]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [5]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x1bef19d00b0>

In [6]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))

NameError: name 'df' is not defined

### Upload data

#### Test with 1 chunk with size 100000

In [10]:
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', low_memory=False, chunksize=100000) #add low_memory arg because a dtype error in col6.

In [16]:
df_iter # it's an iterable object

<pandas.io.parsers.readers.TextFileReader at 0x212853e0d40>

In [12]:
df.head(n=0)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge


In [13]:
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace') # Load the header and then append the data in loop for chunked df

0

#### Iterating over all data

In [14]:
from time import time

In [15]:
while True:
    t_start = time()

    df = next(df_iter)

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk..., took %.3f seconds' % (t_end - t_start))

inserted another chunk..., took 13.431 seconds
inserted another chunk..., took 13.096 seconds
inserted another chunk..., took 13.209 seconds
inserted another chunk..., took 12.843 seconds
inserted another chunk..., took 13.641 seconds
inserted another chunk..., took 12.756 seconds
inserted another chunk..., took 13.867 seconds
inserted another chunk..., took 13.442 seconds
inserted another chunk..., took 13.613 seconds
inserted another chunk..., took 13.113 seconds
inserted another chunk..., took 13.599 seconds
inserted another chunk..., took 14.491 seconds
inserted another chunk..., took 13.497 seconds
inserted another chunk..., took 8.683 seconds


StopIteration: 

In [3]:
# Upload aditional data (taxi zones)

!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2024-01-21 15:29:50--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 16.182.66.48, 52.217.203.16, 16.182.100.64, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|16.182.66.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: 'taxi+_zone_lookup.csv'

     0K .......... ..                                         100%  202M=0s

2024-01-21 15:29:51 (202 MB/s) - 'taxi+_zone_lookup.csv' saved [12322/12322]



In [7]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [8]:
df_zones.head(5)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [9]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265