# Script to Populate Postgres from Local file

In [1]:
import pandas as pd

pd.__version__

'2.2.2'

In [5]:
df = pd.read_csv("taxi_data/yellow_tripdata_2019-01.csv", nrows=100)

print(df)


    VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0          1  2019-01-01 00:46:40   2019-01-01 00:53:20                1   
1          1  2019-01-01 00:59:47   2019-01-01 01:18:59                1   
2          2  2018-12-21 13:48:30   2018-12-21 13:52:40                3   
3          2  2018-11-28 15:52:25   2018-11-28 15:55:45                5   
4          2  2018-11-28 15:56:57   2018-11-28 15:58:33                5   
..       ...                  ...                   ...              ...   
95         2  2019-01-01 00:02:06   2019-01-01 00:10:19                2   
96         2  2019-01-01 00:24:57   2019-01-01 00:35:42                2   
97         2  2019-01-01 00:46:09   2019-01-01 01:06:46                1   
98         1  2019-01-01 00:16:07   2019-01-01 00:19:41                1   
99         1  2019-01-01 00:23:47   2019-01-01 00:39:41                1   

    trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0      

In [5]:
# gets the csv file in smaller chunks so it doesn't break the machine!!

df = pd.read_csv("taxi_data/yellow_tripdata_2019-01.csv", iterator=True, chunksize=100000) 

print(df)


    VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0          1  2019-01-01 00:46:40   2019-01-01 00:53:20                1   
1          1  2019-01-01 00:59:47   2019-01-01 01:18:59                1   
2          2  2018-12-21 13:48:30   2018-12-21 13:52:40                3   
3          2  2018-11-28 15:52:25   2018-11-28 15:55:45                5   
4          2  2018-11-28 15:56:57   2018-11-28 15:58:33                5   
..       ...                  ...                   ...              ...   
95         2  2019-01-01 00:02:06   2019-01-01 00:10:19                2   
96         2  2019-01-01 00:24:57   2019-01-01 00:35:42                2   
97         2  2019-01-01 00:46:09   2019-01-01 01:06:46                1   
98         1  2019-01-01 00:16:07   2019-01-01 00:19:41                1   
99         1  2019-01-01 00:23:47   2019-01-01 00:39:41                1   

    trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0      

In [6]:
pd.io.sql.get_schema(df, name="yellow_taxi_data") #describes how the table will work in SQL

'CREATE TABLE "yellow_taxi_data" (\n"VendorID" INTEGER,\n  "tpep_pickup_datetime" TEXT,\n  "tpep_dropoff_datetime" TEXT,\n  "passenger_count" INTEGER,\n  "trip_distance" REAL,\n  "RatecodeID" INTEGER,\n  "store_and_fwd_flag" TEXT,\n  "PULocationID" INTEGER,\n  "DOLocationID" INTEGER,\n  "payment_type" INTEGER,\n  "fare_amount" REAL,\n  "extra" REAL,\n  "mta_tax" REAL,\n  "tip_amount" REAL,\n  "tolls_amount" REAL,\n  "improvement_surcharge" REAL,\n  "total_amount" REAL,\n  "congestion_surcharge" REAL\n)'

In [9]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) # fix issue where TEXT should be something else, such as DATETIME
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 

In [11]:
# generate a connection to POSTGRES
from sqlalchemy import create_engine

engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi") #values from the docker file (for postgres)

engine.connect()

<sqlalchemy.engine.base.Connection at 0x180ca7a40>

In [13]:
print(pd.io.sql.get_schema(df, name="yellow_taxi_data", con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [14]:
df_iter = pd.read_csv("taxi_data/yellow_tripdata_2019-01.csv", iterator=True, chunksize=100000) 

print(df_iter) # this will be a an iterator object

<pandas.io.parsers.readers.TextFileReader object at 0x180a238c0>


In [15]:
# we want the result of that iterator, so we use next
df = next(df_iter)

len(df) # checks how many rows df has 

100000

In [16]:
# need to repeat this step
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) # fix issue where TEXT should be something else, such as DATETIME
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 

In [17]:
# check if it worked
df.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.0,1,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.0,2,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,
5,2,2018-11-28 16:25:49,2018-11-28 16:28:26,5,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,5.76,0.3,13.31,
6,2,2018-11-28 16:29:37,2018-11-28 16:33:43,5,0.0,2,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,
7,1,2019-01-01 00:21:28,2019-01-01 00:28:37,1,1.3,1,N,163,229,1,6.5,0.5,0.5,1.25,0.0,0.3,9.05,
8,1,2019-01-01 00:32:01,2019-01-01 00:45:39,1,3.7,1,N,229,7,1,13.5,0.5,0.5,3.7,0.0,0.3,18.5,
9,1,2019-01-01 00:57:32,2019-01-01 01:09:32,2,2.1,1,N,141,234,1,10.0,0.5,0.5,1.7,0.0,0.3,13.0,


In [18]:
# here first we need to create the header of the table in postgres)
# so we use n=0
df.head(n=0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")
# now go back to shell and check in your docker how many tables you have
# run in the shell: /dt

0

In [19]:
# now we will append to the table in sql
%time df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append") # the time will let us know how long it took


CPU times: user 5.54 s, sys: 388 ms, total: 5.92 s
Wall time: 10.3 s


1000

In [21]:
# now we wil create a loop to keep populating the table while True:
from time import time # we will measure how long it takes

while True:
    t_start = time()
    df = next(df_iter)

    # these steps need to be done everytime: convert datetime and append the new row
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")
    t_end = time()
    print("another row inserted, took %.3f seconds" % (t_end - t_start))

    # "I am not proud of this code, but it works".
    # when this throw an exception it stops

# so you go back to shell in the postgres container and run a few stats to check
# SELECT max(tpep_pickup_datetime), min(tpep_pickup_datetime), max(total_amount) FROM yellow_taxi_data

    

another row inserted, took 9.629 seconds
another row inserted, took 9.406 seconds
another row inserted, took 9.954 seconds
another row inserted, took 10.055 seconds
another row inserted, took 11.452 seconds
another row inserted, took 9.915 seconds
another row inserted, took 10.159 seconds
another row inserted, took 9.894 seconds
another row inserted, took 10.186 seconds
another row inserted, took 10.344 seconds
another row inserted, took 10.517 seconds
another row inserted, took 9.905 seconds
another row inserted, took 10.256 seconds
another row inserted, took 9.822 seconds
another row inserted, took 9.633 seconds
another row inserted, took 9.867 seconds
another row inserted, took 9.828 seconds
another row inserted, took 9.693 seconds
another row inserted, took 9.864 seconds
another row inserted, took 9.796 seconds
another row inserted, took 9.861 seconds
another row inserted, took 10.056 seconds
another row inserted, took 9.996 seconds
another row inserted, took 9.708 seconds
another 

StopIteration: 

In [None]:
!wget  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

: 

In [2]:
import pandas as pd

df_zones = pd.read_csv("taxi_data/taxi_zone_lookup.csv")

In [3]:
df_zones.head(10)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
6,7,Queens,Astoria,Boro Zone
7,8,Queens,Astoria Park,Boro Zone
8,9,Queens,Auburndale,Boro Zone
9,10,Queens,Baisley Park,Boro Zone


In [4]:
from sqlalchemy import create_engine

engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi") #values from the docker file (for postgres)

engine.connect()

<sqlalchemy.engine.base.Connection at 0x117486d20>

In [5]:
df_zones.to_sql(name="taxi_zone_lookup", con=engine, if_exists="replace")

265

## Homework Week 1
Ingest green data

In [4]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz

--2025-01-23 15:11:36--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/ea580e9e-555c-4bd0-ae73-43051d8e7c0b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250123%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250123T141136Z&X-Amz-Expires=300&X-Amz-Signature=4d2c8b4927ef4bfe05d0d81f0c93c81c3c86ef4bb6761c553ce31291645688e3&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dgreen_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2025-01-23 15:11:36--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/ea580e9e-555c-4bd0-ae73-43051d8e7c0b?X-Amz-A

In [8]:
from sqlalchemy import create_engine

engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi") #values from the docker file (for postgres)

engine.connect()

<sqlalchemy.engine.base.Connection at 0x16d40a960>

In [9]:
from time import time
import pandas as pd

table_name="green_taxi_trips"

df_iter = pd.read_csv("../taxi_data/green_tripdata_2019-10.csv.gz", iterator=True, chunksize=100000, compression='gzip') 

df = next(df_iter)

df.head(10)



Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2,1.0,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.8,5.0,3.25,0.5,0.0,0.0,,0.3,9.05,2,1.0,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.5,21.5,0.5,0.5,0.0,0.0,,0.3,22.8,2,1.0,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.9,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2,1.0,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.5,0.5,2.26,0.0,,0.3,13.56,1,1.0,0.0
5,2,2019-10-01 00:35:01,2019-10-01 00:43:40,N,1,65,49,1,1.47,8.0,0.5,0.5,1.86,0.0,,0.3,11.16,1,1.0,0.0
6,1,2019-10-01 00:28:09,2019-10-01 00:30:49,N,1,7,179,1,0.6,4.0,0.5,0.5,1.0,0.0,,0.3,6.3,1,1.0,0.0
7,2,2019-10-01 00:28:26,2019-10-01 00:32:01,N,1,41,74,1,0.56,4.5,0.5,0.5,0.0,0.0,,0.3,5.8,2,1.0,0.0
8,2,2019-10-01 00:14:01,2019-10-01 00:26:16,N,1,255,49,1,2.42,10.5,0.5,0.5,0.0,0.0,,0.3,11.8,2,1.0,0.0
9,1,2019-10-01 00:03:03,2019-10-01 00:17:13,Y,1,130,131,1,3.4,13.0,0.5,0.5,2.85,0.0,,0.3,17.15,1,1.0,0.0


In [10]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) # fix issue where TEXT should be something else, such as DATETIME
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 

df.head(n=0).to_sql(name=table_name, con=engine, if_exists="replace") # this will create the table

df.to_sql(name=table_name, con=engine, if_exists="append")

while True:
    try:
        t_start = time()
        df = next(df_iter)

        # these steps need to be done everytime: convert datetime and append the new row
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) # fix issue where TEXT should be something else, such as DATETIME
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 
        df.to_sql(name=table_name, con=engine, if_exists="append")

        t_end = time()

        print("another row inserted, took %.3f seconds" % (t_end - t_start))

    except StopIteration:
        print("No more data to process. Exiting loop.")
        break

another row inserted, took 10.478 seconds
another row inserted, took 10.606 seconds


  df = next(df_iter)


another row inserted, took 10.746 seconds
another row inserted, took 7.012 seconds
No more data to process. Exiting loop.


In [7]:
import pandas as pd

# Create an iterator for the chunks
df_iter = pd.read_csv("../taxi_data/green_tripdata_2019-10.csv.gz", iterator=True, chunksize=100000, compression='gzip')

# Initialize a counter for the total number of rows
total_length = 0

# Iterate through all chunks
for chunk in df_iter:
    total_length += len(chunk)  # Add the number of rows in the current chunk

print(f"Total number of rows: {total_length}")

Total number of rows: 476386


  for chunk in df_iter:
