# Data exploration
* Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Have a look at the data

In [None]:
import pandas as pd

In [None]:
pd.__version__

In [None]:
df = pd.read_csv("../../../data/yellow_tripdata_2021-01.csv", nrows=100)
df.head()

* Put this data to our postgres
* For that we need to create a schema (what type of columns do we have?)

In [4]:
# convert dtaframe to ddl (data definition language)
print(pd.io.sql.get_schema(df, name="yellow_taxi_data"))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TEXT,
  "tpep_dropoff_datetime" TEXT,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


* pickup time and dropoff time is "TEXT", this needs to be changed to datetime

In [5]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [6]:
print(pd.io.sql.get_schema(df, name="yellow_taxi_data"))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


* create a connection to postgres
* pandas uses sqlalchemy

In [7]:
from sqlalchemy import create_engine

In [8]:
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

In [9]:
#engine.connect()

In [10]:
print(pd.io.sql.get_schema(df, name="yellow_taxi_data", con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




* Read data in batches, all at the same time would be too much

In [11]:
df_iter = pd.read_csv("/home/frauke/data-engineering-zoomcamp/data/yellow_tripdata_2021-01.csv", iterator=True, chunksize=100000)
df_iter

<pandas.io.parsers.readers.TextFileReader at 0x7f2fe4cdd7c0>

In [12]:
df = next(df_iter)
len(df)

100000

In [13]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

* First create a table (using ```df.head(n=0)```)
* The fill it with chunks of the data
* With ```to_sql``` method the rows are inserted to the database
* If a table with this name alreay exists, a new one will replace the old one (```if_exists="replace"```)

In [14]:
# create table
df.head(n=0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")

In [16]:
# add data
%time df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")

CPU times: user 4.47 s, sys: 121 ms, total: 4.59 s
Wall time: 9.9 s


In [19]:
from time import time

In [20]:
# load all data until ther e is no chunk left
while True:
    t_start = time()
    
    df = next(df_iter)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")
    
    t_end = time()
    
    print(f"inserted another chunk..., took {t_end - t_start:.3f} seconds")

inserted another chunk..., took 9.757 seconds
inserted another chunk..., took 10.175 seconds
inserted another chunk..., took 9.558 seconds
inserted another chunk..., took 10.537 seconds
inserted another chunk..., took 15.191 seconds
inserted another chunk..., took 10.756 seconds
inserted another chunk..., took 9.533 seconds
inserted another chunk..., took 10.176 seconds
inserted another chunk..., took 8.859 seconds
inserted another chunk..., took 9.184 seconds
inserted another chunk..., took 9.574 seconds


  if (await self.run_code(code, result,  async_=asy)):


inserted another chunk..., took 8.739 seconds
inserted another chunk..., took 7.502 seconds


StopIteration: 