# Notebook for Loading Data

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from time import time
import sys

# Taxi zone lookup data

In [None]:
# Download taxi zone lookup data
#!wget -P data https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/main/week_4_analytics_engineering/taxi_rides_ny/data/taxi_zone_lookup.csv

In [None]:
# Read the taxi zone lookup CSV into a DataFrame and preview its first two rows
df_zones = pd.read_csv(filepath_or_buffer="data/taxi_zone_lookup.csv")
df_zones.head(n=2)

In [None]:
# Create engine
# Connects as root/root to the ny_taxi database on host "pgdatabase", port 5432.
# NOTE(review): a later cell re-creates `engine` against localhost:5431 -- confirm
# which host/port is actually reachable from where this notebook runs.
engine = create_engine("postgresql://root:root@pgdatabase:5432/ny_taxi")

In [None]:
# Write the zone lookup rows to the "zones" table through `engine`;
# if_exists="replace" overwrites a table left over from an earlier run
df_zones.to_sql(
    name="zones",
    con=engine,
    if_exists="replace",
)

## Check a sample of data

In [None]:
# Read only the first 100 rows of the January 2019 green-taxi file as a sample
df = pd.read_csv(
    filepath_or_buffer="data/green_tripdata_2019-01.csv",
    nrows=100,
)

In [None]:
# Preview the first two rows of the sample
df.head(n=2)

In [None]:
# Parse the pickup and dropoff timestamp columns into pandas datetimes
df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])
df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])

In [None]:
# Create a python engine for SQL
# arguments: <type_of_db>://<user>:<password_from_docker>@<hostname>:<port>/<db_name>
# This rebinds `engine`, replacing the one created earlier for pgdatabase:5432.
# NOTE(review): host/port differ from that earlier cell (localhost:5431 here) --
# confirm which one matches the environment the notebook runs in.
engine = create_engine("postgresql://root:root@localhost:5431/ny_taxi")

In [None]:
# Print the CREATE TABLE statement pandas derives from the sample frame for
# the database behind `engine`.
# This will only work if the Docker container is running
print(
    pd.io.sql.get_schema(
        frame=df,
        name="green_taxi_data",
        con=engine,
    )
)

In [None]:
# Create (or replace) the green_taxi_data table from a zero-row slice of the
# sample, so only the column definitions are written and no data rows
df.head(0).to_sql(
    name="green_taxi_data",
    con=engine,
    if_exists="replace",
)

## Create iterator to go through the data

In [None]:
# Open the full CSV as a chunked reader rather than one DataFrame; each
# request to the reader hands back up to 100000 rows
df_iter = pd.read_csv(
    "data/green_tripdata_2019-01.csv",
    chunksize=100000,
    iterator=True,
)

In [None]:
# Display the reader object itself; evaluating the name does not consume a chunk
df_iter

## Add all of the data to the db

In [None]:
# Stream every remaining chunk from `df_iter` into the green_taxi_data table.
# Each pass reads one chunk, parses the two lpep_* timestamp columns, appends
# the rows through `engine`, and prints how long the pass took.
while True:
    t_start = time()

    # Get next chunk of data.
    # Only exhaustion of the reader means "end of file". Any other failure
    # (malformed data, lost database connection, Ctrl-C, ...) must propagate
    # instead of being reported as a completed load.
    try:
        df = next(df_iter)
    except StopIteration:
        print("end of file has been reached, all data is loaded...")
        break

    # Make datatype corrections
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

    # Append chunk to db
    df.to_sql(name="green_taxi_data", con=engine, if_exists="append")

    t_end = time()
    print("inserted another chunk...took %.3f seconds" % (t_end - t_start))