In [100]:
import pandas as pd
from sqlalchemy import create_engine
from time import time
df = pd.read_csv("yellow_tripdata_2019-01.csv", \
                 parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],\
                 nrows=10)

In [101]:
print(pd.io.sql.get_schema(df,"yellow_taxi_data"))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


In [105]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')
# create_engine(postgresql://user:passwpord@localhost:port/database_name)
# pip install psycopg2

In [106]:
# test the connection
engine.connect()

<sqlalchemy.engine.base.Connection at 0x138eeccd0>

In [107]:
table_name = 'yellow_taxi_data'
# get the postgresql shcema
print(pd.io.sql.get_schema(df,table_name,con=engine,))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [76]:
# definition in postgresql dialect

table_name = 'yellow_tripdata_2019-01.csv'
# get the postgresql shcema
schema = pd.io.sql.get_schema(df,table_name,con=engine)

with engine.connect() as conn:
#     drop table if exists
    conn.execute(f'DROP TABLE IF EXISTS {table_name}')
#     create the table
    conn.execute(schema)

or use 

```sql
df.head(0).to_sql(name = table_name,\
                 con= engine,
                 if_exists='replace')
```

In [98]:
data_iter = pd.read_csv("yellow_tripdata_2019-01.csv", \
             parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],\
#              iterator=True,\
             chunksize=10000)
type(datda

In [110]:
# read the data in chunks
data_iter = pd.read_csv("yellow_tripdata_2019-01.csv", \
                         parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],\
                         chunksize=100000)

table_name = 'yellow_taxi_data'

# insert the in chunks
for data in data_iter:
    start_time = time()
    data.to_sql(name= table_name, con=engine, if_exists= 'append')
    print(f'Inserted {len(data)} chunk data ... took %.3f'%(time() - start_time))

Inserted 100000 chunk data ... took 30.089
Inserted 100000 chunk data ... took 25.849
Inserted 100000 chunk data ... took 30.159
Inserted 100000 chunk data ... took 24.380
Inserted 100000 chunk data ... took 29.061
Inserted 100000 chunk data ... took 27.801
Inserted 100000 chunk data ... took 37.121
Inserted 100000 chunk data ... took 34.737
Inserted 100000 chunk data ... took 25.866
Inserted 100000 chunk data ... took 32.040
Inserted 100000 chunk data ... took 35.872
Inserted 100000 chunk data ... took 25.430
Inserted 100000 chunk data ... took 23.279
Inserted 100000 chunk data ... took 27.261
Inserted 100000 chunk data ... took 40.142
Inserted 100000 chunk data ... took 27.178
Inserted 100000 chunk data ... took 25.193
Inserted 100000 chunk data ... took 28.932
Inserted 100000 chunk data ... took 36.720
Inserted 100000 chunk data ... took 30.067
Inserted 100000 chunk data ... took 28.263
Inserted 100000 chunk data ... took 32.118
Inserted 100000 chunk data ... took 31.204
Inserted 10