In [None]:
import pandas as pd

In [None]:
# Show which pandas version this notebook is running against
pd.__version__

In [None]:
# Read only the first 100 rows of the trip file as a small sample to inspect
df = pd.read_csv("yellow_tripdata_2021-01.csv",nrows=100)

In [None]:
# Preview the first rows of the sample frame
df.head()

In [None]:
## Generate the SQL CREATE TABLE statement pandas derives from the dataframe
## (the table is named yellow_taxi_data); nothing is executed, it is only printed
print(pd.io.sql.get_schema(df,name='yellow_taxi_data'))

### Convert the drop-off and pickup columns to datetime

In [None]:


# Parse both trip timestamp columns of the sample frame into datetimes
for ts_col in ("tpep_dropoff_datetime", "tpep_pickup_datetime"):
    df[ts_col] = pd.to_datetime(df[ts_col])

### generate an sql create query from a dataframe

In [None]:
# Print the CREATE TABLE statement pandas derives from `df` for a table named yellow_taxi_data
print(pd.io.sql.get_schema(df,name='yellow_taxi_data'))

### Working with sqlalchemy

In [None]:
from sqlalchemy import create_engine

In [None]:
import os

# Initializing our connection to the server.
# The URL can be overridden through the NY_TAXI_DB_URL environment variable so
# real credentials never have to live in the notebook; the default targets the
# local development database.
db_url = os.environ.get(
    "NY_TAXI_DB_URL", "postgresql://root:password@localhost:5431/ny_taxi"
)
engine = create_engine(db_url)

# `cursor` is a SQLAlchemy Connection (not a DBAPI cursor) used to run SQL
# statements. Exactly one connection is opened here; opening an additional one
# without keeping a reference only checks out a connection nobody closes.
cursor = engine.connect()

### Creating the table using default SQL create table command

In [None]:
from sqlalchemy import text

# Column definitions for the yellow taxi trips table.
# IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op instead of
# failing because the relation already exists.
create_table_sql = text("""CREATE TABLE IF NOT EXISTS yellow_taxi_data (
    "VendorID" BIGINT,
    tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE,
    tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE,
    passenger_count BIGINT,
    trip_distance FLOAT(53),
    "RatecodeID" BIGINT,
    store_and_fwd_flag TEXT,
    "PULocationID" BIGINT,
    "DOLocationID" BIGINT,
    payment_type BIGINT,
    fare_amount FLOAT(53),
    extra FLOAT(53),
    mta_tax FLOAT(53),
    tip_amount FLOAT(53),
    tolls_amount FLOAT(53),
    improvement_surcharge FLOAT(53),
    total_amount FLOAT(53),
    congestion_surcharge FLOAT(53)
)""")

# engine.begin() opens a connection, runs the block in a transaction, commits
# on success and always releases the connection. Wrapping the SQL in text() is
# needed because SQLAlchemy 2.x no longer accepts plain strings in execute().
with engine.begin() as conn:
    conn.execute(create_table_sql)

### Batch upload: splitting the data into chunks

In [None]:
from time import time
import pandas as pd

# Stream the full CSV in 100,000-row chunks instead of loading it in one go
dtf_iter = pd.read_csv("yellow_tripdata_2021-01.csv",iterator=True, chunksize =100000)

t_total_start = time()

# Iterating with `for` stops cleanly when the reader is exhausted. Calling
# next() outside of a try block inside `while True` meant StopIteration was
# never caught, so the run always ended in a traceback and the final message
# was never printed.
for dtf in dtf_iter:
    t_start = time()

    # to convert the data in the drop off and pickup columns to date time
    dtf.tpep_dropoff_datetime = pd.to_datetime(dtf.tpep_dropoff_datetime)
    dtf.tpep_pickup_datetime = pd.to_datetime(dtf.tpep_pickup_datetime)

    # index=False: the table created with the explicit CREATE TABLE statement
    # has no "index" column, so writing the dataframe index would fail.
    dtf.to_sql(name="yellow_taxi_data",con=engine,if_exists='append',index=False)

    diff = time() - t_start
    print(f'Another 100k rows of data uploaded in {diff:.3f} seconds')

# Report the duration of the whole upload rather than of the last chunk only
print(f'Done uploading in {time() - t_total_start:.3f} seconds')

In [None]:
## Generate a PostgreSQL CREATE TABLE statement from the dataframe;
## passing con=engine lets pandas render the statement for that connection
print(pd.io.sql.get_schema(df,name='yellow_taxi_data',con=engine))

In [None]:
# Convert the pickup and drop-off columns of the current chunk to datetime.
# The values must come from `dtf` itself: columns taken from the 100-row sample
# `df` are aligned on the index when assigned, which leaves NaT in every row of
# `dtf` that has no matching index in the sample.
dtf.tpep_dropoff_datetime = pd.to_datetime(dtf.tpep_dropoff_datetime)
dtf.tpep_pickup_datetime = pd.to_datetime(dtf.tpep_pickup_datetime)

# to split our full data into chunks
dtf_iter = pd.read_csv("yellow_tripdata_2021-01.csv",iterator=True, chunksize =100000)

In [None]:
# Pull the next chunk from the reader
dtf = next(dtf_iter)

# Parse the timestamp columns right away: a freshly read chunk still holds them
# as strings, and a table created from this frame would otherwise get text
# columns for the pickup and drop-off times.
dtf.tpep_dropoff_datetime = pd.to_datetime(dtf.tpep_dropoff_datetime)
dtf.tpep_pickup_datetime = pd.to_datetime(dtf.tpep_pickup_datetime)

In [None]:
# Preview the first rows of the current chunk
dtf.head()

In [None]:
# n=0 keeps no rows, so only the column headers are shown
dtf.head(n=0)

### Create the table from the column names (initialize an empty table directly with the pandas `to_sql` function)

In [None]:

# Write a zero-row frame: this (re)creates yellow_taxi_data with the chunk's
# columns but inserts no data; if_exists='replace' drops any existing table first
dtf.head(n=0).to_sql(name="yellow_taxi_data",con=engine,if_exists='replace')

In [None]:
# Now upload the current chunk into the Postgres table, appending to the
# existing rows; the %time magic reports how long the insert takes
%time dtf.to_sql(name="yellow_taxi_data",con=engine,if_exists='append')

### Upload loop compiled from the individual runs above

In [None]:
from time import time
import pandas as pd

# to split our full data into chunks
dtf_iter = pd.read_csv("yellow_tripdata_2021-01.csv",iterator=True, chunksize =100000)

# NOTE(review): the reader starts again at the top of the file, so a chunk that
# was already appended by an earlier cell is uploaded a second time here —
# confirm this is intended before running on a populated table.

# A `for` loop stops cleanly once the reader is exhausted; calling next()
# inside `while True` ended every run with an uncaught StopIteration.
for dtf in dtf_iter:
    t_start = time()

    # to convert the data in the drop off and pickup columns to date time
    dtf.tpep_dropoff_datetime = pd.to_datetime(dtf.tpep_dropoff_datetime)
    dtf.tpep_pickup_datetime = pd.to_datetime(dtf.tpep_pickup_datetime)

    dtf.to_sql(name="yellow_taxi_data",con=engine,if_exists='append')

    diff = time() - t_start
    print(f'Another chunk succesfully uploaded in {diff:.3f} seconds')

## SQL Refresher

`SELECT * FROM zones;` => _returns the values of the table_ `zones`
<br>`SELECT * FROM yellow_taxi_data t LIMIT 100;` => _returns the first 100 rows of the table_ `yellow_taxi_data`


### Joins between tables and columns

`Using WHERE`

SELECT tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,total_amount,
<br>CONCAT(zpu."Borough",' ',zpu."Zone") as pick_up_loc,
<br>CONCAT(zdo."Borough",' ',zdo."Zone") as drop_off_loc
<br>FROM 
<br>yellow_taxi_data t,
<br>zones zpu,
<br>zones zdo
<br>WHERE t."PULocationID" = zpu."LocationID" AND
<br>t."DOLocationID" = zdo."LocationID"

`Using JOIN`

SELECT tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,total_amount,
<br>CONCAT(zpu."Borough",' ',zpu."Zone") as pick_up_loc,
<br>CONCAT(zdo."Borough",' ',zdo."Zone") as drop_off_loc
<br>FROM 
<br>yellow_taxi_data t JOIN zones zpu
<br>ON t."PULocationID" = zpu."LocationID"
<br>JOIN zones zdo
<br>ON t."DOLocationID" = zdo."LocationID"

# Learning Python

In [None]:
import turtle

# Drawing window with a green background
wn = turtle.Screen()
wn.bgcolor("green")

# A red, turtle-shaped cursor named tess
tess = turtle.Turtle()
tess.color("red")
tess.shape("turtle")

# Lift the pen so moving tess draws no line; only the stamps remain
tess.penup()

size = 20
for i in range(30):
    tess.stamp()        # leave an impression on the canvas
    size += 3           # every step is 3 units longer than the previous one
    tess.forward(size)  # move tess along
    tess.right(24)      # turn right by 24 before the next stamp

# Hand control over to the turtle event loop
wn.mainloop()

In [None]:
month = ["January", "Feb", "Mar", "April", "May"]
for m in month:
    # Build the greeting for this month, then show it
    bday = f"Happy birthday to all {m} celebrants"
    print(bday)

In [None]:
def final_amt(p, r, n, t):
    """Return the final amount after applying compound interest to ``p``.

    Evaluates ``p * (1 + r/n) ** (n*t)``.

    Args:
        p: starting amount (principal).
        r: interest rate, expressed as a fraction (e.g. 0.1).
        n: number of compounding intervals per period; must not be 0.
        t: number of periods.

    Returns:
        The amount after compounding.
    """
    growth_per_interval = 1 + r / n
    interval_count = n * t
    return p * growth_per_interval ** interval_count

   # now that we have the function above, let us call it.
# Ask for the starting amount, then compound it with r=0.1, n=12, t=1
toInvest = float(input("How much do you want to invest?"))
fnl = final_amt(toInvest, 0.1, 12, 1)
print(f"At the end of the period you'll have {fnl}")


In [None]:
# Final amount for a starting amount of 100000 with r=0.1, n=12, t=1
final_amt(100000, 0.1, 12, 1)

In [None]:
def powers(num,pow):
    """Print a two-column table of exponents and the matching powers.

    For every ``x`` in ``range(num)`` one line is printed containing ``x`` and
    ``pow**x``, separated by a tab.

    Args:
        num: number of rows to print (exponents 0 .. num-1).
        pow: the base that is raised to each exponent. The name shadows the
            built-in ``pow`` inside this function; it is kept so existing
            keyword callers continue to work.

    Returns:
        None. The table is written to standard output.
    """
    for x in range(num):
        # "\t" is the tab escape; "/t" printed the two characters literally
        print(x, "\t", pow**x)

In [None]:
# Print the powers of 4 for exponents 0 through 9
powers(10,4)

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Never keep the database password in the notebook: read it from the
# PGPASSWORD environment variable, or prompt for it when the variable is unset.
# NOTE(review): any password that was ever committed in this cell should be rotated.
pg_password = os.environ.get("PGPASSWORD") or getpass("Postgres password: ")

# quote_plus() escapes characters such as '@' or '/' that would otherwise
# break the URL.
engine = create_engine(
    f"postgresql://postgres:{quote_plus(pg_password)}@localhost:5432/postgres"
)

# Connectivity check. The context manager hands the connection back to the
# pool; a bare engine.connect() as the last expression of a cell keeps the
# connection open for as long as the kernel remembers the cell output.
with engine.connect():
    pass

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Connection settings come from the environment so that no secret is stored in
# the notebook. User and host fall back to the values this notebook has always
# used; the password has no fallback and is prompted for when it is not set.
# NOTE(review): any password that was ever committed in this cell should be rotated.
mysql_user = os.environ.get("LARA_DB_USER", "techsupport-bruce")
mysql_host = os.environ.get("LARA_DB_HOST", "138.68.183.107")
mysql_password = os.environ.get("LARA_DB_PASSWORD") or getpass("MySQL password: ")

# quote_plus() escapes characters such as '$' or '@' so they survive URL parsing.
engine = create_engine(
    f"mysql://{mysql_user}:{quote_plus(mysql_password)}@{mysql_host}/lara_beta_stage"
)

# Connectivity check; the context manager returns the connection to the pool
# instead of leaving it open.
with engine.connect():
    pass

In [None]:
from sqlalchemy import text

# Bulk-load the routes CSV into lara_beta_stage.route, skipping the header row.
# '\\n' reaches MySQL as the escape sequence \n (newline). The previous '/n'
# is the two literal characters slash and n, which is not a line ending.
# NOTE(review): if the CSV was written on Windows the rows may end in \r\n — confirm.
# NOTE(review): LOAD DATA LOCAL needs local_infile enabled on both the client
# connection and the server — confirm the engine is configured for it.
load_routes_sql = text(
    """LOAD DATA LOCAL INFILE "C:/Users/efaso/Documents/beta_city_data/all_routes.csv"
    INTO TABLE lara_beta_stage.route
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\\n'
    IGNORE 1 LINES
    (shape_id, route_id, trip_id, route_name, origin_id, dest_id);"""
)

# engine.begin() commits when the block succeeds and rolls back on error, so
# the loaded rows are actually persisted. text() is required because
# SQLAlchemy 2.x no longer executes plain strings.
with engine.begin() as conn:
    conn.execute(load_routes_sql)
    

In [7]:
from pathlib import Path

# Create the output folder from Python instead of a shell command: this works
# the same on every operating system and does not fail when the folder already
# exists, so the cell can be re-run safely.
Path("parquet_data").mkdir(exist_ok=True)

In [11]:
import pandas as pd
import pyarrow.parquet as pq
# !curl -sSL 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2022-01.parquet'

# Input parquet file and the CSV file it is converted to.
# NOTE(review): these are absolute paths on one machine — adjust before running elsewhere.
src_file = "C:/Users/efaso/Documents/git_repos/data_engineering/1_Docker_Terraform_GCP_Intro/yellow_tripdata_2022-01.parquet"
output_csv = "C:/Users/efaso/Documents/git_repos/data_engineering/1_Docker_Terraform_GCP_Intro/yellow_tripdata_2022-01.csv"

# Load the whole parquet file into memory
df = pd.read_parquet(src_file)

# index=False: without it pandas writes the row index as an extra unnamed first
# column, which shows up as a spurious "Unnamed: 0" column when the CSV is read
# back.
df.to_csv(output_csv, index=False)