# Bulk Load Data

This notebook is for INTERNAL SNOWFLAKE SE use only: it bulk loads data for a demo.  It uses data that has already been downloaded, converted to gzip, and saved in an S3 bucket in the Snowflake SE sandbox account.  
  
Another version of this file exists in the repo for customers/partners; that version does not rely on Snowflake accounts for the ELT bulk load.

DO NOT SHARE THIS VERSION EXTERNALLY.  

In [None]:
from citibike_ml import elt as cbelt

import snowflake.snowpark as snp

import pandas as pd
from datetime import datetime
import os
import uuid 
import json
import getpass

# Read the connection settings from the local creds.json file.
with open('creds.json') as f:
    data = json.load(f)

# Note the key rename: the 'username' entry of creds.json is passed as 'user'.
# The password could instead be prompted for with getpass.getpass().
connection_parameters = dict(
    account=data['account'],
    user=data['username'],
    password=data['password'],
    role=data['role'],
    warehouse=data['warehouse'],
)

# Open the Snowpark session used by every following cell.
session = snp.Session.builder.configs(connection_parameters).create()

In [None]:
# Remember when the load began so the total run time can be reported later.
start = datetime.now()
print("Start Time =", f"{start:%H:%M:%S}")

In [None]:
# Names of the demo database and schema, and the fully-qualified names
# derived from them.
project_db_name = 'CITIBIKEML'
project_schema_name = 'DEMO'
project_db_schema = f'{project_db_name}.{project_schema_name}'

# Prefix for the raw load tables and the name of the final trips table.
load_table_name = f'{project_db_schema}.RAW_'
trips_table_name = f'{project_db_schema}.TRIPS'

# Start from a fresh database (CREATE OR REPLACE discards any existing one)
# and make it the session's current database.
_ = session.sql(f'CREATE OR REPLACE DATABASE {project_db_name}').collect()
_ = session.sql(f'USE DATABASE {project_db_name}').collect()

# Create the schema inside it and make it the current schema.
_ = session.sql(f'CREATE SCHEMA {project_db_schema}').collect()
_ = session.sql(f'USE SCHEMA {project_db_schema}').collect()

In [None]:
# AWS role and S3 location used to define the external stage.
aws_role_arn='arn:aws:iam::484577546576:role/citibike-demo-ml-s3-role'
internal_url='s3://citibike-demo-ml/data/'

import uuid
# A uuid1 suffix keeps the stage name unique per run; dashes are replaced
# with underscores so the name can be used as an unquoted identifier.
stage_id = str(uuid.uuid1()).replace('-', '_')
stage_name = f'load_stage_{stage_id}'

# The URL must be passed as a quoted string literal: left unquoted, the
# 's3://...' text is not a valid literal (and '//' starts a SQL comment in
# Snowflake), so the statement would not parse as intended.
sql_cmd = (
    f"CREATE OR REPLACE TEMPORARY STAGE {stage_name}"
    f" url='{internal_url}'"
    f" credentials=(aws_role='{aws_role_arn}')"
)
session.sql(sql_cmd).collect()

In [None]:
# Months 201306 through 201612 are published as e.g. 201306-citibike-tripdata.zip
file_name_end1 = '-citibike-tripdata.zip'
date_range1 = pd.period_range(
    start=datetime(2013, 6, 1),
    end=datetime(2016, 12, 1),
    freq='M',
).strftime("%Y%m")
files_to_extract = [f"{month}{file_name_end1}" for month in date_range1.to_list()]

# Months 201701 through 202002 are published as e.g. 201701-citibike-tripdata.csv.zip
file_name_end2 = '-citibike-tripdata.csv.zip'
date_range2 = pd.period_range(
    start=datetime(2017, 1, 1),
    end=datetime(2020, 2, 1),
    freq='M',
).strftime("%Y%m")

# Append the second naming scheme after the first, keeping chronological order.
files_to_extract.extend(f"{month}{file_name_end2}" for month in date_range2.to_list())

Normally we would start by extracting the files to a stage.  For the internal demo, these files have already been prepared and are ready to load.

In [None]:
# The extract step is skipped here; when it is run, the equivalent call is:
#stage_name, files_to_load = cbelt.extract_trips_to_stage(session, files_to_extract, download_base_url, stage_name)
# Instead, derive the names to load by swapping '.zip' for '.gz' in each
# source archive name.
files_to_load = [
    archive_name.replace('.zip', '.gz')
    for archive_name in files_to_extract
]

In [None]:
# Hand the session, the file names to load, the stage name, and the raw-table
# name prefix to the ELT helper.
# NOTE(review): load_trips_to_raw is defined in citibike_ml.elt, not here;
# presumably it returns the names of the raw tables it populated - confirm.
stage_table_names = cbelt.load_trips_to_raw(session, files_to_load, stage_name, load_table_name)

In [None]:
# Pass the loaded raw table names and the target trips table name to the ELT
# helper; its return value rebinds trips_table_name.
# NOTE(review): transform_trips is defined in citibike_ml.elt, not here;
# presumably it returns the name of the table it wrote - confirm.
trips_table_name = cbelt.transform_trips(session, stage_table_names, trips_table_name)

In [None]:
# Sanity-check the result: open the trips table and inspect its schema and
# row count.
testdf = session.table(trips_table_name)
# A bare `testdf.schema` in the middle of a cell is evaluated and discarded
# (a notebook only echoes the last expression), so print it explicitly.
print(testdf.schema)
# Left as the final expression so the notebook displays the row count.
testdf.count()

In [None]:
# Close the Snowpark session now that loading and checking are finished.
session.close()

In [None]:
# Report the finish time in the same HH:MM:SS form as the start time.
end = datetime.now()
print("End Time =", f"{end:%H:%M:%S}")

# Elapsed wall-clock time since `start` was recorded, reported in minutes.
run_time = end - start
print("Total Run Time (min) =", run_time.total_seconds() / 60)