# Create Ingest Logic

### Incremental and Bulk Extract, Load and Transform
We expect new data to arrive every month, and we will load each month's files incrementally.

In [None]:
from dags.snowpark_connection import snowpark_connect
# snowpark_connect() returns two values: `session`, which every Snowflake call in this
# notebook goes through, and `state_dict`, the dict the later cells index for names
# and parameters.
session, state_dict = snowpark_connect()

In [None]:
from dags import elt as ELT

import snowflake.snowpark as snp
from datetime import datetime
import uuid 

# Record the time this run began and print it as HH:MM:SS.
start = datetime.now()
print("Start Time =", start.strftime("%H:%M:%S"))

# Create the load stage named in state_dict; IF NOT EXISTS makes re-running this cell harmless.
load_stage_name=state_dict['load_stage_name']
session.sql('CREATE STAGE IF NOT EXISTS '+load_stage_name).collect()

In [None]:
# Pull the download URL and the two table names out of state_dict.
download_base_url = state_dict['download_base_url']
load_table_name = state_dict['load_table_name']
trips_table_name = state_dict['trips_table_name']

# Sample archive names, listed oldest first.
file_name_end1 = '201402-citibike-tripdata.zip'
file_name_end3 = '202003-citibike-tripdata.csv.zip'
file_name_end2 = '202102-citibike-tripdata.csv.zip'

# Only the 2014 and 2020 archives are downloaded here; file_name_end2 is defined
# but not included in this list.
files_to_download = [file_name_end1, file_name_end3]

In [None]:
%%time
# Extract step: pass the chosen archive names to ELT.extract_trips_to_stage together
# with the download URL and stage name from state_dict. The call returns two values,
# kept as load_stage_names and files_to_load for the load step.
load_stage_names, files_to_load = ELT.extract_trips_to_stage(session=session, 
                                                            files_to_download=files_to_download, 
                                                            download_base_url=state_dict['download_base_url'], 
                                                            load_stage_name=state_dict['load_stage_name'])

In [None]:
%%time
# Load step: feed the two results of the extract step, plus the raw table name from
# state_dict, to ELT.load_trips_to_raw. Its return value is kept as stage_table_names
# for the transform step.
stage_table_names = ELT.load_trips_to_raw(session=session, 
                                          files_to_load=files_to_load, 
                                          load_stage_names=load_stage_names, 
                                          load_table_name=state_dict['load_table_name'])

In [None]:
%%time
# Transform step: ELT.transform_trips receives the stage tables produced by the load
# step and the trips table name from state_dict. Note that the notebook-level
# trips_table_name is rebound to whatever the call returns.
trips_table_name = ELT.transform_trips(session=session, 
                                       stage_table_names=stage_table_names, 
                                       trips_table_name=state_dict['trips_table_name'])

In [None]:
from datetime import datetime

# Worked example of the schema split: file names whose leading YYYYMM falls before
# 202102 go to 'schema1', all others to 'schema2'. Each name has '.zip' replaced
# with '.gz' on the way in.
files_to_ingest = ['202004-citibike-tripdata.csv.zip', '202102-citibike-tripdata.csv.zip']
schema2_start_date = datetime.strptime('202102', "%Y%m")

schema1_download_files = []
schema2_download_files = []

for file_name in files_to_ingest:
    # The month is the text before the first '-' in the file name.
    file_start_date = datetime.strptime(file_name.split("-")[0], "%Y%m")
    target = (schema1_download_files
              if file_start_date < schema2_start_date
              else schema2_download_files)
    target.append(file_name.replace('.zip', '.gz'))

files_to_load = dict(schema1=schema1_download_files, schema2=schema2_download_files)
# Bare expression so the notebook displays the resulting mapping.
files_to_load

In [None]:
%%writefile dags/ingest.py
def incremental_elt(session, 
                    state_dict:dict, 
                    files_to_ingest:list, 
                    download_role_ARN='',
                    download_base_url='') -> str:
    """Extract, load and transform the given trip files.

    Args:
        session: Object exposing ``sql(...).collect()``; also passed through to
            every ``dags.elt`` call.
        state_dict: Must contain 'load_stage_name', 'load_table_name' and
            'trips_table_name'; 'download_base_url' is also read when the
            provided-bucket arguments are not both given.
        files_to_ingest: File names beginning with ``YYYYMM-``.
        download_role_ARN: AWS role for the temporary stage. Only used when
            ``download_base_url`` is also non-empty.
        download_base_url: URL for the temporary stage. Only used when
            ``download_role_ARN`` is also non-empty.

    Returns:
        Whatever ``ELT.transform_trips`` returns.
    """
    # Imports stay function-local, as in the rest of this module.
    import dags.elt as ELT
    from datetime import datetime

    # Look up all three names first; a missing key raises KeyError before
    # anything is executed.
    stage_name = state_dict['load_stage_name']
    raw_table = state_dict['load_table_name']
    trips_table = state_dict['trips_table_name']

    if not (download_role_ARN and download_base_url):
        # Without both bucket arguments, extract from the URL held in state_dict.
        print("Extracting files from public location.")
        public_url = state_dict['download_base_url']
        load_stage_names, files_to_load = ELT.extract_trips_to_stage(
            session=session,
            files_to_download=files_to_ingest,
            download_base_url=public_url,
            load_stage_name=stage_name,
        )
    else:
        print("Skipping extract.  Using provided bucket.")
        create_stage_sql = "".join([
            'CREATE OR REPLACE TEMPORARY STAGE TEMP_LOAD_STAGE',
            ' url=', str(download_base_url),
            " credentials=(aws_role='", str(download_role_ARN), "')",
        ])
        session.sql(create_stage_sql).collect()

        # Files dated before 202102 belong to 'schema1', the rest to 'schema2'.
        # Each name has '.zip' replaced with '.gz'.
        schema2_start = datetime.strptime('202102', "%Y%m")
        files_to_load = {'schema1': [], 'schema2': []}
        for name in files_to_ingest:
            month = datetime.strptime(name.split("-")[0], "%Y%m")
            schema_key = 'schema1' if month < schema2_start else 'schema2'
            files_to_load[schema_key].append(name.replace('.zip', '.gz'))

        # Both schemas are read from the same temporary stage.
        load_stage_names = dict.fromkeys(files_to_load, 'TEMP_LOAD_STAGE')

    print("Loading files to raw.")
    stage_table_names = ELT.load_trips_to_raw(
        session=session,
        files_to_load=files_to_load,
        load_stage_names=load_stage_names,
        load_table_name=raw_table,
    )

    print("Transforming records to trips table.")
    return ELT.transform_trips(
        session=session,
        stage_table_names=stage_table_names,
        trips_table_name=trips_table,
    )

def bulk_elt(session, 
             state_dict:dict,
             download_role_ARN='', 
             download_base_url=''
            ) -> str:
    """Run ``incremental_elt`` over every monthly file from 201306 to 202002.

    Args:
        session: Passed through to ``incremental_elt``.
        state_dict: Passed through to ``incremental_elt``.
        download_role_ARN: Forwarded only when ``download_base_url`` is also
            non-empty.
        download_base_url: Forwarded only when ``download_role_ARN`` is also
            non-empty.

    Returns:
        Whatever ``incremental_elt`` returns.
    """
    # Imports stay function-local, as in the rest of this module.
    import dags.elt as ELT
    from dags.ingest import incremental_elt

    import pandas as pd
    from datetime import datetime

    def monthly_file_names(first_month, last_month, suffix):
        """Return '<YYYYMM><suffix>' for each month from first_month to last_month."""
        months = pd.period_range(start=datetime.strptime(first_month, "%Y%m"),
                                 end=datetime.strptime(last_month, "%Y%m"),
                                 freq='M').strftime("%Y%m")
        return [month + suffix for month in months.to_list()]

    # Files up to 201612 are named like 201306-citibike-tripdata.zip;
    # from 201701 on they are named like 201701-citibike-tripdata.csv.zip.
    files_to_extract = (
        monthly_file_names("201306", "201612", '-citibike-tripdata.zip')
        + monthly_file_names("201701", "202002", '-citibike-tripdata.csv.zip')
    )

    # Forward the bucket arguments only when both are set; otherwise
    # incremental_elt is called with its own defaults.
    bucket_args = {}
    if download_role_ARN and download_base_url:
        bucket_args = {'download_role_ARN': download_role_ARN,
                       'download_base_url': download_base_url}

    return incremental_elt(session=session,
                           state_dict=state_dict,
                           files_to_ingest=files_to_extract,
                           **bucket_args)


In [None]:
from dags.snowpark_connection import snowpark_connect
# Connect again, this time passing './include/state.json' explicitly
# (presumably the path of the state file - confirm against snowpark_connect).
session, state_dict = snowpark_connect('./include/state.json')
# Switch the session to the warehouse stored under compute_parameters -> fe_warehouse.
session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

In [None]:
%%time
from dags.ingest import incremental_elt, bulk_elt
# Single-file incremental run. Neither download_role_ARN nor download_base_url is
# passed, so incremental_elt takes its "Extracting files from public location." branch.
incremental_elt(session=session, 
                state_dict=state_dict, 
                files_to_ingest=['202003-citibike-tripdata.csv.zip']
               )

In [None]:
%%time
from dags.ingest import incremental_elt, bulk_elt
# Two-file incremental run with session and state_dict passed positionally; again
# no bucket arguments, so the files come from the URL in state_dict.
incremental_elt(session, state_dict, files_to_ingest=['202004-citibike-tripdata.csv.zip', 
                                                      '202102-citibike-tripdata.csv.zip'])

In [None]:
%%time
from dags.ingest import incremental_elt, bulk_elt
# Incremental run against a provided bucket: both download_role_ARN and
# download_base_url are set, so incremental_elt creates TEMP_LOAD_STAGE over that
# URL and skips the extract step.
incremental_elt(session=session, 
                state_dict=state_dict, 
                files_to_ingest=['202001-citibike-tripdata.csv.zip', '202005-citibike-tripdata.csv.zip'],
                download_role_ARN='arn:aws:iam::484577546576:role/citibike-demo-ml-s3-role',
                download_base_url='s3://citibike-demo-ml/data/')

In [None]:
# %%time
# from dags.ingest import incremental_elt, bulk_elt
# _ = session.sql('CREATE OR REPLACE DATABASE '+state_dict['connection_parameters']['database']).collect()
# _ = session.sql('CREATE SCHEMA '+state_dict['connection_parameters']['schema']).collect() 

# session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# bulk_elt(session=session, state_dict=state_dict)

In [None]:
%%time
from dags.ingest import incremental_elt, bulk_elt
from dags.elt import schema1_definition, schema2_definition

# Start from scratch: replace the database named in the connection parameters,
# then create the schema.
connection_parameters = state_dict['connection_parameters']
_ = session.sql('CREATE OR REPLACE DATABASE '+connection_parameters['database']).collect()
_ = session.sql('CREATE SCHEMA '+connection_parameters['schema']).collect()

# Create one raw load table per schema definition, named <load_table_name><suffix>.
# Each table is written from a single all-None row passed through na.drop()
# (presumably leaving no rows, so only the table structure is created - confirm).
for schema_suffix, schema_definition in (('schema1', schema1_definition),
                                         ('schema2', schema2_definition)):
    load_schema = schema_definition()
    empty_row = [None]*len(load_schema.names)
    session.createDataFrame([empty_row], schema=load_schema)\
           .na.drop()\
           .write\
           .saveAsTable(state_dict['load_table_name']+schema_suffix)

session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# Bulk load from the provided bucket; both bucket arguments are set, so they are
# forwarded to incremental_elt.
bulk_elt(session=session,
         state_dict=state_dict,
         download_role_ARN='arn:aws:iam::484577546576:role/citibike-demo-ml-s3-role',
         download_base_url='s3://citibike-demo-ml/data/')

In [None]:
# Close the session opened earlier in the notebook.
session.close()