# Delete Data and Resources

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil
from glob import glob
from typing import Union

import boto3
import pandas as pd
import snowflake.connector
from dotenv import find_dotenv, load_dotenv

## About

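Delete every resource created by this project (the AWS QuickSight data source and the Snowflake stage, file format, tables and databases) and remove the local data files.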

## User Inputs

In [3]:
# Snowflake database that holds the station statistics table
# (the trips database name is read from the DB_NAME environment variable instead)
stations_db_name = "torbikestations"

# Snowflake tables dropped by this notebook
trips_table_name = "trips"
station_stats_table_name = "station_stats"

# Snowflake stage and file format dropped by this notebook
trips_stage_name = "bikes_stage"
trips_file_format_name = "COMMASEP_ONEHEADROW"

# "no" loads environment variables from a local .env file; any other value skips that step
ci_run = "no"

In [4]:
# Outside of CI the environment variables are loaded from a local .env file;
# otherwise they must already be present in the process environment.
if ci_run == "no":
    load_dotenv(find_dotenv())


trips_db_name = os.getenv("DB_NAME")
# This notebook interpolates the name into destructive SQL statements, so fail
# fast instead of carrying a None value forward.
if not trips_db_name:
    raise ValueError("Environment variable DB_NAME must be set to the trips database name")

# Settings shared by every Snowflake connection opened in this notebook
_snowflake_common = dict(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASS"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    role="sysadmin",
)
_snowflake_schema = os.getenv("SNOWFLAKE_DB_SCHEMA")

# Connection without a database selected
snowflake_dict_no_db = dict(_snowflake_common)
# Connection bound to the trips database
snowflake_dict = dict(
    _snowflake_common,
    database=trips_db_name,
    schema=_snowflake_schema,
)
# Connection bound to the station statistics database
snowflake_station_stats_dict = dict(
    _snowflake_common,
    database=stations_db_name,
    schema=_snowflake_schema,
)

In [5]:
# Look up the AWS account ID of the active credentials through STS
aws_region = os.getenv("AWS_REGION")
sts_client = boto3.client("sts", region_name=aws_region)
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity.get("Account")

In [6]:
def show_sql_df(
    query: str,
    cursor,
    table_output: bool = False,
) -> Union[None, pd.DataFrame]:
    """Execute a SQL statement and optionally display its result as a table.

    Parameters
    ----------
    query : str
        SQL statement passed to ``cursor.execute``.
    cursor
        Cursor object providing ``execute``, ``description`` and ``fetchall``.
    table_output : bool
        When True, fetch all rows and, if any were returned, display them as
        a ``DataFrame`` with lower-cased column names.

    Returns
    -------
    Union[None, pd.DataFrame]
        The displayed ``DataFrame`` when ``table_output`` is True and the
        statement returned at least one row, otherwise ``None``.
    """
    cursor.execute(query)
    if not table_output:
        return None

    # A cursor exposes description=None when the statement produced no result
    # set; there is nothing to tabulate in that case.
    if cursor.description is None:
        return None

    colnames = [cdesc[0].lower() for cdesc in cursor.description]
    cur_fetched = cursor.fetchall()
    if not cur_fetched:
        return None

    df_query_output = pd.DataFrame.from_records(cur_fetched, columns=colnames)
    display(df_query_output)
    return df_query_output

In [7]:
# Connect with the settings in snowflake_dict (which select the trips database)
# and create the cursor used by the following cells
conn = snowflake.connector.connect(**snowflake_dict)
cur = conn.cursor()

## Delete AWS QuickSight Data Source

### Create AWS Python SDK Objects for Deleting QuickSight Resources

In [8]:
# QuickSight client pinned to us-east-1
# NOTE(review): qs_client_user is not referenced anywhere else in this notebook - confirm it is still needed
qs_client_user = boto3.client("quicksight", region_name="us-east-1")
# QuickSight client for the region given by the AWS_REGION environment variable
qs_client = boto3.client("quicksight", region_name=aws_region)

### Delete Data Source

In [9]:
# Collect every QuickSight data source whose name equals the trips database
# name. list_data_sources returns one page per call, so follow NextToken until
# all pages are inspected; otherwise a match beyond the first page would be
# left undeleted.
ds_list = []
list_kwargs = dict(AwsAccountId=account_id)
while True:
    ds_page = qs_client.list_data_sources(**list_kwargs)
    ds_list.extend(
        ds for ds in ds_page.get("DataSources", []) if ds.get("Name") == trips_db_name
    )
    next_token = ds_page.get("NextToken")
    if not next_token:
        break
    list_kwargs["NextToken"] = next_token
ds_list

[]

In [10]:
# Delete each matching QuickSight data source by its ID
for data_source in ds_list:
    data_source_id = data_source["DataSourceId"]
    qs_client.delete_data_source(
        AwsAccountId=account_id,
        DataSourceId=data_source_id,
    )

## Delete Snowflake Resources

### Trips Database Internal Data Stage

In [11]:
# Drop the stage named by trips_stage_name, if it exists
query = f"""
        DROP STAGE IF EXISTS {trips_stage_name}
        """
_ = cur.execute(query)

In [12]:
%%time
# List stages matching the stage name; show_sql_df only displays a table when
# rows are returned, so no table output means no matching stage was found
query = f"""
        SHOW STAGES LIKE '{trips_stage_name}'
        """
_ = show_sql_df(query, cur, True)

CPU times: user 12.1 ms, sys: 0 ns, total: 12.1 ms
Wall time: 98.6 ms


### Trips Database CSV File Format

In [13]:
# Drop the file format named by trips_file_format_name, if it exists
query = f"""
        DROP FILE FORMAT IF EXISTS {trips_file_format_name}
        """
_ = cur.execute(query)

In [14]:
%%time
# List file formats matching the file format name; no table output means none was found
query = f"""
        SHOW FILE FORMATS LIKE '{trips_file_format_name}'
        """
_ = show_sql_df(query, cur, True)

CPU times: user 3.92 ms, sys: 0 ns, total: 3.92 ms
Wall time: 77.8 ms


### Trips Database Table

In [15]:
# Drop the table named by trips_table_name, if it exists
query = f"""
        DROP TABLE IF EXISTS {trips_table_name}
        """
_ = cur.execute(query)

In [16]:
%%time
# List tables matching the trips table name; no table output means none was found
query = f"""
        SHOW TABLES LIKE '{trips_table_name}'
        """
_ = show_sql_df(query, cur, True)

CPU times: user 3.62 ms, sys: 0 ns, total: 3.62 ms
Wall time: 100 ms


In [17]:
# Close the cursor and connection that were opened with snowflake_dict
cur.close()
conn.close()

### Station Statistics Database Table

In [18]:
# Reconnect with the settings in snowflake_station_stats_dict (which select the
# station statistics database) and create a new cursor
conn = snowflake.connector.connect(**snowflake_station_stats_dict)
cur = conn.cursor()

In [19]:
# Drop the table named by station_stats_table_name, if it exists
query = f"""
        DROP TABLE IF EXISTS {station_stats_table_name}
        """
_ = cur.execute(query)

In [20]:
%%time
# List tables matching the station statistics table name; no table output means none was found
query = f"""
        SHOW TABLES LIKE '{station_stats_table_name}'
        """
_ = show_sql_df(query, cur, True)

CPU times: user 3.48 ms, sys: 0 ns, total: 3.48 ms
Wall time: 88 ms


In [21]:
# Close the cursor and connection that were opened with snowflake_station_stats_dict
cur.close()
conn.close()

### Databases

In [22]:
# Reconnect with snowflake_dict_no_db, which specifies no database or schema,
# and create a new cursor
conn = snowflake.connector.connect(**snowflake_dict_no_db)
cur = conn.cursor()

In [23]:
# Drop both project databases. IF EXISTS keeps this cell re-runnable: without
# it, a second run fails once the databases are already gone, unlike the other
# DROP statements in this notebook.
for database_name in [trips_db_name, stations_db_name]:
    query = f"""
            DROP DATABASE IF EXISTS {database_name}
            """
    _ = cur.execute(query)

In [24]:
%%time
# Run SHOW DATABASES and keep the returned table in df_databases
# (show_sql_df returns None if no rows come back)
query = """
        SHOW DATABASES
        """
df_databases = show_sql_df(query, cur, True)

Unnamed: 0,created_on,name,is_default,is_current,origin,owner,comment,options,retention_time
0,2022-01-27 16:12:39.701000-08:00,DEMO_DB,N,N,,SYSADMIN,,,1
1,2022-01-27 10:58:19.534000-08:00,SNOWFLAKE_SAMPLE_DATA,N,N,SFC_SAMPLES.SAMPLE_DATA,ACCOUNTADMIN,Provided by Snowflake during account provisioning,,1
2,2022-01-27 16:12:52.421000-08:00,UTIL_DB,N,N,,SYSADMIN,,,1


CPU times: user 10.4 ms, sys: 857 µs, total: 11.2 ms
Wall time: 102 ms


In [25]:
database_list = [trips_db_name, stations_db_name]
# Compare names case-insensitively: the SHOW DATABASES output lists names in
# upper case while the names configured in this notebook may be lower case, so
# an exact match could never detect a database that was left behind.
dropped_names_upper = {str(db_name).upper() for db_name in database_list}
if df_databases is None:
    # show_sql_df returns None when no rows came back, i.e. nothing remains
    remaining_databases = []
else:
    still_present = df_databases["name"].astype(str).str.upper().isin(dropped_names_upper)
    remaining_databases = df_databases.loc[still_present, "name"].tolist()
assert not remaining_databases, f"Databases were not dropped: {remaining_databases}"

In [26]:
# Close the cursor and connection that were opened with snowflake_dict_no_db
cur.close()
conn.close()

## Delete Local Data Files

### Raw Bikeshare Trips Data Files

In [27]:
# Remove every CSV file located directly inside data/raw
raw_csv_paths = glob("data/raw/*.csv")
for csv_path in raw_csv_paths:
    os.remove(csv_path)

### Raw Files for Supplementary Datasets

In [28]:
# Remove every sub-directory of data/raw together with its contents;
# plain files matched by the pattern are left in place
raw_data_dirs = glob("data/raw/*")
for raw_subdir in filter(os.path.isdir, raw_data_dirs):
    shutil.rmtree(raw_subdir)

### Processed Data Files

In [30]:
# Remove every gzip-compressed CSV file located directly inside data/processed
processed_paths = glob("data/processed/*.csv.gz")
for processed_path in processed_paths:
    os.remove(processed_path)