In [3]:
cd ..

/Users/jisooryu/Projects/lease-version-reliability


### Upload processed data (df, df_all) to S3 (train)

In [2]:
import pandas as pd

In [3]:
# Restore df and df_all from IPython's %store storage

%store -r df 
%store -r df_all

no stored variable or alias df_all


In [4]:
import os
import shutil
import typing

import boto3
import botocore
import structlog

from train.config.settings import settings

logger = structlog.get_logger()


def get_web_identity_token() -> str:
    """
    Read the web identity token from the file configured in settings.

    Returns:
        The contents of ``settings.AWS_WEB_IDENTITY_TOKEN_FILE`` with leading
        and trailing whitespace stripped.
    """
    with open(settings.AWS_WEB_IDENTITY_TOKEN_FILE) as token_file:  # type: ignore
        return token_file.read().strip()


def get_aws_cred() -> typing.Any:
    """
    Obtain temporary AWS credentials through STS.

    The web identity token is exchanged for the role named by
    ``settings.AWS_ROLE_ARN``.

    Returns:
        The ``Credentials`` entry of the STS response.
    """
    # Read the token before creating the client so a missing or unreadable
    # token file is reported first.
    web_token = get_web_identity_token()
    sts = boto3.client("sts")

    request = {
        "RoleArn": settings.AWS_ROLE_ARN,
        "RoleSessionName": "SalesLinkageSession",
        "WebIdentityToken": web_token,
    }
    response = sts.assume_role_with_web_identity(**request)

    return response["Credentials"]


def get_s3_resource() -> typing.Any:
    """
    Build a boto3 S3 resource.

    When ``settings.AWS_WEB_IDENTITY_TOKEN_FILE`` is unset or empty, the
    resource is created without explicit credentials. Otherwise temporary
    credentials are fetched with ``get_aws_cred`` and passed in explicitly.

    Returns:
        The boto3 S3 resource object.
    """
    if not settings.AWS_WEB_IDENTITY_TOKEN_FILE:
        # No token file configured: let boto3 resolve credentials itself.
        return boto3.resource("s3")

    temp_cred = get_aws_cred()
    return boto3.resource(
        "s3",
        aws_access_key_id=temp_cred["AccessKeyId"],
        aws_secret_access_key=temp_cred["SecretAccessKey"],
        aws_session_token=temp_cred["SessionToken"],
    )

In [5]:
def upload_dataset(directory: str) -> None:
    """
    Archive the local data directory and upload the archive to S3.

    The contents of ``settings.DATA_DIR`` are packed into a gzipped tarball
    written to ``<DATA_DIR>/<directory>/dataset.tar.gz``. That file is then
    uploaded to ``settings.MODELS_S3_BUCKET`` under the key
    ``<PROJECT_NAME>/<DATA_DIR>/<directory>/<ENV>.dataset.tar.gz``.

    The local tarball is removed afterwards, whether or not the upload
    succeeded. The ``directory`` folder and the data files inside it are
    left in place.

    Args:
        directory: Sub-folder of ``settings.DATA_DIR`` that receives the
            temporary archive and forms part of the S3 key
            (e.g. ``"processed"``).

    Raises:
        botocore.exceptions.ClientError: For any client error whose code is
            not ``"404"``; a ``"404"`` is logged and swallowed.
    """
    s3 = get_s3_resource()
    object_name = f"{settings.PROJECT_NAME}/{settings.DATA_DIR}/{directory}/{settings.ENV}.dataset.tar.gz"
    file_name = f"{settings.DATA_DIR}/{directory}/dataset.tar.gz"

    try:
        # make_archive appends ".tar.gz" to the base name, yielding file_name.
        shutil.make_archive(
            f"{settings.DATA_DIR}/{directory}" + "/dataset",
            "gztar",
            settings.DATA_DIR,
        )

        # The key is already fully rendered by the f-string above, so it is
        # passed as-is (a further str.format call would only risk failing on
        # literal braces in the key).
        s3.Bucket(settings.MODELS_S3_BUCKET).upload_file(
            file_name,
            object_name,
        )
        logger.debug("Successfully uploaded dataset")
    except botocore.exceptions.ClientError as e:
        # NOTE(review): a "404" code is reported as "permission denied" here;
        # S3 usually signals denied access with 403 - confirm which code was
        # intended.
        if e.response["Error"]["Code"] == "404":
            logger.error("Permission denied when trying to upload file.")
        else:
            raise
    finally:
        # Always drop the temporary tarball so a failed upload does not leave
        # a stale archive inside the data directory.
        if os.path.exists(file_name):
            os.remove(file_name)

In [60]:
import pickle 

# Pickle df and df_all into <DATA_DIR>/processed, one file per frame.
# df is written first, then df_all.
with open(f"{settings.DATA_DIR}/processed/reliable_data", "wb") as handle:
    pickle.dump(df, handle, pickle.HIGHEST_PROTOCOL)

with open(f"{settings.DATA_DIR}/processed/all_data", "wb") as handle:
    pickle.dump(df_all, handle, pickle.HIGHEST_PROTOCOL)

KeyboardInterrupt: 

In [6]:
# Archive settings.DATA_DIR and upload it to S3 under the "processed" key prefix.
upload_dataset("processed") 

KeyboardInterrupt: 

### Download processed data from S3 (batch)

In [4]:
from batch.config.settings import settings

In [32]:
def download_dataset(directory: str) -> None:
    """
    Download the dataset archive from S3 and unpack it locally.

    The object
    ``<PROJECT_NAME>/<DATA_DIR>/<directory>/<ENV>.dataset.tar.gz`` is fetched
    from ``settings.MODELS_S3_BUCKET`` into
    ``<DATA_DIR>/<directory>/dataset.tar.gz`` and unpacked into ``directory``.

    The downloaded archive is removed afterwards, whether or not unpacking
    succeeded.

    Args:
        directory: Sub-folder name used both in the S3 key and as the local
            extraction target (e.g. ``"processed"``).

    Raises:
        botocore.exceptions.ClientError: For any client error whose code is
            not ``"404"``; a ``"404"`` (missing object) is logged and
            swallowed.
    """
    s3 = get_s3_resource()
    object_name = (
        f"{settings.PROJECT_NAME}/{settings.DATA_DIR}/{directory}/{settings.ENV}.dataset.tar.gz"
    )
    file_name = f"{settings.DATA_DIR}/{directory}/dataset.tar.gz"

    # Make sure the destination folder exists before downloading into it, so
    # the function also works in a fresh environment.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    try:
        s3.Bucket(settings.MODELS_S3_BUCKET).download_file(
            object_name,
            file_name,
        )
        # NOTE(review): the archive is unpacked into `directory` relative to
        # the current working directory, while upload_dataset archives
        # settings.DATA_DIR - confirm this is the intended target.
        shutil.unpack_archive(file_name, directory)
        logger.debug("Successfully downloaded dataset")
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            logger.error("The object does not exist.")
        else:
            raise
    finally:
        # Always drop the downloaded tarball, even when unpacking fails.
        if os.path.exists(file_name):
            os.remove(file_name)

In [33]:
# Fetch the archive stored under the "processed" key prefix from S3 and unpack it.
download_dataset("processed")

2022-12-14 10:33.07 [debug    ] Successfully downloaded dataset
