In [1]:
import filecmp
import os
import pathlib

import boto3
import pandas as pd
from dotenv import load_dotenv

from src.data.s3_communication import S3Communication

In [2]:
# Load S3 credentials from a `credentials.env` file, if one is present.
# The directory is taken from CREDENTIAL_DOTENV_DIR, falling back to PWD and
# finally to the default app root.
dotenv_dir = os.getenv(
    "CREDENTIAL_DOTENV_DIR",
    os.getenv("PWD", "/opt/app-root/src"),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
if os.path.exists(dotenv_path):
    # override=True is passed so the file's values are applied even for
    # variables that are already set in the environment.
    load_dotenv(dotenv_path=dotenv_path, override=True)

## Upload Files (from disk to s3)

### Option 1: Manually using boto3

In [3]:
# Plain boto3 client; endpoint and credentials are read from the environment.
s3client = boto3.client(
    service_name="s3",
    endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("S3_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("S3_SECRET_KEY"),
)

# Stream one local annotation workbook into the bucket under the demo prefix.
local_xlsx = "../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx"
with open(local_xlsx, "rb") as f:
    s3client.upload_fileobj(
        Fileobj=f,
        Bucket=os.getenv("S3_BUCKET"),
        Key="kachau/demo/20201030 1Qbit aggregated_annotations_needs_correction.xlsx",
    )

### Option 2: Using our helper class

In [4]:
# Instantiate the project's S3Communication helper; the bucket, endpoint and
# credentials are all read from environment variables.
s3comm = S3Communication(
    s3_bucket=os.getenv("S3_BUCKET"),
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("S3_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("S3_SECRET_KEY"),
)

In [5]:
# Local files to upload. Every path has the form ../../data/<category>/...,
# so <category> is the fourth "/"-separated component of the path.
filepaths = [
    f"../../data/{relative_path}"
    for relative_path in (
        "annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx",
        "curation/esg_TEXT_dataset.csv",
        "extraction/sustainability-report-2019.json",
        "kpi_mapping/ESG/kpi_mapping.csv",
        "pdfs/ESG/sustainability-report-2019.pdf",
    )
]

In [None]:
# Upload each local file to corpdata/ESG/<category>/<filename> and print the
# HTTP status code found in the helper's response.
for fpath in filepaths:
    # Component 3 of "../../data/<category>/..." is the category directory;
    # the text after the last "/" is the bare file name.
    dirname = fpath.split("/")[3]
    filename = fpath.rsplit("/", maxsplit=1)[-1]
    print(f"uploading {fpath} to corpdata/ESG/{dirname}/{filename}...", end="")
    response = s3comm.upload_file_to_s3(fpath, f"corpdata/ESG/{dirname}", filename)
    status_code = response["ResponseMetadata"]["HTTPStatusCode"]
    print(status_code)

uploading ../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx to corpdata/ESG/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx...200
uploading ../../data/curation/esg_TEXT_dataset.csv to corpdata/ESG/curation/esg_TEXT_dataset.csv...200
uploading ../../data/extraction/sustainability-report-2019.json to corpdata/ESG/extraction/sustainability-report-2019.json...200
uploading ../../data/kpi_mapping/ESG/kpi_mapping.csv to corpdata/ESG/kpi_mapping/kpi_mapping.csv...200
uploading ../../data/pdfs/ESG/sustainability-report-2019.pdf to corpdata/ESG/pdfs/sustainability-report-2019.pdf...

## View the Uploaded Files

In [None]:
# Show the keys stored under the corpdata/ESG/ prefix that the helper-class
# upload loop writes to. A single list_objects_v2 call returns at most 1000
# keys, and "Contents" is left out of the response when nothing matches, hence
# the .get() fallback instead of ret["Contents"].
ret = s3client.list_objects_v2(Bucket=os.getenv("S3_BUCKET"), Prefix="corpdata/ESG/")
[obj["Key"] for obj in ret.get("Contents", [])]

In [None]:
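# Cleanup (deletes the object from the bucket, so it is intentionally left disabled); uncomment to run:
# s3client.delete_object(Bucket=os.getenv("S3_BUCKET"), Key="corpdata/ESG/pdfs/sustainability-report-2019.pdf")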

## Download Files (from s3 to disk)

Download the same files we uploaded above, and suffix the filenames with "2". Then do a `diff` to ensure that the file content is the same. This is just a sanity check.

### Option 1: Manually Using boto3

In [None]:
# Download the object that was uploaded with boto3 into a NEW local file whose
# name carries the "2" suffix. Opening the original path in "wb" mode would
# truncate the source file before the download even starts, and would leave
# nothing to compare the downloaded copy against.
with open("../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction2.xlsx", "wb") as f:
    s3client.download_fileobj(
        os.getenv("S3_BUCKET"),
        "kachau/demo/20201030 1Qbit aggregated_annotations_needs_correction.xlsx",
        f,
    )

### Option 2: Using our helper class

In [None]:
# Local download targets: the uploaded file names with a "2" inserted just
# before the extension, in the same directories as the originals.
filepaths2 = [
    f"../../data/{relative_path}"
    for relative_path in (
        "annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction2.xlsx",
        "curation/esg_TEXT_dataset2.csv",
        "extraction/sustainability-report-20192.json",
        "kpi_mapping/ESG/kpi_mapping2.csv",
        "pdfs/ESG/sustainability-report-20192.pdf",
    )
]

In [None]:
# Download every object next to its local original (the local copy gets a "2"
# suffix) and verify that the downloaded copy is byte-identical to the original.
for fpath in filepaths2:
    splits = fpath.split("/")
    dirname, filename = splits[3], splits[-1]
    # Dropping the last "2" in the local file name gives the key name that the
    # upload loop used for this file.
    keyname = ''.join(filename.rsplit('2', maxsplit=1))
    print(f"download corpdata/ESG/{dirname}/{keyname} to {fpath}")
    ret = s3comm.download_file_from_s3(
        fpath,
        f"corpdata/ESG/{dirname}",
        keyname,
    )
    # The original lives in the same directory under the un-suffixed name.
    # shallow=False forces a content comparison rather than a stat() comparison.
    original_path = "/".join(splits[:-1] + [keyname])
    assert filecmp.cmp(original_path, fpath, shallow=False), (
        f"{fpath} differs from {original_path}"
    )

## Upload DataFrame directly (from memory, NOT saved on disk)

In [None]:
# Read the curated dataset through the same repo-relative path that the upload
# cells use, rather than a user-specific absolute path that only exists on one
# machine.
test_df = pd.read_csv("../../data/curation/esg_TEXT_dataset.csv")
test_df.head()

In [None]:
# Send the in-memory DataFrame to the bucket as kachau/test/test_df.parquet
# through the helper, without writing a local file in this notebook.
s3comm.upload_df_to_s3(
    test_df,
    "kachau/test",
    "test_df.parquet",
)

## Download DataFrame directly (into memory, NOT saving on disk)

In [None]:
# Fetch kachau/test/test_df.parquet back into memory through the helper and
# preview the first rows.
test_df2 = s3comm.download_df_from_s3(
    "kachau/test",
    "test_df.parquet",
)
test_df2.head()

In [None]:
# sanity check: report, per column, whether every value survived the round trip.
# NaN != NaN under ==, so a cell that is missing in BOTH frames is counted as
# equal explicitly; otherwise any column containing missing values would show
# False even when the round trip was perfect.
same_value = test_df == test_df2
both_missing = test_df.isna() & test_df2.isna()
(same_value | both_missing).all()