In [1]:
import os
import boto3
import pathlib
import pandas as pd
from dotenv import load_dotenv
from src.data.s3_communication import S3FileType, S3Communication

In [2]:
# Load credentials from credentials.env when that file exists.
# Directory lookup order: $CREDENTIAL_DOTENV_DIR, then $PWD, then "/opt/app-root/src".
default_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", default_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
if os.path.exists(dotenv_path):
    # override=True: values from the file replace variables already set in the environment.
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
# Create the project S3 helper; every connection setting is read from the environment.
# Constructor keyword -> environment variable that supplies its value.
s3c = S3Communication(
    **{
        param: os.getenv(env_name)
        for param, env_name in (
            ("s3_endpoint_url", "S3_ENDPOINT"),
            ("aws_access_key_id", "S3_ACCESS_KEY"),
            ("aws_secret_access_key", "S3_SECRET_KEY"),
            ("s3_bucket", "S3_BUCKET"),
        )
    }
)

In [4]:
# Local sample files (one per data category) used for the upload round trip.
filepaths = [
    "../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx",
    "../../data/curation/esg_TEXT_dataset.csv",
    "../../data/extraction/sustainability-report-2019.json",
    "../../data/kpi_mapping/ESG/kpi_mapping.csv",
    "../../data/pdfs/ESG/sustainability-report-2019.pdf",
]

# Download targets: each source path with a "2" inserted right before the file extension,
# so downloaded copies land next to the originals without overwriting them.
filepaths2 = [
    "2".join(os.path.splitext(source_path)) for source_path in filepaths
]

In [5]:
# Upload every local sample file to corpdata/ESG/<category>/<file name>
# and print the HTTP status code found in the returned response.
for local_path in filepaths:
    parts = local_path.split("/")
    # parts[3] is the directory directly below "../../data"; parts[-1] is the file name.
    category = parts[3]
    basename = parts[-1]
    print(f"uploading {local_path} to corpdata/ESG/{category}/{basename}...", end="")
    response = s3c.upload_file_to_s3(local_path, f"corpdata/ESG/{category}", basename)
    status_code = response["ResponseMetadata"]["HTTPStatusCode"]
    print(status_code)

uploading ../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx to corpdata/ESG/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx...200
uploading ../../data/curation/esg_TEXT_dataset.csv to corpdata/ESG/curation/esg_TEXT_dataset.csv...200
uploading ../../data/extraction/sustainability-report-2019.json to corpdata/ESG/extraction/sustainability-report-2019.json...200
uploading ../../data/kpi_mapping/ESG/kpi_mapping.csv to corpdata/ESG/kpi_mapping/kpi_mapping.csv...200
uploading ../../data/pdfs/ESG/sustainability-report-2019.pdf to corpdata/ESG/pdfs/sustainability-report-2019.pdf...200


In [6]:
# Download every uploaded object to its "...2" sibling path so the originals stay untouched.
# NOTE(review): this cell only downloads; it does not compare the copies with the
# originals. Add an explicit comparison if the round trip must be verified.
for fpath in filepaths2:
    splits = fpath.split("/")
    # splits[3] is the directory directly below "../../data"; splits[-1] is the file name.
    dirname, filename = splits[3], splits[-1]
    # The S3 key is the local file name without its marker: drop the last "2" in the name.
    keyname = ''.join(filename.rsplit('2', maxsplit=1))
    print(f"download corpdata/ESG/{dirname}/{keyname} to {fpath}")
    # The leftover breakpoint() was removed: it stopped every iteration in the debugger
    # and made the notebook impossible to run unattended.
    s3c.download_file_from_s3(
        fpath,
        f"corpdata/ESG/{dirname}",
        keyname,
    )

download corpdata/ESG/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx to ../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction2.xlsx
download corpdata/ESG/curation/esg_TEXT_dataset.csv to ../../data/curation/esg_TEXT_dataset2.csv
download corpdata/ESG/extraction/sustainability-report-2019.json to ../../data/extraction/sustainability-report-20192.json
download corpdata/ESG/kpi_mapping/kpi_mapping.csv to ../../data/kpi_mapping/ESG/kpi_mapping2.csv
download corpdata/ESG/pdfs/sustainability-report-2019.pdf to ../../data/pdfs/ESG/sustainability-report-20192.pdf


In [7]:
# Raw boto3 S3 client, configured from the same environment variables as the helper above.
# boto3 keyword -> environment variable that supplies its value.
s3client = boto3.client(
    "s3",
    **{
        option: os.getenv(env_name)
        for option, env_name in (
            ("endpoint_url", "S3_ENDPOINT"),
            ("aws_access_key_id", "S3_ACCESS_KEY"),
            ("aws_secret_access_key", "S3_SECRET_KEY"),
        )
    },
)

In [11]:
# List the objects in the bucket named by $S3_BUCKET.
ret = s3client.list_objects_v2(
    Bucket=os.getenv("S3_BUCKET"),
)
# To inspect the listing, evaluate: ret["Contents"]

In [9]:
# Optional cleanup (destructive, disabled): uncomment to delete the uploaded PDF from the bucket.
# s3client.delete_object(Bucket=os.getenv("S3_BUCKET"), Key="corpdata/ESG/pdfs/sustainability-report-2019.pdf")