In [1]:
import os
import boto3
import pathlib
import pandas as pd
from dotenv import load_dotenv
from src.data.s3_communication import S3Communication

In [2]:
# Load credentials
# Directory lookup order: CREDENTIAL_DOTENV_DIR, then PWD, then the hard-coded default.
fallback_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")

# A missing credentials file is skipped silently. When it exists it is loaded with
# override=True (per python-dotenv this lets file values replace already-set variables).
credentials_file_found = os.path.exists(dotenv_path)
if credentials_file_found:
    load_dotenv(dotenv_path=dotenv_path, override=True)

## Upload Files (from disk to s3)

In [3]:
# Project S3 helper used for the file and DataFrame transfers in this notebook.
# Every setting is read from an environment variable; an unset variable is passed as None.
s3c = S3Communication(
    s3_endpoint_url=os.environ.get("S3_ENDPOINT"),
    aws_access_key_id=os.environ.get("S3_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("S3_SECRET_KEY"),
    s3_bucket=os.environ.get("S3_BUCKET"),
)

In [4]:
# Local files to upload, relative to the notebook's working directory.
# The upload loop takes the 4th "/"-separated component (the folder right after "data")
# as the destination sub-directory, so every path must keep the "../../data/<folder>/..." shape.
filepaths = [
    "../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx",
    "../../data/curation/esg_TEXT_dataset.csv",
    "../../data/extraction/sustainability-report-2019.json",
    "../../data/kpi_mapping/ESG/kpi_mapping.csv",
    "../../data/pdfs/ESG/sustainability-report-2019.pdf",
]

In [5]:
# Push every local file to the bucket under corpdata/ESG/<data sub-directory>/<file name>.
for fpath in filepaths:
    path_parts = fpath.split("/")
    # path_parts == ["..", "..", "data", <sub-directory>, ..., <file name>]
    dirname = path_parts[3]
    filename = path_parts[-1]
    print(f"uploading {fpath} to corpdata/ESG/{dirname}/{filename}...", end="")
    response = s3c.upload_file_to_s3(fpath, f"corpdata/ESG/{dirname}", filename)
    # Finish the progress line with the HTTP status code reported in the response.
    status_code = response["ResponseMetadata"]["HTTPStatusCode"]
    print(status_code)

uploading ../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx to corpdata/ESG/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx...200
uploading ../../data/curation/esg_TEXT_dataset.csv to corpdata/ESG/curation/esg_TEXT_dataset.csv...200
uploading ../../data/extraction/sustainability-report-2019.json to corpdata/ESG/extraction/sustainability-report-2019.json...200
uploading ../../data/kpi_mapping/ESG/kpi_mapping.csv to corpdata/ESG/kpi_mapping/kpi_mapping.csv...200
uploading ../../data/pdfs/ESG/sustainability-report-2019.pdf to corpdata/ESG/pdfs/sustainability-report-2019.pdf...200


## View the Uploaded Files

In [None]:
# Plain boto3 S3 client, configured from the same environment variables as `s3c`,
# used below to inspect what is stored in the bucket.
s3client = boto3.client(
    "s3",
    endpoint_url=os.environ.get("S3_ENDPOINT"),
    aws_access_key_id=os.environ.get("S3_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("S3_SECRET_KEY"),
)

In [None]:
# List the objects in the configured bucket; the response is kept in `ret` and not displayed.
# NOTE(review): boto3 documents list_objects_v2 as returning at most 1,000 keys per call —
# confirm whether pagination is needed for this bucket.
ret = s3client.list_objects_v2(Bucket=os.getenv("S3_BUCKET"))
# Uncomment to show the listed objects:
# ret["Contents"]

In [None]:
# s3client.delete_object(Bucket=os.getenv("S3_BUCKET"), Key="corpdata/ESG/pdfs/sustainability-report-2019.pdf")

## Download Files (from s3 to disk)

Download the same files we uploaded above, saving each one next to its original with a "2" appended to the file name (before the extension). Afterwards, run a `diff` on each original/copy pair to confirm that the file content is the same. This is just a sanity check.

In [None]:
# Local destinations for the downloads: the same paths as `filepaths`, with a "2" inserted
# just before the extension so the originals are not overwritten.
# The download loop removes that "2" again to rebuild the object key.
filepaths2 = [
    "../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction2.xlsx",
    "../../data/curation/esg_TEXT_dataset2.csv",
    "../../data/extraction/sustainability-report-20192.json",
    "../../data/kpi_mapping/ESG/kpi_mapping2.csv",
    "../../data/pdfs/ESG/sustainability-report-20192.pdf",
]

In [6]:
# Download every uploaded object to its "2"-suffixed local path (next to the original file).
# Nothing is compared here; run `diff` on each original/copy pair afterwards to verify.
for fpath in filepaths2:
    splits = fpath.split("/")
    # splits == ["..", "..", "data", <sub-directory>, ..., <file name>]
    dirname, filename = splits[3], splits[-1]
    # Rebuild the object key by removing the last "2" in the local file name,
    # i.e. the one inserted before the extension in `filepaths2`.
    keyname = "".join(filename.rsplit("2", maxsplit=1))
    print(f"download corpdata/ESG/{dirname}/{keyname} to {fpath}")
    ret = s3c.download_file_from_s3(
        fpath,
        f"corpdata/ESG/{dirname}",
        keyname,
    )

download corpdata/ESG/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx to ../../data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction2.xlsx
download corpdata/ESG/curation/esg_TEXT_dataset.csv to ../../data/curation/esg_TEXT_dataset2.csv
download corpdata/ESG/extraction/sustainability-report-2019.json to ../../data/extraction/sustainability-report-20192.json
download corpdata/ESG/kpi_mapping/kpi_mapping.csv to ../../data/kpi_mapping/ESG/kpi_mapping2.csv
download corpdata/ESG/pdfs/sustainability-report-2019.pdf to ../../data/pdfs/ESG/sustainability-report-20192.pdf


## Upload DataFrame directly (from memory, NOT saved on disk)

In [12]:
# Read the curated dataset through the same repo-relative path used in `filepaths`,
# so the cell does not depend on one user's home directory.
test_df = pd.read_csv("../../data/curation/esg_TEXT_dataset.csv")
test_df.head()

Unnamed: 0.1,Unnamed: 0,question,context,company,source_file,source_page,kpi_id,year,answer,data_type,relevant_paragraphs,annotator,label
0,0,In which year was the annual report or the sus...,IPCC (2018): Global Warming of 1.5 C IPCC (20...,Cabot Oil & Gas Corp,Cabot Oil & Gas Corp Annual Report 2015.pdf,[1],1.0,2015,2015,TEXT,"[""2015 ANNUAL REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,0
1,1,In which year was the annual report or the sus...,1) Assets means installations and plants in op...,NOVATEK,Sustainability_Report_2017_Eng_small[1].pdf,[1],1.0,2017,2017,TEXT,"[""2017 SUSTAINABILITY REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,0
2,2,In which year was the annual report or the sus...,2012 annual report,CenterPoint Energy,CenterPoint Energy Annual report 2012.pdf,[1],1.0,2012,2012,TEXT,"[""2012 annual report""]",20201030 1Qbit aggregated_annotations_needs_co...,1
3,3,In which year was the annual report or the sus...,2013 ANNUAL REPORT,Cenovus Energy,Cenovus Annual Report 2013.pdf,[1],1.0,2013,2013,TEXT,"[""2013 ANNUAL REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,1
4,4,In which year was the annual report or the sus...,2013 annual report,CenterPoint Energy,CenterPoint Energy Annual report 2013.pdf,[1],1.0,2013,2013,TEXT,"[""2013 annual report""]",20201030 1Qbit aggregated_annotations_needs_co...,1


In [None]:
# Send the in-memory DataFrame to the bucket without writing it to local disk first.
# NOTE(review): the arguments are presumably (dataframe, key prefix, key name), mirroring
# upload_file_to_s3 above — confirm against S3Communication.
# NOTE(review): "kachau/test" looks like a personal scratch prefix — confirm before reusing.
s3c.upload_df_to_s3(test_df, "kachau/test", "test_df.parquet")

## Download DataFrame directly (into memory, NOT saving on disk)

In [14]:
# Fetch the object written by the upload cell back into a DataFrame, using the same
# "kachau/test" / "test_df.parquet" pair, and preview the first rows.
test_df2 = s3c.download_df_from_s3("kachau/test", "test_df.parquet")
test_df2.head()

Unnamed: 0.1,Unnamed: 0,question,context,company,source_file,source_page,kpi_id,year,answer,data_type,relevant_paragraphs,annotator,label
0,0,In which year was the annual report or the sus...,IPCC (2018): Global Warming of 1.5 C IPCC (20...,Cabot Oil & Gas Corp,Cabot Oil & Gas Corp Annual Report 2015.pdf,[1],1.0,2015,2015,TEXT,"[""2015 ANNUAL REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,0
1,1,In which year was the annual report or the sus...,1) Assets means installations and plants in op...,NOVATEK,Sustainability_Report_2017_Eng_small[1].pdf,[1],1.0,2017,2017,TEXT,"[""2017 SUSTAINABILITY REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,0
2,2,In which year was the annual report or the sus...,2012 annual report,CenterPoint Energy,CenterPoint Energy Annual report 2012.pdf,[1],1.0,2012,2012,TEXT,"[""2012 annual report""]",20201030 1Qbit aggregated_annotations_needs_co...,1
3,3,In which year was the annual report or the sus...,2013 ANNUAL REPORT,Cenovus Energy,Cenovus Annual Report 2013.pdf,[1],1.0,2013,2013,TEXT,"[""2013 ANNUAL REPORT""]",20201030 1Qbit aggregated_annotations_needs_co...,1
4,4,In which year was the annual report or the sus...,2013 annual report,CenterPoint Energy,CenterPoint Energy Annual report 2013.pdf,[1],1.0,2013,2013,TEXT,"[""2013 annual report""]",20201030 1Qbit aggregated_annotations_needs_co...,1


In [15]:
# sanity check: every column should come back from S3 unchanged.
# NaN != NaN under ==, so cells that are missing in BOTH frames are counted as equal;
# otherwise any column containing missing values would report False after an exact round trip.
values_match = test_df == test_df2
both_missing = test_df.isna() & test_df2.isna()
(values_match | both_missing).all()

Unnamed: 0             True
question               True
context                True
company                True
source_file            True
source_page            True
kpi_id                 True
year                   True
answer                 True
data_type              True
relevant_paragraphs    True
annotator              True
label                  True
dtype: bool