In [17]:
import glob
import boto3
import pandas as pd
from boto3.session import Session

import warnings
warnings.filterwarnings('ignore')

In [18]:
s3_details = {"aws_access_key_id":"",
"aws_secret_access_key": "",
"bucket_name": "",
"aws_region":"ap-south-1"}

In [19]:
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=s3_details["aws_access_key_id"],
    aws_secret_access_key=s3_details["aws_secret_access_key"],
    verify=False
)

## How to Read a CSV file

In [20]:
obj = s3_resource.Object(s3_details['bucket_name'], 'ADANI_INDIA/202003140818/dim_date/dim_date.csv')
file = obj.get()['Body'] #.read()

In [21]:
df = pd.read_csv(file, header=14, delimiter="|", low_memory=False)
df.head()

Unnamed: 0,20140117,2014-01-17,2014,1,January,17,17.1,5,3,Friday,...,2014/03,Weekday,2014-01-13,2014-01-19,2014-01-01,2014-01-31 00:00:00,3.2,20140101,20140131,1.1
0,20140118,2014-01-18,2014,1,January,18,18,6,3,Saturday,...,2014/03,Weekend,2014-01-13,2014-01-19,2014-01-01,2014-01-31 00:00:00,3,20140101,20140131,1
1,20140119,2014-01-19,2014,1,January,19,19,0,4,Sunday,...,2014/03,Weekend,2014-01-13,2014-01-19,2014-01-01,2014-01-31 00:00:00,4,20140101,20140131,1
2,20140120,2014-01-20,2014,1,January,20,20,1,4,Monday,...,2014/04,Weekday,2014-01-20,2014-01-26,2014-01-01,2014-01-31 00:00:00,4,20140101,20140131,1
3,20140121,2014-01-21,2014,1,January,21,21,2,4,Tuesday,...,2014/04,Weekday,2014-01-20,2014-01-26,2014-01-01,2014-01-31 00:00:00,4,20140101,20140131,1
4,20140122,2014-01-22,2014,1,January,22,22,3,4,Wednesday,...,2014/04,Weekday,2014-01-20,2014-01-26,2014-01-01,2014-01-31 00:00:00,4,20140101,20140131,1


## How to List a Folder

In [22]:
bucket = s3_resource.Bucket(name=s3_details["bucket_name"])
folder_name = 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713' 
sub_folder = '/20200614_20200614/'

In [23]:
def list_folders(folder_name):
    files = []
    for obj in bucket.objects.filter(Prefix=folder_name):
        files.append(obj.key)
        
    print(f'Total Object {len(files)}')
    return files

In [24]:
files = list_folders(folder_name)
files

Total Object 7


['DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/Audit_HHTTransactionDataQueueList/20200614_20200614/Audit_HHTTransactionDataQueueList_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/SADM_Users/20200601_20200630/SADM_Users_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files/df.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_filesdf.csv']

### Sub-Folder

In [25]:
files = list_folders(folder_name + sub_folder)
files

Total Object 1


['DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv']

## How to Create a Folder

In [26]:
new_sub_folder = '/processed_files'
bucket.put_object(Key=folder_name + new_sub_folder)
files = list_folders(folder_name)
files

Total Object 7


['DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/Audit_HHTTransactionDataQueueList/20200614_20200614/Audit_HHTTransactionDataQueueList_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/SADM_Users/20200601_20200630/SADM_Users_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files/df.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_filesdf.csv']

In [None]:
files_to_move = [
    'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv',
    'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv'
]

## I will verify this

s3_resource.Object(
    s3_details['bucket_name'], 
    'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files' + "/schema.csv"
).copy_from(
    CopySource='DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv'
)

## How to Delete a Folder

In [28]:
bucket.delete_objects(Delete={
    'Objects': [{
        'Key': folder_name + new_sub_folder
    }],
    'Quiet': False
})
files = list_folders(folder_name)
files

Total Object 6


['DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/Audit_HHTTransactionDataQueueList/20200614_20200614/Audit_HHTTransactionDataQueueList_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/SADM_Users/20200601_20200630/SADM_Users_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files/df.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_filesdf.csv']

## Write Data Frame to a Folder

In [15]:
from io import StringIO
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3_resource.Object(s3_details['bucket_name'], 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files' + '/df.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '39EC6F3D19A1F0CE',
  'HostId': 'ymb7AQl8ICALdxLwQguBpGewqpJbwPQFGf0zqB8i5ABPU6ZKF60uj/kLkw3xM6okBqKAhjJP7UQ=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ymb7AQl8ICALdxLwQguBpGewqpJbwPQFGf0zqB8i5ABPU6ZKF60uj/kLkw3xM6okBqKAhjJP7UQ=',
   'x-amz-request-id': '39EC6F3D19A1F0CE',
   'date': 'Thu, 30 Jul 2020 13:39:50 GMT',
   'etag': '"7325b48b36d35ab766fe15f21cf371ae"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"7325b48b36d35ab766fe15f21cf371ae"'}

In [16]:
files = list_folders(folder_name)
files

Total Object 7


['DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200601_20200630/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/20200614_20200614/schema.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/Audit_HHTTransactionDataQueueList/20200614_20200614/Audit_HHTTransactionDataQueueList_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/SADM_Users/20200601_20200630/SADM_Users_1.gzip',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_files/df.csv',
 'DATA_ARCHIVAL_BIMBO_MEXICO_20200713/processed_filesdf.csv']