In [6]:
%run ../../src/start.py


python	3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
---------------------
Versions:
----------------------
pandas      1.1.2
numpy       1.19.1
matplotlib  3.3.1
seaborn     0.11.0
plotly      4.11.0
boto3       1.17.106
awswrangler 2.9.0
----------------------


Loaded Libraries
-------------------
import pandas as pd
import numpy as np
import sys,os
import re
import glob
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


## AWS
import boto3
import awswrangler as wr
----------------


GLOBAL VARIABLES
--------------------------
# Creating the low level functional client
client = boto3.client(
    's3',
    aws_access_key_id = AWS_KEY_ID,
    aws_secret_access_key = AWS_SECRET,
)
    
# Creating the high level object oriented interface
resource = boto3.resource(
    's3',
    aws_access_key_id = AWS_KEY_

## Helper functions

In [13]:
import logging

from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: Path of the local file to upload
    :param bucket: Name of the bucket to upload to
    :param object_name: S3 object name (key). If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # The client is built without explicit credentials, so boto3 resolves them itself
    s3_client = boto3.client('s3')
    try:
        # The call's return value is not needed; success is signalled by no exception
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # boto3's managed transfer re-raises service-side ClientErrors as
        # S3UploadFailedError, so both are caught to honour the documented
        # "return False on failure" contract.
        logging.error(e)
        return False
    return True


def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the client is created without an explicit
    region and no location constraint is sent, so the bucket is created in
    the S3 default region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """

    try:
        if region is None:
            s3_client = boto3.client('s3')
        else:
            s3_client = boto3.client('s3', region_name=region)

        request = {'Bucket': bucket_name}
        # us-east-1 is the S3 default location and is rejected when sent as an
        # explicit LocationConstraint, so only other regions get one.
        if region is not None and region != 'us-east-1':
            request['CreateBucketConfiguration'] = {'LocationConstraint': region}

        s3_client.create_bucket(**request)
    except ClientError as e:
        logging.error(e)
        return False
    return True




def return_path(path,year):
    '''Build the directory path for one year.

    `year` and a trailing '/' are appended directly to `path`; no separator
    is inserted between them, so `path` is expected to already end with '/'.
    '''
    return "{}{}/".format(path, year)

def list_files(path):
    '''List the entries directly inside `path`, sorted by path.

    param path: directory to list (without a trailing wildcard)
    return: sorted list of paths matching `<path>/*`; empty if nothing matches

    glob yields entries in a filesystem-dependent order, so the result is
    sorted to make it identical from run to run.
    '''
    return sorted(glob.glob(f'{path}/*'))

def list_files_match(path,file_type):
    '''List the entries directly inside `path` whose names end with `file_type`, sorted by path.

    param path: directory to search
    param file_type: name suffix to match, e.g. 'csv' or '.xls'. This is a
        plain suffix match (`*<file_type>`), so no '.' is added automatically.
    return: sorted list of matching paths; empty if nothing matches

    glob yields entries in a filesystem-dependent order, so the result is
    sorted to make it identical from run to run.
    '''
    return sorted(glob.glob(f'{path}/*{file_type}'))


def upload_list_of_files_S3(file_list, bucket_name,subfolder_path):
    '''Upload several local files under one key prefix ("subfolder") of an S3 bucket.

    param file_list: list of local paths of the files to upload
    param bucket_name: bucket to upload to
    param subfolder_path: where in the bucket the files will go to (key prefix)
    return: True only if every file was uploaded, else False

    Every file is attempted even if an earlier one fails; each failure is logged.
    '''
    # Drop trailing slashes so the key never contains an empty "//" segment
    prefix = subfolder_path.rstrip('/')
    all_uploaded = True
    try:
        for file in file_list:
            # [-1] also works for a bare file name that contains no '/'
            file_name = file.rsplit('/', 1)[-1]
            object_name = f"{prefix}/{file_name}" if prefix else file_name
            # upload_file reports failure through its return value, so check it
            if not upload_file(file, bucket_name, object_name=object_name):
                logging.error("Failed to upload %s to %s/%s", file, bucket_name, object_name)
                all_uploaded = False
    except ClientError as e:
        logging.error(e)
        return False
    return all_uploaded


## Examples

In [4]:
# Build the directory path for a single year with return_path

path = '../../data/raw/crime_data/'
return_path(path,'2005')

'../../data/raw/crime_data/2005/'

In [8]:
# Show the full path of every file in the 2005 directory
path = '../../data/raw/crime_data/2005'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2005/apr05.xls',
 '../../data/raw/crime_data/2005/aug05.xls',
 '../../data/raw/crime_data/2005/dec05.xls',
 '../../data/raw/crime_data/2005/feb05.xls',
 '../../data/raw/crime_data/2005/jan05.xls',
 '../../data/raw/crime_data/2005/jul05.xls',
 '../../data/raw/crime_data/2005/jun05.xls',
 '../../data/raw/crime_data/2005/mar05.xls',
 '../../data/raw/crime_data/2005/may05.xls',
 '../../data/raw/crime_data/2005/nov05.xls',
 '../../data/raw/crime_data/2005/oct05.xls',
 '../../data/raw/crime_data/2005/sep05.xls']

In [11]:
# Show the full path of the files of one specific type.
# There are no csv files in this path, so the result is an empty list.
path = '../../data/raw/crime_data/2005'
file_list = list_files_match(path,'csv')
file_list

[]

## List existing S3 buckets

In [14]:
# Retrieve the list of existing buckets (the client is created without
# explicit credential or region arguments)
s3 = boto3.client('s3')
response = s3.list_buckets()

# Print each bucket's name, indented under a heading
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  aws-logs-765441314938-us-east-1
  cacabucket
  salas-bucket
  salas-data


## Load files into the S3 bucket

In [18]:
# Collect the 2005 crime files to upload to bucket `salas-data`
# under the subfolders `capstone/raw-data/crime-data/2005`
bucket_name = 'salas-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2005'
dir_path = '../../data/raw/crime_data/2005'
# List from dir_path, which this cell defines, rather than from a `path`
# variable left over from whichever cell happened to run last
file_list = list_files(dir_path)
file_list

['../../data/raw/crime_data/2005/apr05.xls',
 '../../data/raw/crime_data/2005/aug05.xls',
 '../../data/raw/crime_data/2005/dec05.xls',
 '../../data/raw/crime_data/2005/feb05.xls',
 '../../data/raw/crime_data/2005/jan05.xls',
 '../../data/raw/crime_data/2005/jul05.xls',
 '../../data/raw/crime_data/2005/jun05.xls',
 '../../data/raw/crime_data/2005/mar05.xls',
 '../../data/raw/crime_data/2005/may05.xls',
 '../../data/raw/crime_data/2005/nov05.xls',
 '../../data/raw/crime_data/2005/oct05.xls',
 '../../data/raw/crime_data/2005/sep05.xls']

In [19]:
# Upload every path currently in file_list; the cell displays the returned boolean
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

True

In [27]:
# Collect the 2006 crime files to upload to bucket `salas-data`
# under the subfolders `capstone/raw-data/crime-data/2006`
bucket_name = 'salas-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2006'
path = '../../data/raw/crime_data/2006'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2006/apr06.xls',
 '../../data/raw/crime_data/2006/aug06.xls',
 '../../data/raw/crime_data/2006/dec06.xls',
 '../../data/raw/crime_data/2006/feb06.xls',
 '../../data/raw/crime_data/2006/jan06.xls',
 '../../data/raw/crime_data/2006/jul06.xls',
 '../../data/raw/crime_data/2006/jun06.xls',
 '../../data/raw/crime_data/2006/mar06.xls',
 '../../data/raw/crime_data/2006/may06.xls',
 '../../data/raw/crime_data/2006/nov06.xls',
 '../../data/raw/crime_data/2006/oct06.xls',
 '../../data/raw/crime_data/2006/sep06.xls']

In [28]:
# 2006: upload every path currently in file_list; the cell displays the returned boolean
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

True