In [1]:
%run ../../src/start.py


python	3.9.5 (default, Jun  4 2021, 12:28:51) 
[GCC 7.5.0]
---------------------
Versions:
----------------------
pandas      1.3.0
numpy       1.20.2
matplotlib  3.3.4
seaborn     0.11.1
plotly      5.1.0
boto3       1.17.109
awswrangler 2.10.0
----------------------


Loaded Libraries
-------------------
import pandas as pd
import numpy as np
import sys,os
import re
import glob
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


## AWS
import boto3
import awswrangler as wr
----------------


GLOBAL VARIABLES
--------------------------
# Creating the low level functional client
client = boto3.client(
    's3',
    aws_access_key_id = AWS_KEY_ID,
    aws_secret_access_key = AWS_SECRET,
)
    
# Creating the high level object oriented interface
resource = boto3.resource(
    's3',
    aws_access_key_id = AWS_KEY_

## Helper functions

In [2]:
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True


def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """

    # Create bucket
    try:
        if region is None:
            s3_client = boto3.client('s3')
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client = boto3.client('s3', region_name=region)
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True




def return_path(path,year):
    '''return path'''
    return f'{path}{year}/'

def list_files(path,):
    '''list files in a given path'''
    return list(glob.glob(f'{path}/*'))

def list_files_match(path,file_type):
    '''list the files in a given path given file type'''
    return list(glob.glob(f'{path}/*{file_type}'))


def upload_list_of_files_S3(file_list, bucket_name,subfolder_path):
    '''function to upload multiple files in a path to S3
    param file_list: list of paths where the files are located
    param bucket_name: bucket to upload to
    param: subfoder_path: where in the bucket the files will go to'''
    try:
        for file in file_list:
            file_name = file.rsplit('/', 1)[1]  # name of file
            upload_file(file,bucket_name, object_name=f"{subfolder_path}/{file_name}")
    except ClientError as e:
        logging.error(e)
        return False
    return True


## Examples

In [3]:
# load simple path`

path = '../../data/raw/crime_data/'
return_path(path,'2005')

'../../data/raw/crime_data/2005/'

In [4]:
# print the files full path
path = '../../data/raw/crime_data/2005'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2005/apr05.xls',
 '../../data/raw/crime_data/2005/aug05.xls',
 '../../data/raw/crime_data/2005/dec05.xls',
 '../../data/raw/crime_data/2005/feb05.xls',
 '../../data/raw/crime_data/2005/jan05.xls',
 '../../data/raw/crime_data/2005/jul05.xls',
 '../../data/raw/crime_data/2005/jun05.xls',
 '../../data/raw/crime_data/2005/mar05.xls',
 '../../data/raw/crime_data/2005/may05.xls',
 '../../data/raw/crime_data/2005/nov05.xls',
 '../../data/raw/crime_data/2005/oct05.xls',
 '../../data/raw/crime_data/2005/sep05.xls']

In [5]:
# print the files full path with specific file type
# No csv files in the path
path = '../../data/raw/crime_data/2005'
file_list = list_files_match(path,'csv')
file_list

[]

## List

In [7]:
# Retrieve the list of existing buckets
s3 = boto3.client('s3')
response = s3.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  aws-logs-765441314938-us-east-1
  cacabucket
  dend-data
  salas-blog
  salas-bucket
  salas-data


# Load files into S3 Bucket

In [8]:
# 2005 crime files into bucket `salas-data` with subfolders as `capstone/raw-data/crime-data/2005
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2005'
dir_path = '../../data/raw/crime_data/2005'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2005/apr05.xls',
 '../../data/raw/crime_data/2005/aug05.xls',
 '../../data/raw/crime_data/2005/dec05.xls',
 '../../data/raw/crime_data/2005/feb05.xls',
 '../../data/raw/crime_data/2005/jan05.xls',
 '../../data/raw/crime_data/2005/jul05.xls',
 '../../data/raw/crime_data/2005/jun05.xls',
 '../../data/raw/crime_data/2005/mar05.xls',
 '../../data/raw/crime_data/2005/may05.xls',
 '../../data/raw/crime_data/2005/nov05.xls',
 '../../data/raw/crime_data/2005/oct05.xls',
 '../../data/raw/crime_data/2005/sep05.xls']

In [9]:
%%time
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

True

In [13]:
# 2006
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2006'
path = '../../data/raw/crime_data/2006'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2006/apr06.xls',
 '../../data/raw/crime_data/2006/aug06.xls',
 '../../data/raw/crime_data/2006/dec06.xls',
 '../../data/raw/crime_data/2006/feb06.xls',
 '../../data/raw/crime_data/2006/jan06.xls',
 '../../data/raw/crime_data/2006/jul06.xls',
 '../../data/raw/crime_data/2006/jun06.xls',
 '../../data/raw/crime_data/2006/mar06.xls',
 '../../data/raw/crime_data/2006/may06.xls',
 '../../data/raw/crime_data/2006/nov06.xls',
 '../../data/raw/crime_data/2006/oct06.xls',
 '../../data/raw/crime_data/2006/sep06.xls']

In [14]:
%%time
# 2006
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 237 ms, sys: 52.2 ms, total: 289 ms
Wall time: 16.4 s


True

In [15]:
# 2007
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2007'
path = '../../data/raw/crime_data/2007'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2007/apr07.mdb',
 '../../data/raw/crime_data/2007/aug07.mdb',
 '../../data/raw/crime_data/2007/dec07.mdb',
 '../../data/raw/crime_data/2007/feb07.mdb',
 '../../data/raw/crime_data/2007/jan07.mdb',
 '../../data/raw/crime_data/2007/jul07.mdb',
 '../../data/raw/crime_data/2007/jun07.mdb',
 '../../data/raw/crime_data/2007/mar07.mdb',
 '../../data/raw/crime_data/2007/may07.mdb',
 '../../data/raw/crime_data/2007/nov07.mdb',
 '../../data/raw/crime_data/2007/oct07.mdb',
 '../../data/raw/crime_data/2007/sep07.mdb']

In [16]:
%%time
# 2007
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 162 ms, sys: 11.8 ms, total: 174 ms
Wall time: 10 s


True

In [17]:
# 2008
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2007'
path = '../../data/raw/crime_data/2007'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2007/apr07.mdb',
 '../../data/raw/crime_data/2007/aug07.mdb',
 '../../data/raw/crime_data/2007/dec07.mdb',
 '../../data/raw/crime_data/2007/feb07.mdb',
 '../../data/raw/crime_data/2007/jan07.mdb',
 '../../data/raw/crime_data/2007/jul07.mdb',
 '../../data/raw/crime_data/2007/jun07.mdb',
 '../../data/raw/crime_data/2007/mar07.mdb',
 '../../data/raw/crime_data/2007/may07.mdb',
 '../../data/raw/crime_data/2007/nov07.mdb',
 '../../data/raw/crime_data/2007/oct07.mdb',
 '../../data/raw/crime_data/2007/sep07.mdb']

In [18]:
%%time
# 2008
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 149 ms, sys: 21.1 ms, total: 171 ms
Wall time: 9.82 s


True

In [40]:
# 2009
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2009'
path = '../../data/raw/crime_data/2009/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2009/csv/aug09.xlsx',
 '../../data/raw/crime_data/2009/csv/dec09.xlsx',
 '../../data/raw/crime_data/2009/csv/jul09.xlsx',
 '../../data/raw/crime_data/2009/csv/nov09.xlsx',
 '../../data/raw/crime_data/2009/csv/oct09.xlsx',
 '../../data/raw/crime_data/2009/csv/sep09.xlsx']

In [41]:
%%time
# 2009
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 6.46 s


True

In [42]:
# 2010
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2010'
path = '../../data/raw/crime_data/2010/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2010/csv/apr10.xlsx',
 '../../data/raw/crime_data/2010/csv/aug10.xlsx',
 '../../data/raw/crime_data/2010/csv/dec10.xlsx',
 '../../data/raw/crime_data/2010/csv/feb10.xlsx',
 '../../data/raw/crime_data/2010/csv/jan10.xlsx',
 '../../data/raw/crime_data/2010/csv/jul10.xlsx',
 '../../data/raw/crime_data/2010/csv/jun10.xlsx',
 '../../data/raw/crime_data/2010/csv/mar10.xlsx',
 '../../data/raw/crime_data/2010/csv/may10.xlsx',
 '../../data/raw/crime_data/2010/csv/nov10.xlsx',
 '../../data/raw/crime_data/2010/csv/oct10.xlsx',
 '../../data/raw/crime_data/2010/csv/sep10.xlsx']

In [43]:
%%time
# 2010
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 193 ms, sys: 38.3 ms, total: 231 ms
Wall time: 13.1 s


True

In [44]:
# 2011
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2011'
path = '../../data/raw/crime_data/2011/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2011/csv/apr11.xlsx',
 '../../data/raw/crime_data/2011/csv/aug11.xlsx',
 '../../data/raw/crime_data/2011/csv/dec11.xlsx',
 '../../data/raw/crime_data/2011/csv/feb11.xlsx',
 '../../data/raw/crime_data/2011/csv/jan11.xlsx',
 '../../data/raw/crime_data/2011/csv/jul11.xlsx',
 '../../data/raw/crime_data/2011/csv/jun11.xlsx',
 '../../data/raw/crime_data/2011/csv/mar11.xlsx',
 '../../data/raw/crime_data/2011/csv/may11.xlsx',
 '../../data/raw/crime_data/2011/csv/nov11.xlsx',
 '../../data/raw/crime_data/2011/csv/oct11.xlsx',
 '../../data/raw/crime_data/2011/csv/sep11.xlsx']

In [45]:
%%time
# 2011
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 190 ms, sys: 38.9 ms, total: 229 ms
Wall time: 12.9 s


True

In [46]:
# 2012
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2012'
path = '../../data/raw/crime_data/2012/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2012/csv/apr12.xlsx',
 '../../data/raw/crime_data/2012/csv/aug12.xlsx',
 '../../data/raw/crime_data/2012/csv/dec12.xlsx',
 '../../data/raw/crime_data/2012/csv/feb12.xlsx',
 '../../data/raw/crime_data/2012/csv/jan12.xlsx',
 '../../data/raw/crime_data/2012/csv/jul12.xlsx',
 '../../data/raw/crime_data/2012/csv/jun12.xlsx',
 '../../data/raw/crime_data/2012/csv/mar12.xlsx',
 '../../data/raw/crime_data/2012/csv/may12.xlsx',
 '../../data/raw/crime_data/2012/csv/nov12.xlsx',
 '../../data/raw/crime_data/2012/csv/oct12.xlsx',
 '../../data/raw/crime_data/2012/csv/sep12.xlsx']

In [47]:
%%time
# 2012
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 255 ms, sys: 17 ms, total: 272 ms
Wall time: 12.7 s


True

In [48]:
# 2013
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2013'
path = '../../data/raw/crime_data/2013/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2013/csv/apr13.xlsx',
 '../../data/raw/crime_data/2013/csv/aug13.xlsx',
 '../../data/raw/crime_data/2013/csv/dec13.xlsx',
 '../../data/raw/crime_data/2013/csv/feb13.xlsx',
 '../../data/raw/crime_data/2013/csv/jan13.xlsx',
 '../../data/raw/crime_data/2013/csv/jul13.xlsx',
 '../../data/raw/crime_data/2013/csv/jun13.xlsx',
 '../../data/raw/crime_data/2013/csv/mar13.xlsx',
 '../../data/raw/crime_data/2013/csv/may13.xlsx',
 '../../data/raw/crime_data/2013/csv/nov13.xlsx',
 '../../data/raw/crime_data/2013/csv/oct13.xlsx',
 '../../data/raw/crime_data/2013/csv/sep13.xlsx']

In [49]:
%%time
# 2013
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 192 ms, sys: 35.8 ms, total: 227 ms
Wall time: 12.9 s


True

In [50]:
# 2014
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2014'
path = '../../data/raw/crime_data/2014/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2014/csv/apr14.xlsx',
 '../../data/raw/crime_data/2014/csv/aug14.xlsx',
 '../../data/raw/crime_data/2014/csv/dec14.xlsx',
 '../../data/raw/crime_data/2014/csv/feb14.xlsx',
 '../../data/raw/crime_data/2014/csv/jan14.xlsx',
 '../../data/raw/crime_data/2014/csv/jul14.xlsx',
 '../../data/raw/crime_data/2014/csv/jun14.xlsx',
 '../../data/raw/crime_data/2014/csv/mar14.xlsx',
 '../../data/raw/crime_data/2014/csv/may14.xlsx',
 '../../data/raw/crime_data/2014/csv/nov14.xlsx',
 '../../data/raw/crime_data/2014/csv/oct14.xlsx',
 '../../data/raw/crime_data/2014/csv/sep14.xlsx']

In [51]:
%%time
# 2014
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 205 ms, sys: 18.6 ms, total: 223 ms
Wall time: 12.6 s


True

In [52]:
# 2015
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2015'
path = '../../data/raw/crime_data/2015/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2015/csv/apr15.xlsx',
 '../../data/raw/crime_data/2015/csv/aug15.xlsx',
 '../../data/raw/crime_data/2015/csv/dec15.xlsx',
 '../../data/raw/crime_data/2015/csv/feb15.xlsx',
 '../../data/raw/crime_data/2015/csv/jan15.xlsx',
 '../../data/raw/crime_data/2015/csv/jul15.xlsx',
 '../../data/raw/crime_data/2015/csv/jun15.xlsx',
 '../../data/raw/crime_data/2015/csv/mar15.xlsx',
 '../../data/raw/crime_data/2015/csv/may15.xlsx',
 '../../data/raw/crime_data/2015/csv/nov15.xlsx',
 '../../data/raw/crime_data/2015/csv/oct15.xlsx',
 '../../data/raw/crime_data/2015/csv/sep15.xlsx']

In [53]:
%%time
# 2015
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 204 ms, sys: 18.7 ms, total: 222 ms
Wall time: 12.7 s


True

In [54]:
# 2016
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2016'
path = '../../data/raw/crime_data/2016/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2016/csv/apr16.xlsx',
 '../../data/raw/crime_data/2016/csv/aug16.xlsx',
 '../../data/raw/crime_data/2016/csv/dec16.xlsx',
 '../../data/raw/crime_data/2016/csv/feb16.xlsx',
 '../../data/raw/crime_data/2016/csv/jan16.xlsx',
 '../../data/raw/crime_data/2016/csv/jul16.xlsx',
 '../../data/raw/crime_data/2016/csv/jun16.xlsx',
 '../../data/raw/crime_data/2016/csv/mar16.xlsx',
 '../../data/raw/crime_data/2016/csv/may16.xlsx',
 '../../data/raw/crime_data/2016/csv/nov16.xlsx',
 '../../data/raw/crime_data/2016/csv/oct16.xlsx',
 '../../data/raw/crime_data/2016/csv/sep16.xlsx']

In [55]:
%%time
# 2016
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 211 ms, sys: 12.1 ms, total: 223 ms
Wall time: 12.9 s


True

In [56]:
# 2017
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2017'
path = '../../data/raw/crime_data/2017/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2017/csv/apr17.xlsx',
 '../../data/raw/crime_data/2017/csv/aug17.xlsx',
 '../../data/raw/crime_data/2017/csv/dec17.xlsx',
 '../../data/raw/crime_data/2017/csv/feb17.xlsx',
 '../../data/raw/crime_data/2017/csv/jan17.xlsx',
 '../../data/raw/crime_data/2017/csv/jul17.xlsx',
 '../../data/raw/crime_data/2017/csv/jun17.xlsx',
 '../../data/raw/crime_data/2017/csv/mar17.xlsx',
 '../../data/raw/crime_data/2017/csv/may17.xlsx',
 '../../data/raw/crime_data/2017/csv/nov17.xlsx',
 '../../data/raw/crime_data/2017/csv/oc17.xlsx',
 '../../data/raw/crime_data/2017/csv/sep17.xlsx']

In [57]:
%%time
# 2017
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 211 ms, sys: 10.4 ms, total: 221 ms
Wall time: 12.7 s


True

In [58]:
# 2018
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/crime-data/2018'
path = '../../data/raw/crime_data/2018/csv'
file_list = list_files(path)
file_list

['../../data/raw/crime_data/2018/csv/apr18.xlsx',
 '../../data/raw/crime_data/2018/csv/feb18.xlsx',
 '../../data/raw/crime_data/2018/csv/jan18.xlsx',
 '../../data/raw/crime_data/2018/csv/mar 18.xlsx',
 '../../data/raw/crime_data/2018/csv/may18.xlsx']

In [59]:
%%time
# 2018
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

CPU times: user 90.3 ms, sys: 1.57 ms, total: 91.9 ms
Wall time: 5.13 s


True

# Upload Weather Data

In [3]:
# print the files full path
path = '../../data/raw/weather_data'
file_list = list_files(path)
file_list

['../../data/raw/weather_data/b5af47a41a784be4c6fca0b53302f0a1.csv']

In [4]:
# weather
bucket_name = 'dend-data'
bucket_subfolders = 'capstone/raw-data/weather-data'
path = '../../data/raw/weather_data'
file_list = list_files(path)
file_list

['../../data/raw/weather_data/b5af47a41a784be4c6fca0b53302f0a1.csv']

In [5]:
upload_list_of_files_S3(file_list,bucket_name, bucket_subfolders)

True