## Ingesting data from S3 to Amazon Athena through boto3

In [101]:
##Importing required libraries
import pandas as pd
import pandas_datareader.data as web
import numpy as np
import datetime
import logging
import time
import os
import boto3
from botocore.exceptions import ClientError

### Installing dependencies

In [8]:
!pip3 install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pandas
Successfully installed pandas-1.5.2


In [10]:
!pip3 install pandas-datareader

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas-datareader
  Using cached pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.10.0


In [77]:
## Date range of the price history to download
# first day of the range: 1 January 2022
start_time = datetime.datetime(year=2022, month=1, day=1)
# last day of the range: today's local date as an ISO string (YYYY-MM-DD)
end_time = datetime.date.today().isoformat()

In [13]:
## Ticker symbols whose price history is downloaded below
tickers = [
    'AAPL',
    'TSLA',
    'GOOGL',
    'NFLX',
]

### Getting data from the Yahoo Finance API

In [105]:
def getting_data_yahoo(ticker, file_name, path=None, max_retries=5, retry_delay=5):
    """Download the daily price history of one ticker and save it as a CSV file.

    The date range is taken from the notebook-level ``start_time`` and
    ``end_time`` variables.

    :param ticker: Ticker symbol passed to ``web.get_data_yahoo``
    :param file_name: Name of the CSV file to write, without the ``.csv`` extension
    :param path: Directory to write the CSV file into. If not specified, the
        file is written to the current working directory
    :param max_retries: Maximum number of download attempts before giving up
    :param retry_delay: Seconds to wait between two attempts
    :return: The downloaded DataFrame, with the date index turned into a column
    :raises RuntimeError: If every download attempt failed
    """
    # yahoo gives only daily historical data
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            df = web.get_data_yahoo(ticker, start=start_time, end=end_time)
        except Exception as e:
            last_error = e
            print("type error: " + str(e))
            # No point in sleeping after the final attempt
            if attempt < max_retries:
                time.sleep(retry_delay)
        else:
            print('connected to yahoo')
            break
    else:
        # Bounded retries: a permanent failure (for example an unknown ticker)
        # must surface as an error instead of looping forever.
        raise RuntimeError(
            f'could not download {ticker} after {max_retries} attempts'
        ) from last_error

    # use numerical integer index instead of date
    df = df.reset_index()
    csv_name = f'{file_name}.csv'
    csv_path = os.path.join(path, csv_name) if path else csv_name
    df.to_csv(csv_path, index=False)

    # Return the frame so callers can keep working with the downloaded data
    return df

In [106]:
# Fetch the first four tickers, using each ticker symbol as its CSV file name
df1, df2, df3, df4 = [getting_data_yahoo(symbol, symbol) for symbol in tickers[:4]]

connected to yahoo
connected to yahoo
connected to yahoo
connected to yahoo


In [18]:
df1

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2022-01-03,182.880005,177.710007,177.830002,182.009995,104487900.0,180.959732
1,2022-01-04,182.940002,179.119995,182.630005,179.699997,99310400.0,178.663071
2,2022-01-05,180.169998,174.639999,179.610001,174.919998,94537600.0,173.910660
3,2022-01-06,175.300003,171.639999,172.699997,172.000000,96904000.0,171.007507
4,2022-01-07,174.139999,171.029999,172.889999,172.169998,86709100.0,171.176514
...,...,...,...,...,...,...,...
228,2022-11-29,144.809998,140.350006,144.289993,141.169998,83763800.0,141.169998
229,2022-11-30,148.720001,140.550003,141.399994,148.029999,111224400.0,148.029999
230,2022-12-01,149.130005,146.610001,148.210007,148.309998,71250400.0,148.309998
231,2022-12-02,148.000000,145.649994,145.960007,147.809998,65421400.0,147.809998


In [None]:
## Getting the time of the AWS service: `-v` makes curl print the HTTP response
## headers returned by the S3 endpoint.
## NOTE(review): presumably compared with the system date printed in the next
## cell to spot clock skew — confirm.
!curl http://s3.amazonaws.com -v

In [None]:
## Getting your system date and time, printed in UTC (`-u`)
!date -u

### Getting S3 resources

In [113]:
## Notebook-level S3 client; getting_bucket_data() uses it as its default client
s3 = boto3.client(service_name='s3')

In [114]:
##Getting buckets data
def getting_bucket_data(client=None):
    """Return the name of the last bucket listed by the S3 client.

    :param client: Object exposing ``list_buckets()``. If not specified, the
        notebook-level ``s3`` client is used; it is looked up when the function
        is called rather than when it is defined
    :return: Name of the last bucket in the ``list_buckets`` response, or
        ``None`` if the response contains no buckets
    """
    if client is None:
        # Late lookup, so defining this function does not require `s3` to exist yet
        client = s3

    ##Getting response from S3 client
    response = client.list_buckets()
    ##Listing AWS buckets
    bucket_names = [bucket["Name"] for bucket in response.get("Buckets", [])]

    # Keep the existing contract (a single name: the last one listed), but
    # return None instead of failing on an unbound local when there are no buckets.
    return bucket_names[-1] if bucket_names else None

In [115]:
# NOTE: despite the plural name, this holds a single bucket name (a string),
# as the output of the next cell shows.
buckets = getting_bucket_data()

In [116]:
buckets

'yahoo-finance-bckt-1201'

In [117]:
# `r` only exists inside getting_bucket_data(), so request the raw response again here.
# NOTE: the response includes the account owner's DisplayName and ID; clear this
# cell's output before sharing the notebook.
print(s3.list_buckets())

{'ResponseMetadata': {'RequestId': '75TBEWCT6FHX2DXW', 'HostId': 'wdeCgQUybolxN++p8igX/qSYJhky5ZNeT7TQ7xMU0iJjKrH6y9o/zkxyrYBYzWmViK1lD+/SLTM=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'wdeCgQUybolxN++p8igX/qSYJhky5ZNeT7TQ7xMU0iJjKrH6y9o/zkxyrYBYzWmViK1lD+/SLTM=', 'x-amz-request-id': '75TBEWCT6FHX2DXW', 'date': 'Tue, 06 Dec 2022 19:26:43 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Buckets': [{'Name': 'yahoo-finance-bckt-1201', 'CreationDate': datetime.datetime(2022, 12, 1, 12, 20, 50, tzinfo=tzutc())}], 'Owner': {'DisplayName': 'felixmlb', 'ID': '9402830493660c37d25355e863da4afacaea8c554644b78d09231ef66877871f'}}


### Uploading data to the S3 bucket

In [118]:
def upload_files(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then the base name of
        file_name is used
    :return: True if file was uploaded, else False
    """
    # Imported here so the function does not depend on an extra notebook-level import
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError, OSError) as e:
        # upload_file re-raises client errors as S3UploadFailedError, and an
        # unreadable or missing local file surfaces as an OSError; both are
        # reported through the documented False return value.
        logging.error(e)
        return False
    return True

In [119]:
# Upload the AAPL CSV written by getting_data_yahoo(); the function defined
# above is `upload_files`, and the bucket name is stored in `buckets`.
upload_files('AAPL.csv', buckets)

True