In [1]:
import boto3
import pandas as pd
from io import StringIO
from datetime import datetime as dt

In [2]:
src_bucket = 'etl-p2-data'
    
s3 = boto3.resource('s3')
bucket = s3.Bucket(src_bucket)
objects = [obj for obj in bucket.objects.all()]

In [3]:
def list_of_files(bucket):
    files = [obj.key for obj in bucket]
    return files

In [4]:
def read_csv_to_df(bucket, key, decoding = 'utf-8', sep = ','):
    csv_obj = bucket.Object(key=key).get().get('Body').read().decode(decoding)
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=sep)
    return df

In [5]:
def extract(bucket):
    files = [key for key in list_of_files(objects)]
    # print(files)
    df = pd.concat([read_csv_to_df(bucket, obj) for obj in files], ignore_index=True)
    return df

In [7]:
def transformations(df):
    
    df.dropna(inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df = df.groupby([ 'symbol', df['date'].dt.year], as_index=False).agg(
                                                              opening_price=('open', 'min'), 
                                                              closing_price=('close', 'min'), 
                                                              minimum_price=('low', 'min'),
                                                              maximum_price=('high','max'),
                                                              daily_traded_volume=('volume','sum'),
                                                              avg_opening_price=('open', 'mean'), 
                                                              avg_closing_price=('close', 'mean'), 
                                                              avg_minimum_price=('low', 'mean'),
                                                              avg_maximum_price=('high','mean'),
                                                              avg_daily_traded_volume=('volume','mean')
                                                                                )
    df['$_change_closing_price'] = df['closing_price'] - df['opening_price']
    df['%_change_closing_price'] = (df['$_change_closing_price']/df['closing_price'])*100
    df = df.round(decimals=2)
    
    df['Year'] = ''
    
    df.loc[df.index[range(0,len(df),4)], 'Year'] = "2014"
    df.loc[df.index[range(1,len(df),4)], 'Year'] = "2015"
    df.loc[df.index[range(2,len(df),4)], 'Year'] = "2016"
    df.loc[df.index[range(3,len(df),4)], 'Year'] = "2017"

    cols = list(df.columns.values)
    cols.insert(1, cols.pop(cols.index('Year')))
    df = df.loc[:, cols]
    
    return df

In [8]:
def etl_report():
    df = extract(bucket)
    df = transformations(df)
    
    print(df)

In [10]:
def main():
    src_bucket = 'etl-p2-data'
    
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(src_bucket)
    objects = [obj for obj in bucket.objects.all()]
    
    test_report = etl_report()
    print(test_report)

In [11]:
main()

     symbol  Year  opening_price  closing_price  minimum_price  maximum_price  \
0         A  2014          38.30          38.47          38.09          61.22   
1         A  2015          33.18          33.37          33.12          43.59   
2         A  2016          34.21          34.80          34.15          48.63   
3         A  2017          45.93          46.49          45.74          70.93   
4       AAL  2014          25.07          25.36          25.06          54.64   
...     ...   ...            ...            ...            ...            ...   
1982   ZION  2016          38.88          38.99          38.43          52.20   
1983    ZTS  2017          28.48          28.40          28.14          45.24   
1984    ZTS  2014          40.30          39.65          37.73          55.38   
1985    ZTS  2015          38.98          39.33          38.26          54.15   
1986    ZTS  2016          52.49          52.51          52.00          73.58   

      daily_traded_volume  