In [1]:
import boto3
import pandas as pd
from io import StringIO
from datetime import datetime as dt

In [2]:
src_bucket = 'etl-p2-data'

In [3]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(src_bucket)
objects = [obj for obj in bucket.objects.all()]

In [4]:
def read_csv_to_df(filename, decoding = 'utf-8', sep = ','):
    csv_obj = bucket.Object(key=filename).get().get('Body').read().decode(decoding)
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=sep)
    return df

In [5]:
df_all = pd.concat([read_csv_to_df(obj.key) for obj in objects], ignore_index=True)

### Transformations

In [6]:
def transformations():
    
    df_all.dropna(inplace=True)
    df_all['date'] = pd.to_datetime(df_all['date'])
    df_all = df_all.groupby([ 'symbol', df_all['date'].dt.year], as_index=False).agg(
                                                              opening_price=('open', 'min'), 
                                                              closing_price=('close', 'min'), 
                                                              minimum_price=('low', 'min'),
                                                              maximum_price=('high','max'),
                                                              daily_traded_volume=('volume','sum'),
                                                              avg_opening_price=('open', 'mean'), 
                                                              avg_closing_price=('close', 'mean'), 
                                                              avg_minimum_price=('low', 'mean'),
                                                              avg_maximum_price=('high','mean'),
                                                              avg_daily_traded_volume=('volume','mean')
                                                                                )
    df_all['$_change_closing_price'] = df_all['closing_price'] - df_all['opening_price']
    df_all['%_change_closing_price'] = (df_all['$_change_closing_price']/df_all['closing_price'])*100
    df_all = df_all.round(decimals=2)
    
    df_all['Year'] = ''
    
    df_all.loc[df_all.index[range(0,len(df_all),4)], 'Year'] = "2014"
    df_all.loc[df_all.index[range(1,len(df_all),4)], 'Year'] = "2015"
    df_all.loc[df_all.index[range(2,len(df_all),4)], 'Year'] = "2016"
    df_all.loc[df_all.index[range(3,len(df_all),4)], 'Year'] = "2017"

    cols = list(df_all.columns.values)
    cols.insert(1, cols.pop(cols.index('Year')))
    df_all = df_all.loc[:, cols]
    
    return df_all

In [7]:
report1 = transformations()
report1

UnboundLocalError: local variable 'df_all' referenced before assignment

### Write to s3

In [None]:
def write_to_s3():
    bucket = 'etl-p2-storage'
    file_name = 'stock_data_cleansed_' + dt.today().strftime("%Y%m%d_%H:%M:%S") + '.csv'
    
    csv_buffer = StringIO()
    report1.to_csv(csv_buffer)
    
    s3.Object(bucket, file_name).put(Body=csv_buffer.getvalue())

In [None]:
write_to_s3()