In [1]:
import boto3
import pandas as pd
from io import StringIO
from datetime import datetime as dt

In [2]:
src_bucket = 'etl-p2-data'

In [8]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(src_bucket)
objects = [obj for obj in bucket.objects.all()]

In [9]:
def read_csv_to_df(filename, decoding = 'utf-8', sep = ','):
    csv_obj = bucket.Object(key=filename).get().get('Body').read().decode(decoding)
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=sep)
    return df

In [10]:
df_all = pd.concat([read_csv_to_df(obj.key) for obj in objects], ignore_index=True)
df_all

Unnamed: 0,symbol,date,open,high,low,close,volume
0,AAL,1/2/2014,25.0700,25.8200,25.0600,25.3600,8998943
1,AAPL,1/2/2014,79.3828,79.5756,78.8601,79.0185,58791957
2,AAP,1/2/2014,110.3600,111.8800,109.2900,109.7400,542711
3,ABBV,1/2/2014,52.1200,52.3300,51.5200,51.9800,4569061
4,ABC,1/2/2014,70.1100,70.2300,69.4800,69.8900,1148391
...,...,...,...,...,...,...,...
497467,SNA,1/13/2016,165.0000,165.5900,161.3800,161.5100,478865
497468,SNI,1/13/2016,54.6300,55.2400,53.8200,54.6000,1829401
497469,SNPS,1/13/2016,42.1100,42.1700,40.3600,40.5300,1542464
497470,SO,1/13/2016,46.5500,47.0300,46.4400,46.8000,5860061


### Transformations

In [6]:
def transformations(df):
    
    df.dropna(inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df = df.groupby([ 'symbol', df['date'].dt.year], as_index=False).agg(
                                                              opening_price=('open', 'min'), 
                                                              closing_price=('close', 'min'), 
                                                              minimum_price=('low', 'min'),
                                                              maximum_price=('high','max'),
                                                              daily_traded_volume=('volume','sum'),
                                                              avg_opening_price=('open', 'mean'), 
                                                              avg_closing_price=('close', 'mean'), 
                                                              avg_minimum_price=('low', 'mean'),
                                                              avg_maximum_price=('high','mean'),
                                                              avg_daily_traded_volume=('volume','mean')
                                                                                )
    df['$_change_closing_price'] = df['closing_price'] - df['opening_price']
    df['%_change_closing_price'] = (df['$_change_closing_price']/df['closing_price'])*100
    df = df.round(decimals=2)
    
    df['Year'] = ''
    
    df.loc[df_all.index[range(0,len(df),4)], 'Year'] = "2014"
    df.loc[df_all.index[range(1,len(df),4)], 'Year'] = "2015"
    df.loc[df_all.index[range(2,len(df),4)], 'Year'] = "2016"
    df.loc[df_all.index[range(3,len(df),4)], 'Year'] = "2017"

    cols = list(df.columns.values)
    cols.insert(1, cols.pop(cols.index('Year')))
    df = df.loc[:, cols]
    
    return df

In [7]:
report1 = transformations(df_all)
report1

Unnamed: 0,symbol,Year,opening_price,closing_price,minimum_price,maximum_price,daily_traded_volume,avg_opening_price,avg_closing_price,avg_minimum_price,avg_maximum_price,avg_daily_traded_volume,$_change_closing_price,%_change_closing_price
0,A,2014,38.30,38.47,38.09,61.22,572795464,54.12,54.12,53.64,54.58,2272997.87,0.17,0.44
1,A,2015,33.18,33.37,33.12,43.59,645331754,39.59,39.60,39.23,39.94,2560840.29,0.19,0.57
2,A,2016,34.21,34.80,34.15,48.63,524961956,43.34,43.37,42.97,43.73,2083182.37,0.59,1.70
3,A,2017,45.93,46.49,45.74,70.93,464185914,59.33,59.34,58.92,59.73,1849346.27,0.56,1.20
4,AAL,2014,25.07,25.36,25.06,54.64,3124304404,38.86,38.92,38.23,39.55,12398033.35,0.29,1.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982,ZION,2016,38.88,38.99,38.43,52.20,623452328,44.27,44.25,43.79,44.71,2483873.82,0.11,0.28
1983,ZTS,2017,28.48,28.40,28.14,45.24,1015299846,34.03,34.06,33.75,34.33,4028967.64,-0.08,-0.28
1984,ZTS,2014,40.30,39.65,37.73,55.38,870016319,46.13,46.10,45.56,46.61,3452445.71,-0.65,-1.64
1985,ZTS,2015,38.98,39.33,38.26,54.15,947876473,47.83,47.83,47.35,48.24,3761414.58,0.35,0.89


### Write to s3

In [9]:
def write_to_s3():
    bucket = 'etl-p2-storage'
    file_name = 'stock_data_cleansed_' + dt.today().strftime("%Y%m%d_%H:%M:%S") + '.csv'
    
    csv_buffer = StringIO()
    report1.to_csv(csv_buffer)
    
    s3.Object(bucket, file_name).put(Body=csv_buffer.getvalue())

In [10]:
write_to_s3()