In [1]:
import boto3
import pandas as pd
from io import StringIO
from datetime import datetime as dt

In [2]:
# Name of the S3 bucket that holds the source CSV objects read by this notebook.
src_bucket = 'etl-p2-data'

In [3]:
# Open the S3 resource, bind the source bucket, and materialise the listing of
# every object it contains so the keys can be iterated more than once.
s3 = boto3.resource('s3')
bucket = s3.Bucket(src_bucket)
objects = list(bucket.objects.all())

In [4]:
def csv_to_df(filename):
    """Load one CSV object from the source bucket into a DataFrame.

    Parameters
    ----------
    filename : str
        Key of the object inside the module-level ``bucket``.

    Returns
    -------
    pandas.DataFrame
        The object's body, decoded as UTF-8 and parsed as comma-delimited CSV.
    """
    response = bucket.Object(key=filename).get()
    raw_text = response.get('Body').read().decode('utf-8')
    return pd.read_csv(StringIO(raw_text), delimiter=',')

# Read every object in the listing, stack the frames under a fresh 0..n-1
# index, then discard any row that has a missing value in any column.
frames = [csv_to_df(item.key) for item in objects]
df_all = pd.concat(frames, ignore_index=True)
df_all.dropna(inplace=True)

### Transformations

In [5]:
# Convert the 'date' column to datetimes so the `.dt` accessor can be used on it.
parsed_dates = pd.to_datetime(df_all['date'])
df_all['date'] = parsed_dates

In [6]:

df_all = df_all.groupby([ 'symbol', df_all['date'].dt.year], as_index=False).agg(
                                                              opening_price=('open', 'min'), 
                                                              closing_price=('close', 'min'), 
                                                              minimum_price=('low', 'min'),
                                                              maximum_price=('high','max'),
                                                              daily_traded_volume=('volume','sum'),
                                                              avg_opening_price=('open', 'mean'), 
                                                              avg_closing_price=('close', 'mean'), 
                                                              avg_minimum_price=('low', 'mean'),
                                                              avg_maximum_price=('high','mean'),
                                                              avg_daily_traded_volume=('volume','mean')
                                                                                )

In [7]:
df_all['$_change_closing_price'] = df_all['closing_price'] - df_all['opening_price']

In [8]:
df_all['%_change_closing_price'] = (df_all['$_change_closing_price']/df_all['closing_price'])*100


In [9]:
df_all = df_all.round(decimals=2)

In [10]:
df_all['Year'] = ''

In [11]:
df_all.loc[df_all.index[range(0,len(df_all),4)], 'Year'] = "2014"

In [12]:
df_all.loc[df_all.index[range(1,len(df_all),4)], 'Year'] = "2015"

In [13]:
df_all.loc[df_all.index[range(2,len(df_all),4)], 'Year'] = "2016"

In [14]:
df_all.loc[df_all.index[range(3,len(df_all),4)], 'Year'] = "2017"

In [15]:
cols = list(df_all.columns.values)
cols.insert(1, cols.pop(cols.index('Year')))

In [16]:
df_all = df_all.loc[:, cols]

In [17]:
# Show the aggregated frame (one row per symbol and year) for a visual check.
df_all

Unnamed: 0,symbol,Year,opening_price,closing_price,minimum_price,maximum_price,daily_traded_volume,avg_opening_price,avg_closing_price,avg_minimum_price,avg_maximum_price,avg_daily_traded_volume,$_change_closing_price,%_change_closing_price
0,A,2014,38.30,38.47,38.09,61.22,572795464,54.12,54.12,53.64,54.58,2272997.87,0.17,0.44
1,A,2015,33.18,33.37,33.12,43.59,645331754,39.59,39.60,39.23,39.94,2560840.29,0.19,0.57
2,A,2016,34.21,34.80,34.15,48.63,524961956,43.34,43.37,42.97,43.73,2083182.37,0.59,1.70
3,A,2017,45.93,46.49,45.74,70.93,464185914,59.33,59.34,58.92,59.73,1849346.27,0.56,1.20
4,AAL,2014,25.07,25.36,25.06,54.64,3124304404,38.86,38.92,38.23,39.55,12398033.35,0.29,1.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982,ZION,2016,38.88,38.99,38.43,52.20,623452328,44.27,44.25,43.79,44.71,2483873.82,0.11,0.28
1983,ZTS,2017,28.48,28.40,28.14,45.24,1015299846,34.03,34.06,33.75,34.33,4028967.64,-0.08,-0.28
1984,ZTS,2014,40.30,39.65,37.73,55.38,870016319,46.13,46.10,45.56,46.61,3452445.71,-0.65,-1.64
1985,ZTS,2015,38.98,39.33,38.26,54.15,947876473,47.83,47.83,47.35,48.24,3761414.58,0.35,0.89


### Write to S3

In [None]:
bucket = 'etl-p2-storage'
file_name = 'stock_data_cleansed_' + dt.today().strftime("%Y%m%d_%H:%M:%S") + '.csv'

In [None]:
csv_buffer = StringIO()
df_all.to_csv(csv_buffer)

In [None]:
s3.Object(bucket, file_name).put(Body=csv_buffer.getvalue())