This notebook performs the following tasks:
    1. Download .csv file(s) from an AWS S3 bucket, for specific date(s)/hour(s) or for a range of dates/hours, into a single pandas dataframe
    2. Keep only the records flagged as ISSR ('IsISSR' == 1)
    3. Save the filtered dataframe to the "partly-cloudy-common-area" S3 Bucket for idv input

References:

https://github.com/awslabs/aws-data-wrangler

### 1. Install/Load Libraries

In [1]:
pip install awswrangler

Collecting awswrangler
  Downloading awswrangler-2.8.0-py3-none-any.whl (179 kB)
[K     |████████████████████████████████| 179 kB 22.7 MB/s eta 0:00:01
[?25hCollecting redshift-connector~=2.0.0
  Downloading redshift_connector-2.0.881-py3-none-any.whl (91 kB)
[K     |████████████████████████████████| 91 kB 11.9 MB/s eta 0:00:01
Collecting pymysql<1.1.0,>=0.9.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 3.1 MB/s  eta 0:00:01
[?25hCollecting pg8000<1.20.0,>=1.16.0
  Downloading pg8000-1.19.5-py3-none-any.whl (34 kB)
Collecting scramp==1.4.0
  Downloading scramp-1.4.0-py3-none-any.whl (8.4 kB)
Installing collected packages: scramp, redshift-connector, pymysql, pg8000, awswrangler
Successfully installed awswrangler-2.8.0 pg8000-1.19.5 pymysql-1.0.2 redshift-connector-2.0.881 scramp-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sagemaker
import boto3
import awswrangler as wr

import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
from sagemaker import get_execution_role
# Look up the execution role of this SageMaker session.
# NOTE(review): `role` is not referenced by any later cell visible in this notebook.
role = get_execution_role()

### 2. Specify Input/Output S3 Buckets

Always set output_bucket = 'partly-cloudy-common-area', or set it to your own bucket.

Please do NOT write anything new to (i.e., do not set output_bucket to) any of the following buckets: (1) partly-cloudy-asdb, (2) partly-cloudy-rap-csv, or (3) partly-cloudy-rap-parquet

In [4]:
# Bucket the hourly .csv files are read from; Step 4 builds its file list against this bucket.
# Exactly one of the two input_bucket lines below should be active.
input_bucket = 'partly-cloudy-rap-csv' # <-- Use when reading from master .csv RAP data
#input_bucket = 'partly-cloudy-common-area' # <-- Use when reading file(s) from the team common area

# Bucket the filtered dataframe is written to in the final step.
output_bucket = 'partly-cloudy-common-area'
# Key prefix passed to the S3 listing of output_bucket; an empty string applies no prefix filter.
subfolder = ''

### 3. For reading specific file(s) from Input S3 Bucket to make a single Dataframe (Run this cell and skip Step 4)

For reading RAP .csv data for a range of dates/times, skip Step 3 and Run Step 4 instead

In [5]:
# Explicit list of S3 URIs to load; the entries may point at different buckets.
data_locations = ['s3://partly-cloudy-rap-csv/2021_05_01_13.csv',
                  's3://partly-cloudy-common-area/2021_05_02_20.csv'] # <---------------- Specify

# Echo the selection so it can be checked before the files are read in Step 5.
for data_location in data_locations:
    print(data_location)

s3://partly-cloudy-rap-csv/2021_05_01_13.csv
s3://partly-cloudy-common-area/2021_05_02_20.csv


### 4. Reading RAP .csv data for a range of dates/times

In [6]:
def get_dataLocations(bucket_name, firstDT, lastDT):
    """Build the S3 URIs of the hourly .csv files between two date-times.

    One URI is produced per hour, stepping from ``firstDT`` up to and
    including ``lastDT``, in the form
    ``s3://<bucket_name>/YYYY_MM_DD_HH.csv``.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket that holds the hourly files.
    firstDT, lastDT : str or datetime-like
        First and last hour of the range; anything ``pd.Timestamp`` can parse
        (e.g. ``'2021-05-01 00:00:00'``).

    Returns
    -------
    list of str
        The S3 URIs in chronological order. If ``firstDT`` is not earlier than
        ``lastDT``, the list holds the single URI for ``firstDT``.
    """
    # Parse before comparing: comparing the raw strings is lexicographic, so a
    # non-zero-padded input such as '2021-5-1 ...' would be ordered wrongly
    # and silently collapse the range to a single hour.
    first_ts = pd.Timestamp(firstDT)
    last_ts = pd.Timestamp(lastDT)
    if first_ts >= last_ts:
        last_ts = first_ts

    # A Timedelta step avoids depending on the spelling of the hourly
    # frequency alias, which differs between pandas versions.
    hourly = pd.date_range(first_ts, last_ts, freq=pd.Timedelta(hours=1))

    # strftime yields the zero-padded year/month/day/hour parts directly.
    return [f"s3://{bucket_name}/{ts.strftime('%Y_%m_%d_%H')}.csv" for ts in hourly]

In [9]:
# First and last hour to load; one file is listed for every hour from beginDT through endDT.
beginDT = '2021-05-01 00:00:00' # <---------------- Specify (between '2020-06-01 00:00:00' to '2021-05-30 23:00:00')
endDT =   '2021-05-01 23:00:00' # <-----------------Specify (between '2020-06-01 00:00:00' to '2021-05-30 23:00:00')

# NOTE: this replaces any data_locations list that was set by the Step 3 cell.
data_locations = get_dataLocations(input_bucket, beginDT, endDT)

# Echo the generated file list so it can be checked before the files are read in Step 5.
for data_location in data_locations:
    print(data_location)

s3://partly-cloudy-rap-csv/2021_05_01_00.csv
s3://partly-cloudy-rap-csv/2021_05_01_01.csv
s3://partly-cloudy-rap-csv/2021_05_01_02.csv
s3://partly-cloudy-rap-csv/2021_05_01_03.csv
s3://partly-cloudy-rap-csv/2021_05_01_04.csv
s3://partly-cloudy-rap-csv/2021_05_01_05.csv
s3://partly-cloudy-rap-csv/2021_05_01_06.csv
s3://partly-cloudy-rap-csv/2021_05_01_07.csv
s3://partly-cloudy-rap-csv/2021_05_01_08.csv
s3://partly-cloudy-rap-csv/2021_05_01_09.csv
s3://partly-cloudy-rap-csv/2021_05_01_10.csv
s3://partly-cloudy-rap-csv/2021_05_01_11.csv
s3://partly-cloudy-rap-csv/2021_05_01_12.csv
s3://partly-cloudy-rap-csv/2021_05_01_13.csv
s3://partly-cloudy-rap-csv/2021_05_01_14.csv
s3://partly-cloudy-rap-csv/2021_05_01_15.csv
s3://partly-cloudy-rap-csv/2021_05_01_16.csv
s3://partly-cloudy-rap-csv/2021_05_01_17.csv
s3://partly-cloudy-rap-csv/2021_05_01_18.csv
s3://partly-cloudy-rap-csv/2021_05_01_19.csv
s3://partly-cloudy-rap-csv/2021_05_01_20.csv
s3://partly-cloudy-rap-csv/2021_05_01_21.csv
s3://partl

### 5. Ingest Selected Files into a Single Dataframe
This section works with the file list produced by either Section 3 or Section 4.

In [10]:
# Read every file listed in data_locations from S3 into one dataframe.
# NOTE(review): the displayed frame has an 'Unnamed: 0' header, which suggests the source files
# carry a saved index column — confirm whether it should be kept.
df = wr.s3.read_csv(path= data_locations)

In [11]:
# Preview the combined frame (the notebook shows only the first and last rows).
df

Unnamed: 0,dateTime,hPa,FLevel,Nx,Ny,Lat,Lon,Temperature,RH_ice,IsISSR
0,2021-05-01 00:00:00,150,440,1,1,16.281000,-126.138000,204.567,56.21,0
1,2021-05-01 00:00:00,150,440,2,1,16.322011,-125.954684,204.567,62.81,0
2,2021-05-01 00:00:00,150,440,3,1,16.362789,-125.771252,204.629,70.29,0
3,2021-05-01 00:00:00,150,440,4,1,16.403332,-125.587705,204.567,77.14,0
4,2021-05-01 00:00:00,150,440,5,1,16.443642,-125.404045,204.442,82.89,0
...,...,...,...,...,...,...,...,...,...,...
21130195,2021-05-01 23:00:00,450,210,297,225,55.648911,-58.431595,243.881,13.90,0
21130196,2021-05-01 23:00:00,450,210,298,225,55.607604,-58.167947,243.819,16.40,0
21130197,2021-05-01 23:00:00,450,210,299,225,55.565986,-57.904583,243.819,18.39,0
21130198,2021-05-01 23:00:00,450,210,300,225,55.524058,-57.641507,243.881,19.37,0


### 6. Data Manipulation

In [12]:
# Keep only the rows flagged as ISSR; the copy makes df_select independent of df.
df_select = df[df['IsISSR'].eq(1)].copy()

In [13]:
# Preview the filtered rows (every row shown should have IsISSR == 1).
df_select

Unnamed: 0,dateTime,hPa,FLevel,Nx,Ny,Lat,Lon,Temperature,RH_ice,IsISSR
67737,2021-05-01 00:00:00,175,410,13,1,16.757641,-123.930719,211.428,100.23,1
67738,2021-05-01 00:00:00,175,410,14,1,16.795827,-123.746057,211.428,100.23,1
68471,2021-05-01 00:00:00,175,410,145,3,20.023556,-98.875552,211.178,100.61,1
68472,2021-05-01 00:00:00,175,410,146,3,20.028629,-98.681874,211.241,100.79,1
68473,2021-05-01 00:00:00,175,410,147,3,20.033442,-98.488181,211.241,100.57,1
...,...,...,...,...,...,...,...,...,...,...
20988510,2021-05-01 23:00:00,400,240,82,205,54.252498,-117.414163,230.670,100.44,1
20994351,2021-05-01 23:00:00,400,240,204,224,57.979857,-83.973376,232.858,100.17,1
20994650,2021-05-01 23:00:00,400,240,202,225,58.152658,-84.514715,232.858,100.17,1
20994651,2021-05-01 23:00:00,400,240,203,225,58.140960,-84.232502,232.858,100.53,1


### 7. Store Output to "partly-cloudy-common-area" Bucket

In [14]:
# List the files that are already in output_bucket (i.e., "partly-cloudy-common-area")
# under the `subfolder` prefix, so an existing name is not reused by accident.
conn = boto3.client('s3')

# Page through the listing: a single list call returns at most 1000 keys, and a
# response for a prefix with no objects has no 'Contents' entry at all.
contents = []
for page in conn.get_paginator('list_objects_v2').paginate(Bucket=output_bucket, Prefix=subfolder):
    contents.extend(page.get('Contents', []))

for f in contents:
    print(f['Key'])

JuneFirst2020_24hr_issr.csv
hourly_issr_summary.csv


In [None]:
# Object name to create in output_bucket. Replace the placeholder with a name that does not
# appear in the listing printed by the previous cell.
outputFileName = 'selectAfileName.csv' # <--------------------------------------------- Specify

In [None]:
# Write the filtered rows to the output bucket as a .csv, without the dataframe index.
# NOTE(review): `subfolder` is not part of this path, so the file is written at the top
# level of output_bucket — confirm that is intended.
wr.s3.to_csv(
    df=df_select,
    path=f"s3://{output_bucket}/{outputFileName}",
    index=False,
)