In [2]:
from helper import get_req_handler, initBoto3Session, get_s3_objs
from dotenv import load_dotenv
import boto3
import os
from io import StringIO

import pandas as pd

# Camera to Location Map 

In [4]:
cam_to_loc = pd.read_csv("camera_station_mapping.csv")
cam_to_loc.head()

Unnamed: 0,cam_id,not_rainfall,rainfall,region
0,1001,S108,S119,Kallang
1,1002,S108,S215,Geylang
2,1003,S108,S123,Kallang
3,1004,S108,S215,Kallang
4,1005,S107,S221,Paya Lebar


# Set Up 

In [8]:
load_dotenv()

aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"]

client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

bucket_name = os.environ["BUCKET_NAME"]
data_dir = os.environ["DATA_DIR"]

In [12]:
def get_csv(file_name):
    csv_obj = client.get_object(Bucket=bucket_name, Key= data_dir + file_name)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string))
    df = df.drop("Unnamed: 0", axis = 1)
    return df

# Read Files 

In [13]:
rainfall_realtime = get_csv("rainfall-realtime.csv" )
rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,rainfall_realtime
0,2022-03-28T163000,S77,"(1.2937, 103.8125)",Alexandra Road,0.0
1,2022-03-28T163000,S109,"(1.3764, 103.8492)",Ang Mo Kio Avenue 5,0.2
2,2022-03-28T163000,S90,"(1.3191, 103.8191)",Bukit Timah Road,0.0
3,2022-03-28T163000,S114,"(1.38, 103.73)",Choa Chu Kang Avenue 4,0.8
4,2022-03-28T163000,S50,"(1.3337, 103.7768)",Clementi Road,0.0


In [14]:
non_rainfall_realtime = get_csv("non-rainfall-realtime.csv" )
non_rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,wind_speed_realtime,wind_dir_realtime,RH%_realtime,air_temp_realtime
0,2022-03-28T163500,S107,"(1.3135, 103.9625)",East Coast Parkway,3.0,285,86.8,26.9
1,2022-03-28T163500,S108,"(1.2799, 103.8703)",Marina Gardens Drive,4.4,339,99.4,26.7
2,2022-03-28T163500,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.8,39,74.7,28.2
3,2022-03-28T163500,S106,"(1.4168, 103.9673)",Pulau Ubin,1.5,217,85.1,26.2
4,2022-03-28T163500,S24,"(1.3678, 103.9826)",Upper Changi Road North,4.0,323,85.9,26.5


In [15]:
forecast_4d = get_csv("forecast-4DAY.csv")
forecast_4d.head()

Unnamed: 0,4day_date,4day_forecast,4day_temperature_low,4day_temperature_high,4day_relative_humidity_low,4day_relative_humidity_high,4day_wind_speed_low,4day_wind_speed_high,4day_wind_direction,4day_update_timestamp
0,2022-03-29,Afternoon thundery showers,24,35,55,95,10,20,NNW,2022-03-28T180212
1,2022-03-30,Late afternoon thundery showers,24,35,55,90,5,15,NNW,2022-03-28T180212
2,2022-03-31,Pre-dawn hours and early morning thundery showers,23,33,60,95,5,15,W,2022-03-28T180212
3,2022-04-01,Afternoon thundery showers,24,34,55,95,5,20,W,2022-03-28T180212


In [16]:
forecast_24h = get_csv("forecast-24HR.csv")
forecast_24h.head()

Unnamed: 0,24hr_start,24hr_end,24hr_general_forecast,24hr_general_relative_humidity_low,24hr_general_relative_humidity_high,24hr_general_temperature_low,24hr_general_temperature_high,24hr_general_wind_speed_low,24hr_general_wind_speed_high,24hr_general_wind_direction,...,24hr_period_2_forecast_central,24hr_period_2_forecast_south,24hr_period_2_forecast_north,24hr_period_3_start,24hr_period_3_end,24hr_period_3_forecast_west,24hr_period_3_forecast_east,24hr_period_3_forecast_central,24hr_period_3_forecast_south,24hr_period_3_forecast_north
0,2022-03-28T180000,2022-03-29T180000,Thundery Showers,55,95,24,35,10,20,NNW,...,Partly Cloudy (Day),Partly Cloudy (Day),Partly Cloudy (Day),2022-03-29T120000,2022-03-29T180000,Thundery Showers,Thundery Showers,Thundery Showers,Thundery Showers,Thundery Showers


In [106]:
forecast_24h.columns

Index(['24hr_start', '24hr_end', '24hr_general_forecast',
       '24hr_general_relative_humidity_low',
       '24hr_general_relative_humidity_high', '24hr_general_temperature_low',
       '24hr_general_temperature_high', '24hr_general_wind_speed_low',
       '24hr_general_wind_speed_high', '24hr_general_wind_direction',
       '24hr_period_1_start', '24hr_period_1_end',
       '24hr_period_1_forecast_west', '24hr_period_1_forecast_east',
       '24hr_period_1_forecast_central', '24hr_period_1_forecast_south',
       '24hr_period_1_forecast_north', '24hr_period_2_start',
       '24hr_period_2_end', '24hr_period_2_forecast_west',
       '24hr_period_2_forecast_east', '24hr_period_2_forecast_central',
       '24hr_period_2_forecast_south', '24hr_period_2_forecast_north',
       '24hr_period_3_start', '24hr_period_3_end',
       '24hr_period_3_forecast_west', '24hr_period_3_forecast_east',
       '24hr_period_3_forecast_central', '24hr_period_3_forecast_south',
       '24hr_period_3_foreca

In [17]:
forecast_2h = get_csv("forecast-2HR.csv")
forecast_2h.head()

Unnamed: 0,2hr_forecast_area,2hr_forecast_area_loc,2hr_forecast_value,2hr_start,2hr_end
0,Ang Mo Kio,"(1.375, 103.839)",Light Rain,2022-03-28T163000,2022-03-28T183000
1,Bedok,"(1.321, 103.924)",Light Rain,2022-03-28T163000,2022-03-28T183000
2,Bishan,"(1.350772, 103.839)",Light Rain,2022-03-28T163000,2022-03-28T183000
3,Boon Lay,"(1.304, 103.701)",Light Rain,2022-03-28T163000,2022-03-28T183000
4,Bukit Batok,"(1.353, 103.754)",Light Rain,2022-03-28T163000,2022-03-28T183000


# Processing 

So the objective here is to make the dataset into a useable one - ie the whole long row as we would input into a model   
Chances are going to have to process individually.  
The superkey to join all tables will be the timestamp except forecast idk how that will work  

QUESTION: ARE ALL THE TIMESTAMP THE SAME? 

## Realtime Rainfall 

In [18]:
# original view
rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,rainfall_realtime
0,2022-03-28T163000,S77,"(1.2937, 103.8125)",Alexandra Road,0.0
1,2022-03-28T163000,S109,"(1.3764, 103.8492)",Ang Mo Kio Avenue 5,0.2
2,2022-03-28T163000,S90,"(1.3191, 103.8191)",Bukit Timah Road,0.0
3,2022-03-28T163000,S114,"(1.38, 103.73)",Choa Chu Kang Avenue 4,0.8
4,2022-03-28T163000,S50,"(1.3337, 103.7768)",Clementi Road,0.0


In [21]:
rainfall_realtime = rainfall_realtime[["timestamp", "station_id", "rainfall_realtime"]]
rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,rainfall_realtime
0,2022-03-28T163000,S77,0.0
1,2022-03-28T163000,S109,0.2
2,2022-03-28T163000,S90,0.0
3,2022-03-28T163000,S114,0.8
4,2022-03-28T163000,S50,0.0


## Realtime non rainfall

In [25]:
non_rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,wind_speed_realtime,wind_dir_realtime,RH%_realtime,air_temp_realtime
0,2022-03-28T163500,S107,"(1.3135, 103.9625)",East Coast Parkway,3.0,285,86.8,26.9
1,2022-03-28T163500,S108,"(1.2799, 103.8703)",Marina Gardens Drive,4.4,339,99.4,26.7
2,2022-03-28T163500,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.8,39,74.7,28.2
3,2022-03-28T163500,S106,"(1.4168, 103.9673)",Pulau Ubin,1.5,217,85.1,26.2
4,2022-03-28T163500,S24,"(1.3678, 103.9826)",Upper Changi Road North,4.0,323,85.9,26.5


In [26]:
non_rainfall_realtime = non_rainfall_realtime.drop(["station_loc", "station_name"], axis = 1)
non_rainfall_realtime

Unnamed: 0,timestamp,station_id,wind_speed_realtime,wind_dir_realtime,RH%_realtime,air_temp_realtime
0,2022-03-28T163500,S107,3.0,285,86.8,26.9
1,2022-03-28T163500,S108,4.4,339,99.4,26.7
2,2022-03-28T163500,S44,5.8,39,74.7,28.2
3,2022-03-28T163500,S106,1.5,217,85.1,26.2
4,2022-03-28T163500,S24,4.0,323,85.9,26.5
...,...,...,...,...,...,...
459,2022-03-29T002000,S107,2.4,329,83.3,27.1
460,2022-03-29T002000,S108,5.9,359,92.9,27.7
461,2022-03-29T002000,S44,2.2,39,79.5,26.7
462,2022-03-29T002000,S106,1.0,120,96.8,24.1


## 4 Day Forecast

Need to double cfm with more data

In [108]:
forecast_4d = get_csv("forecast-4DAY.csv")

In [111]:
forecast_4d.head()
forecast_4d["4day_date"] = pd.to_datetime(forecast_4d["4day_date"])
forecast_4d["grp_row_num"] = forecast_4d.sort_values("4day_date").groupby("4day_update_timestamp")["4day_date"].cumcount()+1
forecast_4d = forecast_4d.drop("4day_date", axis = 1)

In [109]:
forecast_4d

Unnamed: 0,4day_date,4day_forecast,4day_temperature_low,4day_temperature_high,4day_relative_humidity_low,4day_relative_humidity_high,4day_wind_speed_low,4day_wind_speed_high,4day_wind_direction,4day_update_timestamp
0,2022-03-29,Afternoon thundery showers,24,35,55,95,10,20,NNW,2022-03-28T180212
1,2022-03-30,Late afternoon thundery showers,24,35,55,90,5,15,NNW,2022-03-28T180212
2,2022-03-31,Pre-dawn hours and early morning thundery showers,23,33,60,95,5,15,W,2022-03-28T180212
3,2022-04-01,Afternoon thundery showers,24,34,55,95,5,20,W,2022-03-28T180212


In [113]:
forecast_4d

Unnamed: 0,4day_update_timestamp,"(4day_forecast, 1)","(4day_forecast, 2)","(4day_forecast, 3)","(4day_forecast, 4)","(4day_temperature_low, 1)","(4day_temperature_low, 2)","(4day_temperature_low, 3)","(4day_temperature_low, 4)","(4day_temperature_high, 1)",...,"(4day_wind_speed_low, 3)","(4day_wind_speed_low, 4)","(4day_wind_speed_high, 1)","(4day_wind_speed_high, 2)","(4day_wind_speed_high, 3)","(4day_wind_speed_high, 4)","(4day_wind_direction, 1)","(4day_wind_direction, 2)","(4day_wind_direction, 3)","(4day_wind_direction, 4)"
0,2022-03-28T180212,Afternoon thundery showers,Late afternoon thundery showers,Pre-dawn hours and early morning thundery showers,Afternoon thundery showers,24,24,23,24,35,...,5,5,20,15,15,20,NNW,NNW,W,W


In [112]:
forecast_4d = forecast_4d.pivot(index = "4day_update_timestamp", columns = "grp_row_num")
forecast_4d.columns = forecast_4d.columns.to_flat_index()
forecast_4d = forecast_4d.reset_index()

In [96]:
forecast_4d.merge(non_rainfall_realtime, how = "inner", left_on = "4day_update_timestamp", right_on = "timestamp")

Unnamed: 0,4day_update_timestamp,"(4day_forecast, 1)","(4day_forecast, 2)","(4day_forecast, 3)","(4day_forecast, 4)","(4day_temperature_low, 1)","(4day_temperature_low, 2)","(4day_temperature_low, 3)","(4day_temperature_low, 4)","(4day_temperature_high, 1)",...,"(4day_wind_direction, 1)","(4day_wind_direction, 2)","(4day_wind_direction, 3)","(4day_wind_direction, 4)",timestamp,station_id,wind_speed_realtime,wind_dir_realtime,RH%_realtime,air_temp_realtime


In [104]:
forecast_4d[["4day_update_timestamp"]] # try based on nearest date

Unnamed: 0,4day_update_timestamp
0,2022-03-28T180212


In [105]:
non_rainfall_realtime.timestamp.unique()

array(['2022-03-28T163500', '2022-03-28T164500', '2022-03-28T165000',
       '2022-03-28T165500', '2022-03-28T170000', '2022-03-28T170500',
       '2022-03-28T171000', '2022-03-28T171500', '2022-03-28T172000',
       '2022-03-28T172500', '2022-03-28T173000', '2022-03-28T173500',
       '2022-03-28T174000', '2022-03-28T174500', '2022-03-28T175000',
       '2022-03-28T175500', '2022-03-28T180000', '2022-03-28T180500',
       '2022-03-28T181000', '2022-03-28T181500', '2022-03-28T182000',
       '2022-03-28T182500', '2022-03-28T183000', '2022-03-28T183500',
       '2022-03-28T184000', '2022-03-28T184500', '2022-03-28T185000',
       '2022-03-28T185500', '2022-03-28T190000', '2022-03-28T190500',
       '2022-03-28T191000', '2022-03-28T191500', '2022-03-28T192000',
       '2022-03-28T192500', '2022-03-28T193000', '2022-03-28T193500',
       '2022-03-28T194000', '2022-03-28T194500', '2022-03-28T195000',
       '2022-03-28T195500', '2022-03-28T200000', '2022-03-28T200500',
       '2022-03-28T2