# Abstracted Form 

In [1]:
from get_files_dynamic import get_super_table
import time
import sys

In [2]:
tic = time.perf_counter()
quick_pull = get_super_table()
toc = time.perf_counter()
print(f"Got the super table in {toc - tic:0.4f} seconds")

Got the super table in 3.6552 seconds


In [3]:
sys.getsizeof(quick_pull) / (10**9)

0.383447669

In [4]:
from helper import get_req_handler, initBoto3Session, get_s3_objs
from dotenv import load_dotenv
import boto3
import os
from io import StringIO

import pandas as pd
import numpy as np

# Set Up 

In [5]:
load_dotenv()

aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"]

client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

bucket_name = os.environ["BUCKET_NAME"]
weather_dir = os.environ["WEATHER_DIR"]
bing_dir = os.environ["BING_DIR"]

In [6]:
def get_csv(file_name, data_dir):
    csv_obj = client.get_object(Bucket=bucket_name, Key= data_dir + file_name)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string))
    df = df.drop("Unnamed: 0", axis = 1)
    return df

# Camera to Location Map 

In [7]:
cam_to_loc = df = get_csv("camera_station_mapping.csv", "").rename(
    columns = {"rainfall" : "rainfall_station_id",
               "not_rainfall" : "non_rainfall_station_id",
               "region" : "2hr_forecast_area"})
cam_to_loc.head()

Unnamed: 0,cam_id,non_rainfall_station_id,rainfall_station_id,2hr_forecast_area,compass
0,1001,S108,S119,Kallang,south
1,1002,S108,S215,Geylang,east
2,1003,S108,S123,Kallang,central
3,1004,S108,S215,Kallang,central
4,1005,S107,S221,Paya Lebar,east


# Read Files 

IMPORTANT: Note all data will be consistent (sometime the stations wont appear)  
Time stamps are also different - shimar looking into it 

In [8]:
rainfall_realtime = get_csv("rainfall-realtime.csv", weather_dir)
rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,rainfall_realtime,call_timestamp
0,2022-04-07T100000,S77,"(1.2937, 103.8125)",Alexandra Road,0.0,2022-04-07T101422
1,2022-04-07T100000,S109,"(1.3764, 103.8492)",Ang Mo Kio Avenue 5,0.0,2022-04-07T101422
2,2022-04-07T100000,S90,"(1.3191, 103.8191)",Bukit Timah Road,0.0,2022-04-07T101422
3,2022-04-07T100000,S114,"(1.38, 103.73)",Choa Chu Kang Avenue 4,0.0,2022-04-07T101422
4,2022-04-07T100000,S50,"(1.3337, 103.7768)",Clementi Road,0.0,2022-04-07T101422


In [9]:
non_rainfall_realtime = get_csv("non-rainfall-realtime.csv", weather_dir)
non_rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,wind_speed_realtime,wind_dir_realtime,call_timestamp,RH%_realtime,air_temp_realtime
0,2022-04-07T101000,S107,"(1.3135, 103.9625)",East Coast Parkway,11.9,220.0,2022-04-07T101422,,
1,2022-04-07T101000,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.2,205.0,2022-04-07T101422,,
2,2022-04-07T101000,S24,"(1.3678, 103.9826)",Upper Changi Road North,4.0,281.0,2022-04-07T101422,,
3,2022-04-07T101500,S107,"(1.3135, 103.9625)",East Coast Parkway,11.2,225.0,2022-04-07T102045,82.4,26.5
4,2022-04-07T101500,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.2,206.0,2022-04-07T102045,87.8,24.9


In [10]:
non_rainfall_realtime.station_id.unique()

array(['S107', 'S44', 'S24', 'S116', 'S50', 'S100'], dtype=object)

In [11]:
non_rainfall_realtime[["station_id", "call_timestamp"]].groupby("station_id").count()

Unnamed: 0_level_0,call_timestamp
station_id,Unnamed: 1_level_1
S100,1
S107,915
S116,6
S24,862
S44,250
S50,123


In [12]:
non_rainfall_realtime.dropna()[["station_id", "call_timestamp"]].groupby("station_id").count()

Unnamed: 0_level_0,call_timestamp
station_id,Unnamed: 1_level_1
S100,1
S107,884
S116,6
S24,835
S44,240
S50,120


In [9]:
len(non_rainfall_realtime.call_timestamp.unique())

901

In [10]:
non_rainfall_realtime[non_rainfall_realtime.call_timestamp == "2022-04-05T162939"]

Unnamed: 0,timestamp,station_id,station_loc,station_name,wind_speed_realtime,wind_dir_realtime,call_timestamp,RH%_realtime,air_temp_realtime


In [11]:
forecast_4d = get_csv("forecast-4DAY.csv", weather_dir)
forecast_4d.head()

Unnamed: 0,4day_date,4day_forecast,4day_temperature_low,4day_temperature_high,4day_relative_humidity_low,4day_relative_humidity_high,4day_wind_speed_low,4day_wind_speed_high,4day_wind_direction,4day_update_timestamp,call_timestamp
0,2022-04-08T000000,Pre-dawn and early morning thundery showers,23,33,60,95,10,20,W,2022-04-07T043212,2022-04-07T101422
1,2022-04-09T000000,Afternoon thundery showers,24,34,60,95,10,20,W,2022-04-07T043212,2022-04-07T101422
2,2022-04-10T000000,Afternoon thundery showers,24,34,55,95,5,15,W,2022-04-07T043212,2022-04-07T101422
3,2022-04-11T000000,Morning thundery showers,23,33,55,95,5,15,SW,2022-04-07T043212,2022-04-07T101422
4,2022-04-08T000000,Pre-dawn and early morning thundery showers,23,33,60,95,10,20,W,2022-04-07T043212,2022-04-07T102045


In [12]:
forecast_24h = get_csv("forecast-24HR.csv", weather_dir)
forecast_24h.head()

Unnamed: 0,24hr_start,24hr_end,24hr_general_forecast,24hr_general_relative_humidity_low,24hr_general_relative_humidity_high,24hr_general_temperature_low,24hr_general_temperature_high,24hr_general_wind_speed_low,24hr_general_wind_speed_high,24hr_general_wind_direction,...,24hr_period_2_forecast_south,24hr_period_2_forecast_north,24hr_period_3_start,24hr_period_3_end,24hr_period_3_forecast_west,24hr_period_3_forecast_east,24hr_period_3_forecast_central,24hr_period_3_forecast_south,24hr_period_3_forecast_north,call_timestamp
0,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T101422
1,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T102045
2,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T102706
3,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T103335
4,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T103957


In [13]:
forecast_2h = get_csv("forecast-2HR.csv", weather_dir)
forecast_2h.head()

Unnamed: 0,2hr_forecast_area,2hr_forecast_area_loc,2hr_forecast_value,2hr_start,2hr_end,call_timestamp
0,Ang Mo Kio,"(1.375, 103.839)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
1,Bedok,"(1.321, 103.924)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
2,Bishan,"(1.350772, 103.839)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
3,Boon Lay,"(1.304, 103.701)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
4,Bukit Batok,"(1.353, 103.754)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422


In [14]:
bing_data = get_csv("all-congestion-levels.csv", bing_dir)
bing_data.head()

Unnamed: 0,camera_id,direction,trafficCongestion,travelDistance,travelDuration,travelDurationTraffic,distanceUnit,durationUnit,call_timestamp,cam_loc,remarks,dir_start,dir_finish
0,1001,1,,0.1,4,3,Kilometer,Second,2022-04-07T101422,"(1.29531332, 103.871146)",TRUE (left),"(1.2952719096725678, 103.87068089160921)","(1.2954046383155724, 103.87157229738281)"
1,1001,2,,0.099,4,3,Kilometer,Second,2022-04-07T101422,"(1.29531332, 103.871146)",TRUE (left),"(1.295216922089859, 103.87161591936747)","(1.29509936242626, 103.87072261698584)"
2,1002,1,Mild,0.099,5,6,Kilometer,Second,2022-04-07T101422,"(1.319541067, 103.8785627)",TRUE (left),"(1.319649819147653, 103.87894320461515)","(1.3195731538491529, 103.87805455346877)"
3,1002,2,Medium,0.1,4,5,Kilometer,Second,2022-04-07T101422,"(1.319541067, 103.8785627)",TRUE (left),"(1.319965499763606, 103.8780229770321)","(1.320015106713897, 103.8789296718566)"
4,1003,1,,0.105,6,5,Kilometer,Second,2022-04-07T101422,"(1.323957439, 103.8728576)",SPECIAL (5 lanes),"(1.3243485357876017, 103.87259422178255)","(1.3236450201732457, 103.87318289678052)"


In [15]:
bing_key_table = get_csv("route_data.csv", bing_dir)
bing_key_table.head()

Unnamed: 0,camera_id,direction,cam_loc,remarks,dir_start,dir_finish
0,1001,1,"(1.29531332, 103.871146)",TRUE (left),"(1.2952719096725678, 103.87068089160921)","(1.2954046383155724, 103.87157229738281)"
1,1001,2,"(1.29531332, 103.871146)",TRUE (left),"(1.295216922089859, 103.87161591936747)","(1.29509936242626, 103.87072261698584)"
2,1002,1,"(1.319541067, 103.8785627)",TRUE (left),"(1.319649819147653, 103.87894320461515)","(1.3195731538491529, 103.87805455346877)"
3,1002,2,"(1.319541067, 103.8785627)",TRUE (left),"(1.319965499763606, 103.8780229770321)","(1.320015106713897, 103.8789296718566)"
4,1003,1,"(1.323957439, 103.8728576)",SPECIAL (5 lanes),"(1.3243485357876017, 103.87259422178255)","(1.3236450201732457, 103.87318289678052)"


# Processing 

So the objective here is to make the dataset into a useable one - ie the whole long row as we would input into a model   
Chances are going to have to process individually.  
The superkey to join all tables will be the timestamp except forecast idk how that will work  

## Realtime Rainfall 

In [16]:
# original view
rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,rainfall_realtime,call_timestamp
0,2022-04-07T100000,S77,"(1.2937, 103.8125)",Alexandra Road,0.0,2022-04-07T101422
1,2022-04-07T100000,S109,"(1.3764, 103.8492)",Ang Mo Kio Avenue 5,0.0,2022-04-07T101422
2,2022-04-07T100000,S90,"(1.3191, 103.8191)",Bukit Timah Road,0.0,2022-04-07T101422
3,2022-04-07T100000,S114,"(1.38, 103.73)",Choa Chu Kang Avenue 4,0.0,2022-04-07T101422
4,2022-04-07T100000,S50,"(1.3337, 103.7768)",Clementi Road,0.0,2022-04-07T101422


In [17]:
rainfall_realtime = rainfall_realtime[["call_timestamp", "station_id", "rainfall_realtime"]]
rainfall_realtime = rainfall_realtime.rename(columns = {"station_id" : "rainfall_station_id"})
rainfall_realtime.head()

Unnamed: 0,call_timestamp,rainfall_station_id,rainfall_realtime
0,2022-04-07T101422,S77,0.0
1,2022-04-07T101422,S109,0.0
2,2022-04-07T101422,S90,0.0
3,2022-04-07T101422,S114,0.0
4,2022-04-07T101422,S50,0.0


## Realtime non rainfall

In [18]:
non_rainfall_realtime.head()

Unnamed: 0,timestamp,station_id,station_loc,station_name,wind_speed_realtime,wind_dir_realtime,call_timestamp,RH%_realtime,air_temp_realtime
0,2022-04-07T101000,S107,"(1.3135, 103.9625)",East Coast Parkway,11.9,220,2022-04-07T101422,,
1,2022-04-07T101000,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.2,205,2022-04-07T101422,,
2,2022-04-07T101000,S24,"(1.3678, 103.9826)",Upper Changi Road North,4.0,281,2022-04-07T101422,,
3,2022-04-07T101500,S107,"(1.3135, 103.9625)",East Coast Parkway,11.2,225,2022-04-07T102045,82.4,26.5
4,2022-04-07T101500,S44,"(1.34583, 103.68166)",Nanyang Avenue,5.2,206,2022-04-07T102045,87.8,24.9


In [19]:
non_rainfall_realtime = non_rainfall_realtime.rename(columns = {"RH%_realtime" : "humidity_realtime", 
                                                               "station_id" : "non_rainfall_station_id"})

In [20]:
non_rainfall_realtime = non_rainfall_realtime[["call_timestamp", "non_rainfall_station_id", "wind_speed_realtime", 
                                               "wind_dir_realtime", "humidity_realtime", "air_temp_realtime"]]
non_rainfall_realtime.head()

Unnamed: 0,call_timestamp,non_rainfall_station_id,wind_speed_realtime,wind_dir_realtime,humidity_realtime,air_temp_realtime
0,2022-04-07T101422,S107,11.9,220,,
1,2022-04-07T101422,S44,5.2,205,,
2,2022-04-07T101422,S24,4.0,281,,
3,2022-04-07T102045,S107,11.2,225,82.4,26.5
4,2022-04-07T102045,S44,5.2,206,87.8,24.9


## 4 Day Forecast

In [21]:
forecast_4d["4day_date"] = pd.to_datetime(forecast_4d["4day_date"])
forecast_4d["grp_row_num"] = forecast_4d.sort_values("4day_date").groupby(["4day_update_timestamp", "call_timestamp"])["4day_date"].cumcount()+1
forecast_4d["grp_row_num"] = forecast_4d.grp_row_num.apply(lambda x : str(x))
forecast_4d = forecast_4d.drop("4day_date", axis = 1)
forecast_4d = forecast_4d.pivot(index = "call_timestamp", columns = "grp_row_num")
forecast_4d.columns = ['_'.join(col) for col in forecast_4d.columns]
forecast_4d = forecast_4d.drop(["4day_update_timestamp_1", "4day_update_timestamp_2", 
                  "4day_update_timestamp_3", "4day_update_timestamp_4"], axis = 1)
forecast_4d = forecast_4d.reset_index()
forecast_4d.head()

Unnamed: 0,call_timestamp,4day_forecast_1,4day_forecast_2,4day_forecast_3,4day_forecast_4,4day_temperature_low_1,4day_temperature_low_2,4day_temperature_low_3,4day_temperature_low_4,4day_temperature_high_1,...,4day_wind_speed_low_3,4day_wind_speed_low_4,4day_wind_speed_high_1,4day_wind_speed_high_2,4day_wind_speed_high_3,4day_wind_speed_high_4,4day_wind_direction_1,4day_wind_direction_2,4day_wind_direction_3,4day_wind_direction_4
0,2022-04-07T101422,Pre-dawn and early morning thundery showers,Afternoon thundery showers,Afternoon thundery showers,Morning thundery showers,23,24,24,23,33,...,5,5,20,20,15,15,W,W,W,SW
1,2022-04-07T102045,Pre-dawn and early morning thundery showers,Afternoon thundery showers,Afternoon thundery showers,Morning thundery showers,23,24,24,23,33,...,5,5,20,20,15,15,W,W,W,SW
2,2022-04-07T102706,Pre-dawn and early morning thundery showers,Afternoon thundery showers,Afternoon thundery showers,Morning thundery showers,23,24,24,23,33,...,5,5,20,20,15,15,W,W,W,SW
3,2022-04-07T103335,Pre-dawn and early morning thundery showers,Afternoon thundery showers,Afternoon thundery showers,Morning thundery showers,23,24,24,23,33,...,5,5,20,20,15,15,W,W,W,SW
4,2022-04-07T103957,Pre-dawn and early morning thundery showers,Afternoon thundery showers,Afternoon thundery showers,Morning thundery showers,23,24,24,23,33,...,5,5,20,20,15,15,W,W,W,SW


## 24 Hour Forecast

No processing needed. But got NA to drop 

In [22]:
forecast_24h.head()

Unnamed: 0,24hr_start,24hr_end,24hr_general_forecast,24hr_general_relative_humidity_low,24hr_general_relative_humidity_high,24hr_general_temperature_low,24hr_general_temperature_high,24hr_general_wind_speed_low,24hr_general_wind_speed_high,24hr_general_wind_direction,...,24hr_period_2_forecast_south,24hr_period_2_forecast_north,24hr_period_3_start,24hr_period_3_end,24hr_period_3_forecast_west,24hr_period_3_forecast_east,24hr_period_3_forecast_central,24hr_period_3_forecast_south,24hr_period_3_forecast_north,call_timestamp
0,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T101422
1,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T102045
2,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T102706
3,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T103335
4,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,Partly Cloudy (Day),Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy,Cloudy,Cloudy,Cloudy,Cloudy,2022-04-07T103957


In [23]:
forecast_24h.isnull().any()

24hr_start                             False
24hr_end                               False
24hr_general_forecast                  False
24hr_general_relative_humidity_low     False
24hr_general_relative_humidity_high    False
24hr_general_temperature_low           False
24hr_general_temperature_high          False
24hr_general_wind_speed_low            False
24hr_general_wind_speed_high           False
24hr_general_wind_direction            False
24hr_period_1_start                    False
24hr_period_1_end                      False
24hr_period_1_forecast_west            False
24hr_period_1_forecast_east            False
24hr_period_1_forecast_central         False
24hr_period_1_forecast_south           False
24hr_period_1_forecast_north           False
24hr_period_2_start                    False
24hr_period_2_end                      False
24hr_period_2_forecast_west            False
24hr_period_2_forecast_east            False
24hr_period_2_forecast_central         False
24hr_perio

In [24]:
forecast_24h = forecast_24h.drop(["24hr_period_4_start", "24hr_period_4_end", "24hr_period_4_forecast_west", 
                                 "24hr_period_4_forecast_east", "24hr_period_4_forecast_central", 
                                 "24hr_period_4_forecast_south", "24hr_period_4_forecast_north"], axis = 1)

KeyError: "['24hr_period_4_start', '24hr_period_4_end', '24hr_period_4_forecast_west', '24hr_period_4_forecast_east', '24hr_period_4_forecast_central', '24hr_period_4_forecast_south', '24hr_period_4_forecast_north'] not found in axis"

In [25]:
long_df1 = pd.melt(forecast_24h, id_vars = ["call_timestamp", "24hr_period_1_start", "24hr_period_1_end"],
        value_vars= ["24hr_period_1_forecast_west", 
                                 "24hr_period_1_forecast_east", "24hr_period_1_forecast_central", 
                                 "24hr_period_1_forecast_south", "24hr_period_1_forecast_north"])

long_df2 = pd.melt(forecast_24h, id_vars = ["call_timestamp", "24hr_period_2_start", "24hr_period_2_end"],
        value_vars= ["24hr_period_2_forecast_west", 
                                 "24hr_period_2_forecast_east", "24hr_period_2_forecast_central", 
                                 "24hr_period_2_forecast_south", "24hr_period_2_forecast_north"])

long_df3 = pd.melt(forecast_24h, id_vars = ["call_timestamp", "24hr_period_3_start", "24hr_period_3_end"],
        value_vars= ["24hr_period_3_forecast_west", 
                                 "24hr_period_3_forecast_east", "24hr_period_3_forecast_central", 
                                 "24hr_period_3_forecast_south", "24hr_period_3_forecast_north"])

In [26]:
long_df1["compass"] = long_df1.variable.apply(lambda x : x.split("_")[-1])
long_df2["compass"] = long_df2.variable.apply(lambda x : x.split("_")[-1])
long_df3["compass"] = long_df3.variable.apply(lambda x : x.split("_")[-1])

In [27]:
long_df1 = long_df1.drop("variable", axis = 1).rename(columns = {"value" : "24hr_period_1"})
long_df2 = long_df2.drop("variable", axis = 1).rename(columns = {"value" : "24hr_period_2"})
long_df3 = long_df3.drop("variable", axis = 1).rename(columns = {"value" : "24hr_period_3"})

In [28]:
forecast_24h = forecast_24h.drop(["24hr_period_1_forecast_west", 
                                 "24hr_period_1_forecast_east", "24hr_period_1_forecast_central", 
                                 "24hr_period_1_forecast_south", "24hr_period_1_forecast_north", 
                                 "24hr_period_2_forecast_west", 
                                 "24hr_period_2_forecast_east", "24hr_period_2_forecast_central", 
                                 "24hr_period_2_forecast_south", "24hr_period_2_forecast_north", 
                                 "24hr_period_3_forecast_west", 
                                 "24hr_period_3_forecast_east", "24hr_period_3_forecast_central", 
                                 "24hr_period_3_forecast_south", "24hr_period_3_forecast_north", 
                                 "24hr_period_1_start", "24hr_period_1_end", 
                                 "24hr_period_2_start", "24hr_period_2_end", 
                                 "24hr_period_3_start", "24hr_period_3_end"], axis = 1)

In [29]:
forecast_24h = forecast_24h\
.merge(long_df1, how = "inner", on = "call_timestamp")\
.merge(long_df2, how = "inner", on = ["call_timestamp", "compass"])\
.merge(long_df3, how = "inner", on = ["call_timestamp", "compass"])

In [30]:
forecast_24h.head()

Unnamed: 0,24hr_start,24hr_end,24hr_general_forecast,24hr_general_relative_humidity_low,24hr_general_relative_humidity_high,24hr_general_temperature_low,24hr_general_temperature_high,24hr_general_wind_speed_low,24hr_general_wind_speed_high,24hr_general_wind_direction,...,24hr_period_1_start,24hr_period_1_end,24hr_period_1,compass,24hr_period_2_start,24hr_period_2_end,24hr_period_2,24hr_period_3_start,24hr_period_3_end,24hr_period_3
0,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,2022-04-07T060000,2022-04-07T120000,Showers,west,2022-04-07T120000,2022-04-07T180000,Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy
1,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,2022-04-07T060000,2022-04-07T120000,Showers,east,2022-04-07T120000,2022-04-07T180000,Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy
2,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,2022-04-07T060000,2022-04-07T120000,Showers,central,2022-04-07T120000,2022-04-07T180000,Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy
3,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,2022-04-07T060000,2022-04-07T120000,Showers,south,2022-04-07T120000,2022-04-07T180000,Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy
4,2022-04-07T060000,2022-04-08T060000,Showers,60,95,24,33,5,15,NW,...,2022-04-07T060000,2022-04-07T120000,Showers,north,2022-04-07T120000,2022-04-07T180000,Partly Cloudy (Day),2022-04-07T180000,2022-04-08T060000,Cloudy


## 2 Hour Forecast

In [31]:
forecast_2h.head()

Unnamed: 0,2hr_forecast_area,2hr_forecast_area_loc,2hr_forecast_value,2hr_start,2hr_end,call_timestamp
0,Ang Mo Kio,"(1.375, 103.839)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
1,Bedok,"(1.321, 103.924)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
2,Bishan,"(1.350772, 103.839)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
3,Boon Lay,"(1.304, 103.701)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422
4,Bukit Batok,"(1.353, 103.754)",Showers,2022-04-07T100000,2022-04-07T120000,2022-04-07T101422


In [32]:
forecast_2h = forecast_2h.drop(["2hr_forecast_area_loc", "2hr_start", "2hr_end"], axis = 1)
forecast_2h.head()

Unnamed: 0,2hr_forecast_area,2hr_forecast_value,call_timestamp
0,Ang Mo Kio,Showers,2022-04-07T101422
1,Bedok,Showers,2022-04-07T101422
2,Bishan,Showers,2022-04-07T101422
3,Boon Lay,Showers,2022-04-07T101422
4,Bukit Batok,Showers,2022-04-07T101422


## Bing Data 

In [33]:
bing_key_table = bing_key_table[["camera_id", "direction"]].rename(columns = {"camera_id" : "cam_id"})

In [34]:
bing_data = bing_data[["camera_id", "direction", "call_timestamp", "trafficCongestion",
                      "travelDistance", "travelDuration", "travelDurationTraffic"]]\
.rename(columns = {"camera_id" : "cam_id"})

In [35]:
bing_data["ratio"] = bing_data.travelDurationTraffic / bing_data.travelDuration

In [36]:
bing_data.head()

Unnamed: 0,cam_id,direction,call_timestamp,trafficCongestion,travelDistance,travelDuration,travelDurationTraffic,ratio
0,1001,1,2022-04-07T101422,,0.1,4,3,0.75
1,1001,2,2022-04-07T101422,,0.099,4,3,0.75
2,1002,1,2022-04-07T101422,Mild,0.099,5,6,1.2
3,1002,2,2022-04-07T101422,Medium,0.1,4,5,1.25
4,1003,1,2022-04-07T101422,,0.105,6,5,0.833333


In [37]:
bing_data[["trafficCongestion", "ratio"]].groupby("trafficCongestion").agg([min, max, np.mean])

Unnamed: 0_level_0,ratio,ratio,ratio
Unnamed: 0_level_1,min,max,mean
trafficCongestion,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Heavy,1.5,12.75,1.926756
Medium,1.25,1.464286,1.268546
Mild,1.1,1.242857,1.184653
,0.0,1.096774,0.901675


In [38]:
bing_data[["trafficCongestion", "cam_id"]].groupby("trafficCongestion").count() # need to look out for unknowns 

Unnamed: 0_level_0,cam_id
trafficCongestion,Unnamed: 1_level_1
Heavy,1250
Medium,2258
Mild,711
,4552


So basically clear case of correlated variables 

In [39]:
bing_data[bing_data.trafficCongestion == "Unknown"]

Unnamed: 0,cam_id,direction,call_timestamp,trafficCongestion,travelDistance,travelDuration,travelDurationTraffic,ratio


In [40]:
bing_data[bing_data.cam_id == 3702]

Unnamed: 0,cam_id,direction,call_timestamp,trafficCongestion,travelDistance,travelDuration,travelDurationTraffic,ratio
57,3702,1,2022-04-07T101422,,0.201,8,8,1.000
58,3702,2,2022-04-07T101422,,0.198,8,7,0.875
236,3702,1,2022-04-07T102045,,0.201,8,8,1.000
237,3702,2,2022-04-07T102045,,0.198,8,7,0.875
415,3702,1,2022-04-07T102706,,0.201,8,8,1.000
...,...,...,...,...,...,...,...,...
8292,3702,2,2022-04-07T150925,,0.198,8,7,0.875
8470,3702,1,2022-04-07T151546,,0.201,8,8,1.000
8471,3702,2,2022-04-07T151546,,0.198,8,7,0.875
8649,3702,1,2022-04-07T152209,,0.201,8,8,1.000


In [41]:
bing_data = bing_data[["cam_id", "direction", "call_timestamp", "trafficCongestion"]]
##bing_data = bing_data[bing_data.trafficCongestion != "Unknown"]

In [42]:
bing_data[bing_data.call_timestamp == "2022-04-05T162939"]

Unnamed: 0,cam_id,direction,call_timestamp,trafficCongestion


# Combining Tables 

Logic would be to first gather all the time that EVERY dataset has.  
This is to allow for it to be able to join all table. 
Also need to expand the camera mapping 

In [43]:
cam_all = cam_to_loc.merge(bing_key_table, on = "cam_id")

In [44]:
all_time = set(rainfall_realtime.call_timestamp)\
.intersection(set(non_rainfall_realtime.call_timestamp))\
.intersection(set(forecast_4d.call_timestamp))\
.intersection(set(forecast_24h.call_timestamp))\
.intersection(set(forecast_2h.call_timestamp))\
.intersection(set(bing_data.call_timestamp))

In [45]:
all_time = pd.DataFrame(all_time, columns = ["call_timestamp"])

In [46]:
all_time.sort_values("call_timestamp")

Unnamed: 0,call_timestamp
24,2022-04-07T101422
40,2022-04-07T102045
2,2022-04-07T102706
48,2022-04-07T103335
45,2022-04-07T103957
43,2022-04-07T104622
42,2022-04-07T105244
0,2022-04-07T110012
37,2022-04-07T110636
44,2022-04-07T111301


Next step is to cross join this with the camera.  
All cameras should have entries for all time. 

In [47]:
base_df = all_time.merge(cam_all, how = "cross") # change to cam_all

In [48]:
base_df

Unnamed: 0,call_timestamp,cam_id,non_rainfall_station_id,rainfall_station_id,2hr_forecast_area,compass,direction
0,2022-04-07T110012,1001,S108,S119,Kallang,south,1
1,2022-04-07T110012,1001,S108,S119,Kallang,south,2
2,2022-04-07T110012,1002,S108,S215,Geylang,east,1
3,2022-04-07T110012,1002,S108,S215,Geylang,east,2
4,2022-04-07T110012,1003,S108,S123,Kallang,central,1
...,...,...,...,...,...,...,...
8766,2022-04-07T103335,9704,S44,S40,Mandai,north,2
8767,2022-04-07T103335,9705,S44,S104,Mandai,north,1
8768,2022-04-07T103335,9705,S44,S104,Mandai,north,2
8769,2022-04-07T103335,9706,S44,S40,Mandai,north,1


In [49]:
df = base_df.merge(rainfall_realtime, on = ["call_timestamp", "rainfall_station_id"], how = "left")

In [50]:
len(df) == len(base_df)

True

In [51]:
df2 = df.merge(non_rainfall_realtime, on = ["call_timestamp", "non_rainfall_station_id"], how = "left")

In [52]:
len(df2) == len(base_df)

True

In [53]:
df3 = df2.merge(forecast_2h, on = ["call_timestamp", "2hr_forecast_area"], how = "left")

In [54]:
len(df3) == len(base_df)

True

In [55]:
df4 = df3.merge(forecast_4d, on = ["call_timestamp"])

In [56]:
len(df4) == len(base_df)

True

In [57]:
df5 = df4.merge(forecast_24h, on = ["call_timestamp", "compass"])

In [58]:
len(df5) == len(base_df)

True

In [59]:
df6 = df5.merge(bing_data, on = ["call_timestamp", "cam_id", "direction"])

In [60]:
len(df6) == len(base_df)

True