https://aws-data-wrangler.readthedocs.io/en/stable/stubs/awswrangler.s3.read_csv.html

This notebook does the following:
1. Assigns each ADS-B record to a NOAA RAP cell by finding the cell with the minimum distance from the aircraft.
2. Saves the assigned records as CSV files in an S3 bucket.

In [1]:
pip install awswrangler

Collecting awswrangler
  Downloading awswrangler-2.10.0-py3-none-any.whl (184 kB)
[K     |████████████████████████████████| 184 kB 40.4 MB/s eta 0:00:01
[?25hCollecting pymysql<1.1.0,>=0.9.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 3.5 MB/s  eta 0:00:01
Collecting pg8000<1.21.0,>=1.16.0
  Downloading pg8000-1.20.0-py3-none-any.whl (34 kB)
Collecting redshift-connector~=2.0.882
  Downloading redshift_connector-2.0.883-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 525 kB/s  eta 0:00:01
Collecting scramp>=1.4.0
  Downloading scramp-1.4.0-py3-none-any.whl (8.4 kB)
Installing collected packages: scramp, redshift-connector, pymysql, pg8000, awswrangler
Successfully installed awswrangler-2.10.0 pg8000-1.20.0 pymysql-1.0.2 redshift-connector-2.0.883 scramp-1.4.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may

In [2]:
pip install haversine

Collecting haversine
  Downloading haversine-2.3.1-py2.py3-none-any.whl (5.5 kB)
Installing collected packages: haversine
Successfully installed haversine-2.3.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import sagemaker
import boto3
import awswrangler as wr

import pandas as pd
import numpy as np
from datetime import datetime

import json
from haversine import haversine, Unit
from scipy.spatial import distance

In [4]:
# Source of the raw ADS-B JSON files; the empty string is used as the listing prefix.
input_bucket, input_subfolder = 'adsb-data', ''

# Destination bucket and key prefix for the ADS-B / RAP merge outputs.
output_bucket, output_subfolder = 'partly-cloudy-common-area', 'adbs-conus-rap-merge'

In [5]:
# Execution role of this SageMaker session, looked up through the
# already-imported sagemaker package.
role = sagemaker.get_execution_role()

In [None]:
# Examine the files in the 'adsb-data' s3 Bucket.
# A single list_objects call returns at most 1000 keys, so the listing is
# paginated; pages without a 'Contents' entry (nothing under the prefix)
# contribute no objects instead of raising KeyError.
conn = boto3.client('s3')
paginator = conn.get_paginator('list_objects_v2')
contents = [
    obj
    for page in paginator.paginate(Bucket=input_bucket, Prefix=input_subfolder)
    for obj in page.get('Contents', [])
]
for f in contents:
    print(f['Key'])

In [None]:
# Examine the contents in the 'adsb-data' s3 Bucket
# (displays the raw object-metadata entries collected by the listing cell)
contents

### 1. Convert json to a csv (this part was already done in adsb-json-to-csv-store-in-s3.ipynb)
#### Skip to Section 2 below
#########################################################################################################################

In [8]:
# Collect the key of every listed object in the 'adsb-data' bucket.
# The original author flagged that relying on the AWS listing "does not work";
# the next cell regenerates the names instead.
# Expected key format, e.g. ['2016_06_20_0002Z.json', '2016_06_20_0003Z.json']
myFileNames = list(map(lambda entry: entry['Key'], contents))

In [9]:
# Generate the file names assumed to exist in 'adsb-data': one JSON file per
# minute of 2016-06-20, named 'YYYY_MM_DD_HHMMZ.json'.
# 'min' replaces the deprecated 'T' frequency alias; strftime builds the name
# directly instead of slicing the string form of each timestamp.
dateTimes = pd.date_range('2016-06-20 00:00:00', '2016-06-20 23:59:00', freq='min')
# Kept for backward compatibility with any cell that still reads it.
dateTimesSer = pd.Series([str(dateTime) for dateTime in dateTimes], name='temp')
myFileNames = dateTimes.strftime('%Y_%m_%d_%H%MZ.json').tolist()

In [None]:
# Display the generated list of file names for a visual check.
myFileNames

In [11]:
%%time
# Obtain necessary data items from all json files in 'adsb-data' in a single dataframe

dt = []
eyedee = []
alt = []
lat = []
long = []

for myFileName in myFileNames:
    s3_obj = boto3.client('s3')
    s3_myobj = s3_obj.get_object(Bucket= input_bucket, Key= myFileName)
    s3_mydata = s3_myobj['Body'].read().decode('utf-8')
    js = json.loads(s3_mydata)
    
    dattim = datetime(year= int(myFileName[0:4]),
                      month= int(myFileName[5:7]),
                      day= int(myFileName[8:10]),
                      hour= int(myFileName[11:13]),
                      minute= int(myFileName[13:15]),
                      second= 0)
    
    for i in range(len(js['acList'])):
        dt.append(str(dattim))
        if "Id" in js['acList'][i].keys():
            eyedee.append(js['acList'][i]['Id'])
        else:
            eyedee.append(np.nan)
        if "Alt" in js['acList'][i].keys():
            alt.append(js['acList'][i]['Alt'])
        else:
            alt.append(np.nan)
        if ("Lat" in js['acList'][i].keys()):
            if type(js['acList'][i]['Lat']) == np.float:
                lat.append(js['acList'][i]['Lat'])
            else:
                lat.append(np.nan)
        else:
            lat.append(np.nan)
        if "Long" in js['acList'][i].keys():
            long.append(js['acList'][i]['Long'])
        else:
            long.append(np.nan)


CPU times: user 6min 18s, sys: 9.86 s, total: 6min 28s
Wall time: 11min 59s


In [12]:
# Combine the per-record lists into a single frame, one column per list.
df = pd.DataFrame(
    {
        "DateTime": dt,
        "id": eyedee,
        'Altitude': alt,
        'Latitude': lat,
        'Longitude': long,
    }
)

In [13]:
# Display the assembled ADS-B frame (notebook shows head and tail rows).
df

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude
0,2016-06-20 00:00:00,8721588,31000.0,,
1,2016-06-20 00:00:00,2961715,,0.000000,-2147.483648
2,2016-06-20 00:00:00,10607858,700.0,,
3,2016-06-20 00:00:00,12608926,2775.0,,
4,2016-06-20 00:00:00,7786027,300.0,1.334150,103.987221
...,...,...,...,...,...
6743149,2016-06-20 23:59:00,11401618,6950.0,,
6743150,2016-06-20 23:59:00,11401617,3407.0,,
6743151,2016-06-20 23:59:00,5271422,1000.0,45.481979,13.648616
6743152,2016-06-20 23:59:00,11401619,6752.0,,


In [60]:
# Count the missing values in each column.
df.isna().sum()

DateTime          0
id                0
Altitude      90618
Latitude     718818
Longitude    718818
dtype: int64

In [61]:
# Number of ADS-B records per timestamp, most frequent first.
records_per_minute = df['DateTime'].value_counts()
records_per_minute

2016-06-20 15:43:00    6711
2016-06-20 15:38:00    6694
2016-06-20 14:52:00    6656
2016-06-20 14:44:00    6649
2016-06-20 14:55:00    6648
                       ... 
2016-06-20 04:13:00    2668
2016-06-20 04:06:00    2667
2016-06-20 04:10:00    2643
2016-06-20 04:09:00    2629
2016-06-20 03:07:00     736
Name: DateTime, Length: 1439, dtype: int64

In [62]:
# Keep only rows with no missing value in any column (dropna defaults:
# row-wise, drop on any NaN). Per Dr. Cherry on 6/17/2021
df_select = df.dropna().copy()

In [17]:
# Display the rows that remain after dropping records with missing values.
df_select

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude
4,2016-06-20 00:00:00,7786027,300.0,1.334150,103.987221
13,2016-06-20 00:00:00,2900289,3752492.0,0.000003,0.000000
23,2016-06-20 00:00:00,4196356,2900.0,51.487190,-0.581210
24,2016-06-20 00:00:00,4494056,25450.0,51.418110,2.221830
25,2016-06-20 00:00:00,11287741,39525.0,50.827541,0.748670
...,...,...,...,...,...
6743117,2016-06-20 23:59:00,8189898,0.0,-33.870198,151.210213
6743128,2016-06-20 23:59:00,12193915,55800.0,37.812332,-120.652634
6743130,2016-06-20 23:59:00,3289137,3159599.0,0.000706,0.000000
6743133,2016-06-20 23:59:00,2894892,2894892.0,0.000000,-2147.483648


In [65]:
# Latitude/longitude bounding box of the RAP grid cell centres, widened by a
# fixed padding on each side.
# NOTE(review): latlongRAP is only loaded in Section 2 of this notebook, so on
# a fresh kernel that read must be executed before this cell.
# NOTE(review): the padding values presumably cover about half a cell at the
# grid edge - confirm with the author.
LAT_PAD_DEG = 0.04
LON_PAD_DEG = 0.2

# Series.min/max are vectorized and skip NaN; the builtin min/max iterate in
# Python and give order-dependent results when NaN is present.
minLatRAP = latlongRAP['Lat'].min() - LAT_PAD_DEG
maxLatRAP = latlongRAP['Lat'].max() + LAT_PAD_DEG
minLonRAP = latlongRAP['Lon'].min() - LON_PAD_DEG
maxLonRAP = latlongRAP['Lon'].max() + LON_PAD_DEG

In [66]:
# Keep the records whose position lies inside the padded RAP bounding box
# (both bounds inclusive).
in_lat_range = df_select['Latitude'].between(minLatRAP, maxLatRAP)
in_lon_range = df_select['Longitude'].between(minLonRAP, maxLonRAP)
adsb_conus = df_select.loc[in_lat_range & in_lon_range].copy()

In [67]:
# Renumber rows 0..n-1 and discard the old index left over from filtering.
adsb_conus = adsb_conus.reset_index(drop=True)

In [68]:
# Display the records that fall inside the RAP bounding box.
adsb_conus

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude
0,2016-06-20 00:00:00,4196601,2350.0,36.076401,-115.127663
1,2016-06-20 00:00:00,11069170,10675.0,35.138141,-80.789509
2,2016-06-20 00:00:00,10700817,7400.0,30.632210,-81.586450
3,2016-06-20 00:00:00,11281093,1575.0,40.742472,-73.840173
4,2016-06-20 00:00:00,12598340,37000.0,45.256130,-81.588492
...,...,...,...,...,...
3153946,2016-06-20 23:59:00,11226973,75.0,42.375396,-71.005932
3153947,2016-06-20 23:59:00,11165706,33975.0,39.073242,-82.903549
3153948,2016-06-20 23:59:00,11364484,13075.0,33.685525,-111.892433
3153949,2016-06-20 23:59:00,11402207,12150.0,41.788536,-87.750229


# Persist the bounding-box-filtered ADS-B records as one CSV object in S3.
output_bucket = 'partly-cloudy-common-area'
output_subfolder = 'adbs-conus-rap-merge'
adsb_conusFileName = 'adsb_conus.csv'

adsb_conus_uri = f"s3://{output_bucket}/{output_subfolder}/{adsb_conusFileName}"
wr.s3.to_csv(df=adsb_conus, path=adsb_conus_uri, index=False)

#########################################################################################################################
### 2. ADS-B Records Assigned to NOAA RAP Cells 
#########################################################################################################################

In [6]:
# Load the RAP grid table (columns Nx, Ny, Lat, Lon) from S3.
latlong_bucket = 'partly-cloudy-common-area'
latlongFileName = 'latlonRAP.csv'

latlong_uri = f"s3://{latlong_bucket}/{latlongFileName}"
latlongRAP = wr.s3.read_csv(path=latlong_uri)

In [7]:
# Display the RAP grid table (Nx, Ny grid indices with their Lat/Lon).
latlongRAP

Unnamed: 0,Nx,Ny,Lat,Lon
0,1,1,16.281000,-126.138000
1,2,1,16.322011,-125.954684
2,3,1,16.362789,-125.771252
3,4,1,16.403332,-125.587705
4,5,1,16.443642,-125.404045
...,...,...,...,...
67720,297,225,55.648911,-58.431595
67721,298,225,55.607604,-58.167947
67722,299,225,55.565986,-57.904583
67723,300,225,55.524058,-57.641507


In [8]:
# Reload the filtered ADS-B records written by Section 1 of this notebook.
output_bucket = 'partly-cloudy-common-area'
output_subfolder = 'adbs-conus-rap-merge'
adsb_conusFileName = 'adsb_conus.csv'

adsb_conus_uri = f"s3://{output_bucket}/{output_subfolder}/{adsb_conusFileName}"
adsb_conus = wr.s3.read_csv(path=adsb_conus_uri)

In [9]:
# Display the ADS-B records reloaded from S3.
adsb_conus

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude
0,2016-06-20 00:00:00,4196601,2350.0,36.076401,-115.127663
1,2016-06-20 00:00:00,11069170,10675.0,35.138141,-80.789509
2,2016-06-20 00:00:00,10700817,7400.0,30.632210,-81.586450
3,2016-06-20 00:00:00,11281093,1575.0,40.742472,-73.840173
4,2016-06-20 00:00:00,12598340,37000.0,45.256130,-81.588492
...,...,...,...,...,...
3153946,2016-06-20 23:59:00,11226973,75.0,42.375396,-71.005932
3153947,2016-06-20 23:59:00,11165706,33975.0,39.073242,-82.903549
3153948,2016-06-20 23:59:00,11364484,13075.0,33.685525,-111.892433
3153949,2016-06-20 23:59:00,11402207,12150.0,41.788536,-87.750229


In [9]:
# Row window [start_index, end_index) handled by this run.
# CHANGE both values for every batch; they also name the output file.
start_index = 3_000_000
end_index = 3_100_000

batch_rows = slice(start_index, end_index)
adsb_conus_select = adsb_conus.iloc[batch_rows].copy()

In [10]:
# Display the batch of rows selected for this run.
adsb_conus_select

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude
3000000,2016-06-20 23:09:00,11333028,37975.0,38.681604,-80.065186
3000001,2016-06-20 23:09:00,11173430,8075.0,33.451126,-117.912744
3000002,2016-06-20 23:09:00,4317360,36000.0,27.818081,-96.251590
3000003,2016-06-20 23:09:00,12597023,37000.0,50.518204,-106.656285
3000004,2016-06-20 23:09:00,10536633,27000.0,42.310584,-75.855869
...,...,...,...,...,...
3099995,2016-06-20 23:41:00,11423422,34000.0,43.292587,-120.821632
3099996,2016-06-20 23:41:00,10717701,43025.0,36.938324,-92.134737
3099997,2016-06-20 23:41:00,10912949,33025.0,38.497696,-92.380152
3099998,2016-06-20 23:41:00,10925303,33975.0,38.200378,-85.182670


In [11]:
# Obtain NOAA RAP hPa (to the nearest 25 hPa) from altitude value in feet
def alt_ft_to_nearest_hPa(alt):
    """Convert an altitude in feet to a pressure in hPa, snapped to a 25 hPa step.

    Altitudes above 51805 ft return 100 and altitudes below 364 ft return 1000.
    Anything in between is converted with the power-law formula below and
    rounded to the nearest multiple of 25.
    """
    # Clamp first: outside this altitude band the result is pinned to the
    # 100 hPa / 1000 hPa limits.
    if alt > 51805:
        return 100
    if alt < 364:
        return 1000

    exponent = 1 / 0.190284
    height_ratio = alt / 145366.45
    pressure_hpa = 1013.25 * ((1 - height_ratio) ** exponent)
    return int(round(pressure_hpa / 25) * 25)

In [12]:
# Convert every altitude of the batch to its 25 hPa-rounded pressure level
# and store the result as a new 'hPa' column.
nearest_hpa = [
    alt_ft_to_nearest_hPa(altitude_ft)
    for altitude_ft in adsb_conus_select['Altitude'].tolist()
]
adsb_conus_select['hPa'] = nearest_hpa

In [13]:
# Display the batch with the added 'hPa' column.
adsb_conus_select

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude,hPa
3000000,2016-06-20 23:09:00,11333028,37975.0,38.681604,-80.065186,200
3000001,2016-06-20 23:09:00,11173430,8075.0,33.451126,-117.912744,750
3000002,2016-06-20 23:09:00,4317360,36000.0,27.818081,-96.251590,225
3000003,2016-06-20 23:09:00,12597023,37000.0,50.518204,-106.656285,225
3000004,2016-06-20 23:09:00,10536633,27000.0,42.310584,-75.855869,350
...,...,...,...,...,...,...
3099995,2016-06-20 23:41:00,11423422,34000.0,43.292587,-120.821632,250
3099996,2016-06-20 23:41:00,10717701,43025.0,36.938324,-92.134737,150
3099997,2016-06-20 23:41:00,10912949,33025.0,38.497696,-92.380152,250
3099998,2016-06-20 23:41:00,10925303,33975.0,38.200378,-85.182670,250


In [14]:
# Build the (lat, lon) tuple lists consumed by the nearest-cell search:
# one list for the ADS-B batch, one for the RAP grid cells.
adsb_coord = list(
    adsb_conus_select[['Latitude', 'Longitude']].itertuples(index=False, name=None)
)
latlonRAP_coord = list(
    latlongRAP[['Lat', 'Lon']].itertuples(index=False, name=None)
)

In [None]:
%%time
from datetime import datetime

# for each (Latitude, Longitude) tuple in "adsb_coord" list, find the index in 'latlonRAP_coord' that has the minimum distance to (Lat, Lon) tuple 
idx = []
min_distances = []
for i in range(len(adsb_coord)):
    distances = [haversine(adsb_coord[i], latlonRAP_coord[j]) for j in range(len(latlonRAP_coord))]
    idx.append(np.argmin(distances))
    min_distances.append(min(distances))
    print(i, datetime.now().time())

In [16]:
# Look up the grid indices of the nearest RAP cell for every record
# (positional lookup by the indices found in the search) and attach them,
# together with the distance to that cell.
nearest_cells = latlongRAP.iloc[idx]

adsb_conus_select['Nx'] = nearest_cells['Nx'].to_numpy()
adsb_conus_select['Ny'] = nearest_cells['Ny'].to_numpy()
adsb_conus_select['distToCellCenter'] = min_distances

In [17]:
# Keep only records closer than 14.15 to their nearest cell centre.
# NOTE(review): distances are presumably in km (the haversine default) and
# 14.15 presumably relates to the RAP cell size - confirm with the author.
within_cell = adsb_conus_select['distToCellCenter'] < 14.15
adsb_conus_select_final = adsb_conus_select[within_cell].copy()
adsb_conus_select_final

Unnamed: 0,DateTime,id,Altitude,Latitude,Longitude,hPa,Nx,Ny,distToCellCenter
3000000,2016-06-20 23:09:00,11333028,37975.0,38.681604,-80.065186,200,231,109,8.772418
3000001,2016-06-20 23:09:00,11173430,8075.0,33.451126,-117.912744,750,60,85,8.946226
3000002,2016-06-20 23:09:00,4317360,36000.0,27.818081,-96.251590,225,159,45,9.051248
3000003,2016-06-20 23:09:00,12597023,37000.0,50.518204,-106.656285,225,120,177,6.954068
3000004,2016-06-20 23:09:00,10536633,27000.0,42.310584,-75.855869,350,246,132,1.808094
...,...,...,...,...,...,...,...,...,...
3099995,2016-06-20 23:41:00,11423422,34000.0,43.292587,-120.821632,250,57,142,4.529848
3099996,2016-06-20 23:41:00,10717701,43025.0,36.938324,-92.134737,150,178,96,3.722372
3099997,2016-06-20 23:41:00,10912949,33025.0,38.497696,-92.380152,250,177,105,10.833584
3099998,2016-06-20 23:41:00,10925303,33975.0,38.200378,-85.182670,250,208,104,11.866312


In [18]:
# Examine the files that are already in the output_bucket (i.e., "partly-cloudy-common-area").
# The listing is paginated (a single call returns at most 1000 keys) and pages
# without a 'Contents' entry contribute nothing, so an empty prefix prints
# nothing instead of raising KeyError.
conn = boto3.client('s3')
paginator = conn.get_paginator('list_objects_v2')
contents = [
    obj
    for page in paginator.paginate(Bucket=output_bucket, Prefix=output_subfolder)
    for obj in page.get('Contents', [])
]
for f in contents:
    print(f['Key'])

adbs-conus-rap-merge/
adbs-conus-rap-merge/adsb_conus_rap_0_100000.csv
adbs-conus-rap-merge/adsb_conus_rap_1000000_1100000.csv
adbs-conus-rap-merge/adsb_conus_rap_100000_200000.csv
adbs-conus-rap-merge/adsb_conus_rap_1100000_1200000.csv
adbs-conus-rap-merge/adsb_conus_rap_1200000_1300000.csv
adbs-conus-rap-merge/adsb_conus_rap_1300000_1400000.csv
adbs-conus-rap-merge/adsb_conus_rap_1400000_1500000.csv
adbs-conus-rap-merge/adsb_conus_rap_1500000_1600000.csv
adbs-conus-rap-merge/adsb_conus_rap_1600000_1700000.csv
adbs-conus-rap-merge/adsb_conus_rap_1700000_1800000.csv
adbs-conus-rap-merge/adsb_conus_rap_1800000_1900000.csv
adbs-conus-rap-merge/adsb_conus_rap_1900000_2000000.csv
adbs-conus-rap-merge/adsb_conus_rap_2000000_2100000.csv
adbs-conus-rap-merge/adsb_conus_rap_200000_300000.csv
adbs-conus-rap-merge/adsb_conus_rap_2100000_2200000.csv
adbs-conus-rap-merge/adsb_conus_rap_2200000_2300000.csv
adbs-conus-rap-merge/adsb_conus_rap_2300000_2400000.csv
adbs-conus-rap-merge/adsb_conus_rap_2

In [19]:
# Destination of this batch; the file name encodes the processed row window.
output_bucket = 'partly-cloudy-common-area'
output_subfolder = 'adbs-conus-rap-merge'

outputFileName = f"adsb_conus_rap_{start_index}_{end_index}.csv"

In [20]:
# Upload the cell-assigned records of this batch as a CSV object (no index column).
batch_uri = f"s3://{output_bucket}/{output_subfolder}/{outputFileName}"
wr.s3.to_csv(df=adsb_conus_select_final, path=batch_uri, index=False)

{'paths': ['s3://partly-cloudy-common-area/adbs-conus-rap-merge/adsb_conus_rap_3000000_3100000.csv'],
 'partitions_values': {}}

# Store the full adsb_conus frame as Parquet.
# The original call reused outputFileName, which put Parquet bytes under a key
# ending in '.csv' and named after a single batch window, although the frame
# written is the whole of adsb_conus. The key now states what the object holds.
# NOTE(review): the location is still the bucket root, as in the original call;
# confirm whether it should live under output_subfolder.
wr.s3.to_parquet(df=adsb_conus, path=f"s3://{output_bucket}/adsb_conus.parquet")