In [None]:
## Generate dummy prod input files from the test data
# Writes two CSVs under {DATA_DIR}/raw that the later pipeline cells consume:
#   prod_satellite_metadata.csv - LA / MAIAC satellite rows for a short time window
#   prod_submission_format.csv  - LA label rows for a single prediction datetime
import pandas as pd
import geopandas as gpd

DATA_DIR = 'data'

# --- Satellite metadata -----------------------------------------------------
df_satmeta = pd.read_csv(f'{DATA_DIR}/raw/pm25_satellite_metadata.csv')
# NOTE(review): time_start / time_end are compared as plain strings; this is only
# chronological if the columns hold ISO-8601 style timestamps - confirm.
in_prod_window = (
    (df_satmeta.location == 'la')
    & (df_satmeta['product'] == 'maiac')
    & (df_satmeta.time_start >= '2021-08-23')
    & (df_satmeta.time_end <= '2021-08-25')
)
# .assign returns a new frame, so 'split' is never written into a filtered slice
# of the original (the previous in-place write could raise SettingWithCopyWarning).
df_satmeta = df_satmeta.loc[in_prod_window].assign(split='prod')

# --- Labels / submission format ---------------------------------------------
df_labels = pd.read_csv(f'{DATA_DIR}/raw/submission_format.csv')
# Only the grid -> location mapping is needed; long location names are replaced
# by the short codes used in the filters below.
df_grid_meta = (
    gpd.read_file(f'{DATA_DIR}/raw/grid_metadata.csv')[['grid_id', 'location']]
    .assign(
        location=lambda meta: meta.location.map(
            {'Delhi': 'dl', 'Los Angeles (SoCAB)': 'la', 'Taipei': 'tpe'}
        )
    )
)
# pd.merge without `on` joins on the columns shared by both frames
# (presumably just grid_id - verify against submission_format.csv).
df_labels = (
    pd.merge(df_grid_meta, df_labels)
    .sort_values(by=['datetime', 'grid_id'])
    .reset_index(drop=True)
)
# Keep one LA timestamp, then drop the helper column used only for filtering.
is_prod_label = (df_labels.location == 'la') & (df_labels.datetime == '2021-08-24T08:00:00Z')
df_labels = df_labels.loc[is_prod_label].drop(columns='location')

# --- Persist ----------------------------------------------------------------
df_satmeta.to_csv(f'{DATA_DIR}/raw/prod_satellite_metadata.csv', index=False)
df_labels.to_csv(f'{DATA_DIR}/raw/prod_submission_format.csv', index=False)

# Preprocessing

In [None]:
##Extract NASADEM elevation data (Rerun required only if grid_metadata changes)
# Runs the project's extract_nasadem.py script with `data` as its data directory.
# The recorded run below reports its result as data/interim/elevation.csv.
!python src/data/extract_nasadem.py --data_dir data


  da = (xr.open_rasterio(signed_asset.href))
100% 54/54 [00:59<00:00,  1.10s/it]
Saved data to data/interim/elevation.csv


In [None]:
##Extract MAIAC data from aws
# Input: data/raw/prod_satellite_metadata.csv, written by the first cell of this notebook.
# The recorded run below reports its result as data/interim/maiac/maiac_prod.csv.
# NOTE(review): the recorded output says "using 4 processes" although --n_jobs 16 is
# passed here - presumably the script caps the worker count; confirm in extract_maiac.py.
!python src/data/extract_maiac.py --data_dir data --path_satmeta data/raw/prod_satellite_metadata.csv --stage prod --s3url us_url --cleanup --n_jobs 16

using 4 processes
QUEUEING TASKS | : 100% 1/1 [00:00<00:00, 43.40it/s]
PROCESSING TASKS | : 100% 1/1 [00:04<00:00,  4.95s/it]
COLLECTING RESULTS | : 100% 1/1 [00:00<00:00, 19239.93it/s]
Saved data to data/interim/maiac/maiac_prod.csv


In [None]:
##Extract GFS data
# Input labels: data/raw/prod_submission_format.csv, written by the first cell of this notebook.
# Per the recorded output, archives already present under data/interim/tmp/gfs/prod/
# are reused ("Found existing file ... Skipping request") instead of being requested again.
!python src/data/extract_gfs.py --data_dir data --stage prod --labels data/raw/prod_submission_format.csv --n_jobs 16 

{
   "status": "ok",
   "request_duration": "0.07719 seconds",
   "code": 200,
   "messages": [],
   "result": {
      "subsetting_available": true,
      "data": [
         {
            "request_type": "T",
            "group_index": 0
         }
      ]
   },
   "request_end": "2022-04-05T09:25:52.457296",
   "request_start": "2022-04-05T09:25:52.380106"
}
Location  la  data request range:  202108210000/to/202108260000
Found existing file data/interim/tmp/gfs/prod/param_group0/la//202108210000_202108260000.tar. Skipping request
Found existing file data/interim/tmp/gfs/prod/param_group1/la//202108210000_202108260000.tar. Skipping request
Found existing file data/interim/tmp/gfs/prod/param_group2/la//202108210000_202108260000.tar. Skipping request
Found existing file data/interim/tmp/gfs/prod/param_group3/la//202108210000_202108260000.tar. Skipping request
Found existing file data/interim/tmp/gfs/prod/param_group4/la//202108210000_202108260000.tar. Skipping request
Found existing file

In [None]:
##Create final datasets
!python src/data/create_dataset.py --data_dir data --stage prod --labels data/raw/prod_submission_format.csv --n_gfs 8 --n_jobs 4
!python src/data/create_dataset.py --data_dir data --stage prod --labels data/raw/prod_submission_format.csv --n_gfs 10 --n_jobs 4
!python src/data/create_dataset.py --data_dir data --stage prod --labels data/raw/prod_submission_format.csv --n_gfs 12 --n_jobs 4

QUEUEING TASKS | : 100% 12/12 [00:00<00:00, 583.98it/s]
PROCESSING TASKS | : 100% 12/12 [00:00<00:00, 23.22it/s]
COLLECTING RESULTS | : 100% 12/12 [00:00<00:00, 87533.30it/s]
Saved data to data/processed/prod/prod_tail8.pkl
QUEUEING TASKS | : 100% 12/12 [00:00<00:00, 673.55it/s]
PROCESSING TASKS | : 100% 12/12 [00:00<00:00, 25.94it/s]
COLLECTING RESULTS | : 100% 12/12 [00:00<00:00, 142987.64it/s]
Saved data to data/processed/prod/prod_tail10.pkl
QUEUEING TASKS | : 100% 12/12 [00:00<00:00, 867.76it/s]
PROCESSING TASKS | : 100% 12/12 [00:00<00:00, 26.20it/s]
COLLECTING RESULTS | : 100% 12/12 [00:00<00:00, 154391.56it/s]
Saved data to data/processed/prod/prod_tail12.pkl


# Inference

In [None]:
##Generate predictions 
# Runs src/inference.py for the prod stage using the label template written by the
# first cell; predictions go to prod_predictions.csv (--output_path).
# The recorded output shows one pass per dataset (8, 10, 12), each reporting folds 0-4.
!python src/inference.py --data_dir data --model_dir models --stage prod --subformat_path data/raw/prod_submission_format.csv --output_path prod_predictions.csv 

##### RUNNING INFERENCE FOR DATASET 8 ##########
location 1, fold: 0, samples:12 11.838442911168055,19.633232122192965
location 1, fold: 1, samples:12 6.493850004545958,14.132023713528994
location 1, fold: 2, samples:12 9.248896598347619,16.972188005167038
location 1, fold: 3, samples:12 9.101350675409368,15.209078867888177
location 1, fold: 4, samples:12 7.89706217376716,16.23312452333085
##### RUNNING INFERENCE FOR DATASET 10 ##########
location 1, fold: 0, samples:12 8.566777642541842,15.181989075772378
location 1, fold: 1, samples:12 7.680630757364389,13.11749017961876
location 1, fold: 2, samples:12 10.006882997113562,15.74614888593055
location 1, fold: 3, samples:12 5.321577336537858,15.908995268203228
location 1, fold: 4, samples:12 6.059830847524582,18.70360814176277
##### RUNNING INFERENCE FOR DATASET 12 ##########
location 1, fold: 0, samples:12 7.256000973480645,14.766788765714763
location 1, fold: 1, samples:12 5.835140827591317,15.115718220219826
location 1, fold: 2, sampl

In [None]:
## Load the predictions written by the inference cell (--output_path prod_predictions.csv)
pred = pd.read_csv(filepath_or_buffer='prod_predictions.csv')
pred

Unnamed: 0,datetime,grid_id,value
0,2021-08-24T08:00:00Z,3S31A,11.528815
1,2021-08-24T08:00:00Z,A2FBI,13.584993
2,2021-08-24T08:00:00Z,DHO4M,15.744629
3,2021-08-24T08:00:00Z,DJN0F,12.582507
4,2021-08-24T08:00:00Z,E5P9N,14.168869
5,2021-08-24T08:00:00Z,H96P6,13.643406
6,2021-08-24T08:00:00Z,PG3MI,13.036712
7,2021-08-24T08:00:00Z,QJHW4,11.640799
8,2021-08-24T08:00:00Z,VBLD0,11.69637
9,2021-08-24T08:00:00Z,WT52R,11.859576


In [None]:
## Compare with the previously generated test submission file
# Keep only the rows for the single datetime used in the prod run, and renumber
# the index from 0 so the frame lines up row-for-row with `pred`.
sub = (
    pd.read_csv('Submission.csv')
    .loc[lambda frame: frame.datetime == '2021-08-24T08:00:00Z']
    .reset_index(drop=True)
)
sub

Unnamed: 0,datetime,grid_id,value
0,2021-08-24T08:00:00Z,3S31A,11.528815
1,2021-08-24T08:00:00Z,A2FBI,13.584993
2,2021-08-24T08:00:00Z,DHO4M,15.744629
3,2021-08-24T08:00:00Z,DJN0F,12.582507
4,2021-08-24T08:00:00Z,E5P9N,14.168869
5,2021-08-24T08:00:00Z,H96P6,13.643406
6,2021-08-24T08:00:00Z,PG3MI,13.036712
7,2021-08-24T08:00:00Z,QJHW4,11.640799
8,2021-08-24T08:00:00Z,VBLD0,11.69637
9,2021-08-24T08:00:00Z,WT52R,11.859576


In [None]:
## Check that the prod pipeline reproduces the test submission
# Element-wise comparison reduced per column; `==` requires both frames to be
# identically labelled, so a shape/index mismatch raises instead of passing silently.
columns_match = (sub == pred).all()
# Fail loudly under "Restart & Run All" rather than relying on a reader to spot a False.
assert columns_match.all(), f'prod predictions differ from Submission.csv: {columns_match.to_dict()}'
columns_match

datetime    True
grid_id     True
value       True
dtype: bool