## Processing train maiac data

In [None]:
from collections import defaultdict
import datetime
import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio as rio
import rioxarray as rxr
from rioxarray.exceptions import NoDataInBounds
from tqdm import tqdm
import xarray


# Runtime setup for the train split: enable the garbage collector and silence
# all warnings for the rest of the session.
gc.enable()
warnings.simplefilter('ignore')


# Where the raw granules live and which product/split this section handles.
DATA_DIR = '../data/raw/data'
PRODUCT = 'maiac'
SPLIT = 'train'
# Maps the location names used in the grid metadata to the short codes used
# in the satellite metadata.
LOCATION_MAP = {
    'Taipei': 'tpe',
    'Delhi': 'dl',
    'Los Angeles (SoCAB)': 'la',
}

train_data = pd.read_csv('../data/train_labels.csv')
grid_data = pd.read_csv('../data/grid_metadata.csv')
# Keep only the satellite metadata rows for this split and product.
satellite_data = pd.read_csv('../data/pm25_satellite_metadata.csv')
satellite_data = satellite_data[
    (satellite_data['split'] == SPLIT) & (satellite_data['product'] == PRODUCT)
]

# Bands extracted from every clipped granule.
REQUIRED_BANDS_0 = [
    'Optical_Depth_047',
    'Optical_Depth_055',
    'AOD_Uncertainty',
    'FineModeFraction',
    'Column_WV',
    'AOD_QA',
    'AOD_MODEL',
    'Injection_Height',
]
SAVE_DIR = "../data/raw/proc_data"

# Create the output directory on first run.
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
# One positional index per row of the label table.
indices = list(range(len(train_data)))
print("Processing %d instances..." % len(indices))


def _save_missing(idx):
    """Write a NaN placeholder archive for sample ``idx``.

    Every band in ``REQUIRED_BANDS_0`` and the label are stored as NaN, so
    the sample still gets its ``{idx}.npz`` file in ``SAVE_DIR`` even when
    no satellite observation is available for it.
    """
    assets = {band: np.nan for band in REQUIRED_BANDS_0}
    assets['label'] = np.nan
    np.savez_compressed(os.path.join(SAVE_DIR, f"{idx}.npz"), **assets)


def project_and_save(cur_satdata, el, geometry, idx):
    """Clip the latest usable granule to a grid cell and save its bands.

    Candidate granules are the rows of ``cur_satdata`` whose ``time_end`` is
    strictly before the sample's ``datetime``. They are tried from most
    recent to oldest, and the first one whose clip contains data is used.
    The result is written to ``SAVE_DIR/{idx}.npz`` with one array per band
    in ``REQUIRED_BANDS_0`` plus ``filename`` and ``label`` entries.

    If there is no candidate granule, or none of them has data inside the
    cell, a NaN placeholder archive is written instead.

    Args:
        cur_satdata: DataFrame of satellite metadata with ``time_end``
            (string, ``%Y-%m-%d %H:%M:%S+00:00``) and ``granule_id``
            columns. It is not modified.
        el: Row of the label table providing ``datetime``
            (``%Y-%m-%dT%H:%M:%SZ``) and ``value``.
        geometry: List of coordinate pairs used as the single ring of a
            GeoJSON polygon; passed to the clip with ``crs=4326``.
        idx: Sample index, used as the output file name.
    """
    dt = datetime.datetime.strptime(el['datetime'], "%Y-%m-%dT%H:%M:%SZ")

    # Parse the timestamps into a new frame instead of writing back into the
    # caller's DataFrame: the argument is a slice of the shared metadata
    # table and must keep its original string column.
    time_end = cur_satdata['time_end'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00')
    )
    possible = cur_satdata.assign(time_end=time_end)
    possible = possible[possible['time_end'] < dt].sort_values('time_end', ascending=False)
    if len(possible) == 0:
        _save_missing(idx)
        return

    # The clip polygon is the same for every candidate granule.
    geometries = [{'type': 'Polygon', 'coordinates': [geometry]}]

    assets = {}
    for granule_id in possible['granule_id']:
        # Granules are stored under a sub-directory named after the first
        # four characters of the granule id.
        filename = os.path.join(DATA_DIR, SPLIT, PRODUCT, granule_id[:4], granule_id)
        data = rxr.open_rasterio(filename, masked=True)
        try:
            # NOTE(review): data[0] is presumably the first subdataset of
            # the granule - confirm against the file layout.
            clipped_0 = data[0].rio.clip(geometries, crs=4326)
        except NoDataInBounds:
            # This granule does not cover the cell; try the next older one.
            continue
        assets['filename'] = filename
        break
    else:
        # No candidate covered the cell. Previously execution fell through
        # and failed on the unbound ``clipped_0``; store a placeholder.
        print(f"No data for {dt}, index: {idx}")
        _save_missing(idx)
        return

    for band in REQUIRED_BANDS_0:
        assets[band] = np.array(clipped_0[band].as_numpy())

    assets['label'] = el['value']
    np.savez_compressed(os.path.join(SAVE_DIR, f"{idx}.npz"), **assets)


# Export one .npz archive per labelled sample: look up the sample's grid cell,
# select the satellite metadata of the matching location, turn the cell's WKT
# string into a list of coordinate pairs and hand everything to
# project_and_save.
for idx in tqdm(indices):
    el = train_data.iloc[idx]
    grid_id = el['grid_id']

    # Metadata row(s) of this grid cell; the first match is used.
    grid_rows = grid_data[grid_data['grid_id'] == grid_id]
    location = grid_rows['location'].values[0]
    cur_satdata = satellite_data[satellite_data['location'] == LOCATION_MAP[location]]

    # Strip parentheses and commas from the WKT text, drop the leading token
    # (presumably the geometry type keyword) and read the rest as floats.
    wkt = grid_rows['wkt'].values[0]
    tokens = wkt.translate(str.maketrans('', '', '(),')).split()
    coords = [float(token) for token in tokens[1:]]
    # Group the flat number list into consecutive two-element pairs.
    geometry = [coords[i:i + 2] for i in range(0, len(coords), 2)]

    project_and_save(cur_satdata, el, geometry, idx)

In [None]:
def _load_features(path_dir: str, total: int) -> pd.DataFrame:
    """
    Load per-sample band statistics from .npz files.

    For every index in ``range(total)`` the archive ``{idx}.npz`` in
    ``path_dir`` is read and each stored array, except the ``filename`` and
    ``label`` entries, is reduced to the mean and variance of its non-NaN
    values.

    Args:
        path_dir: Directory containing the ``{idx}.npz`` archives.
        total: Number of samples; the result has exactly this many rows.

    Returns:
        DataFrame with ``<key>_mean`` and ``<key>_var`` columns, where row
        ``i`` belongs to sample ``i``. A sample whose archive is missing, or
        a band with no valid value, yields NaN.
    """
    rows = []
    for idx in tqdm(range(total)):
        filename = os.path.join(path_dir, f"{idx}.npz")
        # A missing archive keeps an empty row at its own position so that
        # the output stays aligned with the sample indices (padding at the
        # end would shift every later sample).
        row = {}
        if os.path.exists(filename):
            # Context manager closes the underlying zip file handle.
            with np.load(filename) as data:
                for key in data.keys():
                    if key in ('filename', 'label'):
                        continue
                    band = data[key].ravel()
                    band = band[~np.isnan(band)]  # drop NaN values
                    if band.size:
                        row[key + '_mean'] = band.mean()
                        row[key + '_var'] = band.var()
                    else:
                        # Nothing valid to aggregate (e.g. placeholder file).
                        row[key + '_mean'] = np.nan
                        row[key + '_var'] = np.nan
        rows.append(row)
    # Empty dicts become all-NaN rows; columns keep first-seen order.
    return pd.DataFrame(rows)


# Aggregate the saved .npz archives in SAVE_DIR into per-band mean/variance
# features, covering len(train_data) sample indices.
train_maiac = _load_features(SAVE_DIR, len(train_data))

In [None]:
# For every sample, use the features of the next later sample of the same
# grid cell when that sample is at most one day later; otherwise keep the
# sample's own features.
# NOTE(review): the original intent was described as "replace data with data
# before midnight of current day if it exists" - confirm this matches.
features = train_maiac.columns.tolist()
# The feature table only holds band statistics, so the timestamps are taken
# from the label table, whose rows share the same positional index.
train_maiac['datetime'] = train_data['datetime'].apply(
    lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")
)
train_maiac['grid_id'] = train_data['grid_id']

new_data = defaultdict(list)
for idx in tqdm(range(train_maiac.shape[0])):
    el = train_maiac.iloc[idx]
    dt = el['datetime']
    # Later samples of the same grid cell, earliest first.
    cur_data = train_maiac[
        (train_maiac['grid_id'] == el['grid_id']) & (train_maiac['datetime'] > dt)
    ].sort_values('datetime', ascending=True)
    source = el
    if len(cur_data) > 0:
        candidate = cur_data.iloc[0]
        if candidate['datetime'] - dt <= datetime.timedelta(days=1):
            source = candidate
    for k in features:
        new_data[k].append(source[k])

# Only the feature columns are kept; the helper columns are dropped.
train_maiac = pd.DataFrame(new_data)
# Make sure the output directory exists before writing.
os.makedirs('../data/proc', exist_ok=True)
train_maiac.to_csv('../data/proc/train_maiac.csv', index=False)

## Processing test maiac data

In [None]:
from collections import defaultdict
import datetime
import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio as rio
import rioxarray as rxr
from rioxarray.exceptions import NoDataInBounds
from tqdm import tqdm
import xarray


# Runtime setup for the test split: enable the garbage collector and silence
# all warnings for the rest of the session.
gc.enable()
warnings.simplefilter('ignore')


# Where the raw granules live and which product/split this section handles.
DATA_DIR = '../data/raw/data'
PRODUCT = 'maiac'
SPLIT = 'test'
# Maps the location names used in the grid metadata to the short codes used
# in the satellite metadata.
LOCATION_MAP = {
    'Taipei': 'tpe',
    'Delhi': 'dl',
    'Los Angeles (SoCAB)': 'la',
}

# The variable keeps the name used by the rest of this section, but here it
# holds the submission-format table.
train_data = pd.read_csv('../data/submission_format.csv')
grid_data = pd.read_csv('../data/grid_metadata.csv')
# Keep only the satellite metadata rows for this split and product.
satellite_data = pd.read_csv('../data/pm25_satellite_metadata.csv')
satellite_data = satellite_data[
    (satellite_data['split'] == SPLIT) & (satellite_data['product'] == PRODUCT)
]

# Bands extracted from every clipped granule.
REQUIRED_BANDS_0 = [
    'Optical_Depth_047',
    'Optical_Depth_055',
    'AOD_Uncertainty',
    'FineModeFraction',
    'Column_WV',
    'AOD_QA',
    'AOD_MODEL',
    'Injection_Height',
]
SAVE_DIR = "../data/raw/proc_data_test"

# Create the output directory on first run.
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
# One positional index per row of the submission table.
indices = list(range(len(train_data)))
print("Processing %d instances..." % len(indices))


def _save_missing(idx):
    """Write a NaN placeholder archive for sample ``idx``.

    Every band in ``REQUIRED_BANDS_0`` and the label are stored as NaN, so
    the sample still gets its ``{idx}.npz`` file in ``SAVE_DIR`` even when
    no satellite observation is available for it.
    """
    assets = {band: np.nan for band in REQUIRED_BANDS_0}
    assets['label'] = np.nan
    np.savez_compressed(os.path.join(SAVE_DIR, f"{idx}.npz"), **assets)


def project_and_save(cur_satdata, el, geometry, idx):
    """Clip the latest usable granule to a grid cell and save its bands.

    Candidate granules are the rows of ``cur_satdata`` whose ``time_end`` is
    strictly before the sample's ``datetime``. They are tried from most
    recent to oldest, and the first one whose clip contains data is used.
    The result is written to ``SAVE_DIR/{idx}.npz`` with one array per band
    in ``REQUIRED_BANDS_0`` plus ``filename`` and ``label`` entries.

    If there is no candidate granule, or none of them has data inside the
    cell, a NaN placeholder archive is written instead.

    Args:
        cur_satdata: DataFrame of satellite metadata with ``time_end``
            (string, ``%Y-%m-%d %H:%M:%S+00:00``) and ``granule_id``
            columns. It is not modified.
        el: Row of the sample table providing ``datetime``
            (``%Y-%m-%dT%H:%M:%SZ``) and ``value``.
        geometry: List of coordinate pairs used as the single ring of a
            GeoJSON polygon; passed to the clip with ``crs=4326``.
        idx: Sample index, used as the output file name.
    """
    dt = datetime.datetime.strptime(el['datetime'], "%Y-%m-%dT%H:%M:%SZ")

    # Parse the timestamps into a new frame instead of writing back into the
    # caller's DataFrame: the argument is a slice of the shared metadata
    # table and must keep its original string column.
    time_end = cur_satdata['time_end'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00')
    )
    possible = cur_satdata.assign(time_end=time_end)
    possible = possible[possible['time_end'] < dt].sort_values('time_end', ascending=False)
    if len(possible) == 0:
        _save_missing(idx)
        return

    # The clip polygon is the same for every candidate granule.
    geometries = [{'type': 'Polygon', 'coordinates': [geometry]}]

    assets = {}
    for granule_id in possible['granule_id']:
        # Granules are stored under a sub-directory named after the first
        # four characters of the granule id.
        filename = os.path.join(DATA_DIR, SPLIT, PRODUCT, granule_id[:4], granule_id)
        data = rxr.open_rasterio(filename, masked=True)
        try:
            # NOTE(review): data[0] is presumably the first subdataset of
            # the granule - confirm against the file layout.
            clipped_0 = data[0].rio.clip(geometries, crs=4326)
        except NoDataInBounds:
            # This granule does not cover the cell; try the next older one.
            continue
        assets['filename'] = filename
        break
    else:
        # No candidate covered the cell. Previously execution fell through
        # and failed on the unbound ``clipped_0``; store a placeholder.
        print(f"No data for {dt}, index: {idx}")
        _save_missing(idx)
        return

    for band in REQUIRED_BANDS_0:
        assets[band] = np.array(clipped_0[band].as_numpy())

    assets['label'] = el['value']
    np.savez_compressed(os.path.join(SAVE_DIR, f"{idx}.npz"), **assets)


# Export one .npz archive per submission row: look up the row's grid cell,
# select the satellite metadata of the matching location, turn the cell's WKT
# string into a list of coordinate pairs and hand everything to
# project_and_save.
for idx in tqdm(indices):
    el = train_data.iloc[idx]
    grid_id = el['grid_id']

    # Metadata row(s) of this grid cell; the first match is used.
    grid_rows = grid_data[grid_data['grid_id'] == grid_id]
    location = grid_rows['location'].values[0]
    cur_satdata = satellite_data[satellite_data['location'] == LOCATION_MAP[location]]

    # Strip parentheses and commas from the WKT text, drop the leading token
    # (presumably the geometry type keyword) and read the rest as floats.
    wkt = grid_rows['wkt'].values[0]
    tokens = wkt.translate(str.maketrans('', '', '(),')).split()
    coords = [float(token) for token in tokens[1:]]
    # Group the flat number list into consecutive two-element pairs.
    geometry = [coords[i:i + 2] for i in range(0, len(coords), 2)]

    project_and_save(cur_satdata, el, geometry, idx)

In [None]:
def _load_features(path_dir: str, total: int) -> pd.DataFrame:
    """
    Load per-sample band statistics from .npz files.

    For every index in ``range(total)`` the archive ``{idx}.npz`` in
    ``path_dir`` is read and each stored array, except the ``filename`` and
    ``label`` entries, is reduced to the mean and variance of its non-NaN
    values.

    Args:
        path_dir: Directory containing the ``{idx}.npz`` archives.
        total: Number of samples; the result has exactly this many rows.

    Returns:
        DataFrame with ``<key>_mean`` and ``<key>_var`` columns, where row
        ``i`` belongs to sample ``i``. A sample whose archive is missing, or
        a band with no valid value, yields NaN.
    """
    rows = []
    for idx in tqdm(range(total)):
        filename = os.path.join(path_dir, f"{idx}.npz")
        # A missing archive keeps an empty row at its own position so that
        # the output stays aligned with the sample indices (padding at the
        # end would shift every later sample).
        row = {}
        if os.path.exists(filename):
            # Context manager closes the underlying zip file handle.
            with np.load(filename) as data:
                for key in data.keys():
                    if key in ('filename', 'label'):
                        continue
                    band = data[key].ravel()
                    band = band[~np.isnan(band)]  # drop NaN values
                    if band.size:
                        row[key + '_mean'] = band.mean()
                        row[key + '_var'] = band.var()
                    else:
                        # Nothing valid to aggregate (e.g. placeholder file).
                        row[key + '_mean'] = np.nan
                        row[key + '_var'] = np.nan
        rows.append(row)
    # Empty dicts become all-NaN rows; columns keep first-seen order.
    return pd.DataFrame(rows)


# Aggregate the saved .npz archives in SAVE_DIR into per-band mean/variance
# features, covering len(train_data) sample indices.
test_maiac = _load_features(SAVE_DIR, len(train_data))

In [None]:
# For every sample, use the features of the next later sample of the same
# grid cell when that sample is at most one day later; otherwise keep the
# sample's own features.
# NOTE(review): the original intent was described as "replace data with data
# before midnight of current day if it exists" - confirm this matches.
features = test_maiac.columns.tolist()
# The feature table only holds band statistics, so the timestamps are taken
# from the sample table (`train_data`), whose rows share the same positional
# index.
test_maiac['datetime'] = train_data['datetime'].apply(
    lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")
)
test_maiac['grid_id'] = train_data['grid_id']

new_data = defaultdict(list)
for idx in tqdm(range(test_maiac.shape[0])):
    el = test_maiac.iloc[idx]
    dt = el['datetime']
    # Later samples of the same grid cell, earliest first.
    cur_data = test_maiac[
        (test_maiac['grid_id'] == el['grid_id']) & (test_maiac['datetime'] > dt)
    ].sort_values('datetime', ascending=True)
    source = el
    if len(cur_data) > 0:
        candidate = cur_data.iloc[0]
        if candidate['datetime'] - dt <= datetime.timedelta(days=1):
            source = candidate
    for k in features:
        new_data[k].append(source[k])

# Only the feature columns are kept; the helper columns are dropped.
test_maiac = pd.DataFrame(new_data)
# Make sure the output directory exists before writing.
os.makedirs('../data/proc', exist_ok=True)
test_maiac.to_csv('../data/proc/test_maiac.csv', index=False)