## Compressing MISR data

In [1]:
import os
import netCDF4
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import datetime
import rasterio as rio
import rioxarray as rxr
import gc;gc.enable()
from collections import defaultdict
from tqdm import tqdm
warnings.simplefilter('ignore')

# EDIT: root directory of the raw granule files read by the conversion loop.
DATA_DIR = '../../../bucket/pm25/'
PRODUCT = 'misr'
LOCATION_MAP = {
    'Taipei': 'tpe',
    'Delhi': 'dl',
    'Los Angeles (SoCAB)': 'la',
}

# Keep only the metadata rows that belong to the product handled here.
satellite_data = (
    pd.read_csv('../data/pm25_satellite_metadata.csv')
    .loc[lambda frame: frame['product'] == PRODUCT]
)
satellite_data.shape

ModuleNotFoundError: No module named 'matplotlib'

In [4]:
# Variables copied out of every granule into the compressed .npz archive.
REQUIRED_BANDS = [
    'Latitude',
    'Longitude',
    'Aerosol_Optical_Depth',
    'Aerosol_Optical_Depth_Uncertainty',
    'Angstrom_Exponent_550_860nm',
    'Spectral_AOD_Scaling_Coeff',
    'Absorption_Aerosol_Optical_Depth',
    'Nonspherical_Aerosol_Optical_Depth',
    'Small_Mode_Aerosol_Optical_Depth',
    'Medium_Mode_Aerosol_Optical_Depth',
    'Large_Mode_Aerosol_Optical_Depth',
]
def fillna(data, col):
    """Return ``data[col]`` as an array with fill values replaced by NaN.

    Parameters
    ----------
    data : mapping
        Mapping from variable name to a variable object that converts to an
        array via ``np.array`` and may expose a ``_FillValue`` attribute.
    col : str
        Name of the variable to extract.

    Returns
    -------
    numpy.ndarray
        A new array. Floating-point inputs keep their dtype; any other dtype
        is promoted to ``float64`` so that NaN can be stored.
    """
    variable = data[col]
    values = np.array(variable)
    # NaN cannot be written into an integer array, so promote when needed.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)
    # Variables without a declared fill value are returned unchanged.
    fillvalue = getattr(variable, '_FillValue', None)
    if fillvalue is not None:
        values[values == fillvalue] = np.nan
    return values

In [None]:
SAVE_DIR = '../data/raw/proc_misr'
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(SAVE_DIR, exist_ok=True)

def load_and_save(filename, name):
    """Convert one netCDF granule into a compressed ``.npz`` archive.

    Reads every variable listed in ``REQUIRED_BANDS`` from the
    ``4.4_KM_PRODUCTS`` group, replaces fill values with NaN via ``fillna``
    and writes the arrays to ``SAVE_DIR``.

    Parameters
    ----------
    filename : str
        Path of the netCDF file to read.
    name : str
        Granule file name; its last three characters are dropped to build the
        output name ``<name[:-3]>.npz``.
    """
    # The context manager closes the dataset even if a band is missing, so the
    # long conversion loop does not accumulate open file handles.
    with netCDF4.Dataset(filename, mode='r') as dataset:
        variables = dataset.groups['4.4_KM_PRODUCTS'].variables
        assets = {band: fillna(variables, band) for band in REQUIRED_BANDS}
    np.savez_compressed(os.path.join(SAVE_DIR, f"{name[:-3]}.npz"), **assets)

# Convert every granule listed in the filtered metadata table.
splits = satellite_data['split'].values
granule_names = satellite_data['granule_id'].values
for split, name in tqdm(zip(splits, granule_names), total=len(granule_names)):
    # Raw files live under <DATA_DIR>/<split>/<PRODUCT>/<first 4 chars of name>/.
    filename = os.path.join(DATA_DIR, split, PRODUCT, name[:4], name)
    load_and_save(filename, name)

 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 815/1017 [1:59:20<34:13, 10.17s/it]

In [7]:
def floor(x, n=0):
    """Round ``x`` down to ``n`` decimal places (element-wise for arrays)."""
    scale = 10**n
    return np.floor(x * scale) / scale

def ceil(x, n=0):
    """Round ``x`` up to ``n`` decimal places (element-wise for arrays)."""
    scale = 10**n
    return np.ceil(x * scale) / scale

def get_bounds(geometry):
    """Return the bounding box of a coordinate list, widened to one decimal.

    Parameters
    ----------
    geometry : sequence of [x, y] pairs
        Column 0 is used for the ``long`` bounds and column 1 for ``lat``.

    Returns
    -------
    (long, lat) : tuple of two-element lists
        Each list is ``[minimum rounded down, maximum rounded up]`` using one
        decimal place.
    """
    coords = np.array(geometry)
    xs, ys = coords[:, 0], coords[:, 1]
    long = [floor(xs.min(), 1), ceil(xs.max(), 1)]
    lat = [floor(ys.min(), 1), ceil(ys.max(), 1)]
    return long, lat

def mask(filename, geometry):
    """Return the pixels of one converted granule inside a bounding box.

    Parameters
    ----------
    filename : str
        Path of an ``.npz`` archive containing ``Latitude``, ``Longitude`` and
        every name in ``REQUIRED_BANDS``.
    geometry : sequence of [longitude, latitude] pairs
        Outline whose rounded bounding box (see ``get_bounds``) selects pixels.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``REQUIRED_BANDS``, one row per selected pixel.

    Notes
    -----
    The filter is the rectangular bounding box only; no exact
    point-in-polygon test is applied.
    """
    # Read only the arrays that are used, and close the archive right away.
    needed = list(dict.fromkeys(['Latitude', 'Longitude', *REQUIRED_BANDS]))
    with np.load(filename) as data:
        assets = {key: data[key].ravel() for key in needed}

    longb, latb = get_bounds(geometry)
    latitude = assets['Latitude']
    longitude = assets['Longitude']
    # Comparisons against NaN are False, so pixels without coordinates drop out.
    indices = (
        (latitude >= latb[0]) & (latitude <= latb[1])
        & (longitude >= longb[0]) & (longitude <= longb[1])
    )
    return pd.DataFrame({band: assets[band][indices] for band in REQUIRED_BANDS})

## Processing train data

In [8]:
import os
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import datetime
import gc;gc.enable()
from collections import defaultdict
from shapely.geometry import Point, Polygon, box
from tqdm import tqdm
warnings.simplefilter('ignore')
from pathlib import Path

# Configuration and input tables for the train split.
DATA_DIR = Path('../data/raw/proc_misr')
PRODUCT = 'misr'
SPLIT = 'train'
LOCATION_MAP = {
    'Taipei': 'tpe',
    'Delhi': 'dl',
    'Los Angeles (SoCAB)': 'la',
}

train_data = pd.read_csv('../data/train_labels.csv')
# Metadata restricted to this split and product, with time_end parsed into
# naive datetime objects (the literal "+00:00" suffix is part of the format).
satellite_data = (
    pd.read_csv('../data/pm25_satellite_metadata.csv')
    .loc[lambda frame: frame['split'] == SPLIT]
    .loc[lambda frame: frame['product'] == PRODUCT]
    .assign(
        time_end=lambda frame: frame['time_end'].apply(
            lambda stamp: datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S+00:00')
        )
    )
)
grid_data = pd.read_csv('../data/grid_metadata.csv')

In [9]:
from collections import defaultdict

# Accumulator for the per-granule, per-grid statistics.
total_data = defaultdict(list)
REQUIRED_BANDS = [
    'Latitude',
    'Longitude',
    'Aerosol_Optical_Depth',
    'Aerosol_Optical_Depth_Uncertainty',
    'Angstrom_Exponent_550_860nm',
    'Absorption_Aerosol_Optical_Depth',
    'Nonspherical_Aerosol_Optical_Depth',
    'Small_Mode_Aerosol_Optical_Depth',
    'Medium_Mode_Aerosol_Optical_Depth',
    'Large_Mode_Aerosol_Optical_Depth',
]
LOCATION_MAP = {'Taipei': 'tpe', 'Delhi': 'dl', 'Los Angeles (SoCAB)': 'la'}

# Short location code -> list of grid ids at that location.
LOC_GRIDS = {}
for location, short_code in LOCATION_MAP.items():
    at_location = grid_data['location'] == location
    LOC_GRIDS[short_code] = grid_data.loc[at_location, 'grid_id'].values.tolist()

# Grid id -> list of [x, y] pairs parsed from the first WKT string of that grid.
GRID_GEOMETRY = {}
for grid in grid_data['grid_id'].unique():
    wkt = grid_data.loc[grid_data['grid_id'] == grid, 'wkt'].values[0]
    # Strip the punctuation so only the leading tag and the numbers remain.
    for symbol in '(),':
        wkt = wkt.replace(symbol, '')
    numbers = [float(token) for token in wkt.split()[1:]]
    geometry = [numbers[i:i+2] for i in range(0, len(numbers), 2)]
    GRID_GEOMETRY[grid] = geometry

# DATA_DIR is rebound to a plain string here.
DATA_DIR = '../data/raw/proc_misr'

def box_mask(filename, geometries):
    """Load one converted granule and keep pixels inside any grid bounding box.

    Parameters
    ----------
    filename : str
        Path of an ``.npz`` archive containing ``Latitude``, ``Longitude`` and
        every name in ``REQUIRED_BANDS``.
    geometries : iterable of coordinate lists
        Each entry is passed to ``get_bounds``; a pixel is kept when it lies
        inside at least one of the resulting boxes.

    Returns
    -------
    dict
        ``{band: 1-D array}`` for each band in ``REQUIRED_BANDS``, restricted
        to the selected pixels.
    """
    # Read only what is needed and close the archive immediately; arrays that
    # are stored in the file but not requested are never decompressed.
    needed = list(dict.fromkeys(['Latitude', 'Longitude', *REQUIRED_BANDS]))
    with np.load(filename) as data:
        assets = {key: data[key].ravel() for key in needed}

    latitude = assets['Latitude']
    longitude = assets['Longitude']
    # Size the mask from the coordinates rather than from an arbitrary key.
    indices = np.zeros(latitude.shape, dtype=bool)
    for geometry in geometries:
        longb, latb = get_bounds(geometry)
        indices |= (
            (latitude >= latb[0]) & (latitude <= latb[1])
            & (longitude >= longb[0]) & (longitude <= longb[1])
        )

    return {band: assets[band][indices] for band in REQUIRED_BANDS}

def poly_mask(assets, geometry):
    """Return the rows of ``assets`` inside the bounding box of ``geometry``.

    Despite the name, only the rectangular bounds from ``get_bounds`` are
    tested; no polygon containment check is performed.

    Parameters
    ----------
    assets : dict
        Band name -> 1-D array; must contain ``Latitude`` and ``Longitude``.
    geometry : sequence of [longitude, latitude] pairs

    Returns
    -------
    pandas.DataFrame
        One column per key of ``assets``, filtered to the selected rows.
    """
    longb, latb = get_bounds(geometry)
    lat, lon = assets['Latitude'], assets['Longitude']
    lat_ok = (lat >= latb[0]) & (lat <= latb[1])
    lon_ok = (lon >= longb[0]) & (lon <= longb[1])
    return pd.DataFrame(assets).loc[lat_ok & lon_ok]

def get_data(filename, geometries, grid_ids, data_dict):
    """Append per-grid band statistics for one granule to ``data_dict``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` granule archive.
    geometries, grid_ids : sequences of equal length
        Grid outlines and their ids, paired positionally.
    data_dict : mapping of str -> list
        Receives ``file_id`` plus ``<band>_mean`` and ``<band>_var`` entries,
        one per grid. Indexing a missing key must create a list (e.g. a
        ``defaultdict``).

    Returns
    -------
    The same ``data_dict`` object, for convenience.
    """
    assets = box_mask(filename, geometries)
    granule_stem = os.path.split(filename)[-1].replace('.npz', '')
    for geometry, grid_id in zip(geometries, grid_ids):
        cell_frame = poly_mask(assets, geometry)
        data_dict['file_id'].append(granule_stem + f"_{grid_id}")
        for key in REQUIRED_BANDS:
            samples = cell_frame[key].values
            # NaN fails both comparisons, so this keeps only the non-NaN
            # samples (non-positive values first, then positive ones).
            valid = np.concatenate((samples[samples <= 0], samples[samples > 0]))
            data_dict[f"{key}_mean"].append(valid.mean())
            data_dict[f"{key}_var"].append(valid.std() ** 2)

    return data_dict


# Compute statistics for every train granule over all grids of its location.
granule_ids = satellite_data['granule_id'].values
granule_locations = satellite_data['location'].values
for idx in tqdm(range(len(granule_ids))):
    location = granule_locations[idx]
    filename = os.path.join(DATA_DIR, granule_ids[idx].replace('.nc', '.npz'))

    grid_ids = LOC_GRIDS[location]
    geometries = [GRID_GEOMETRY[grid_id] for grid_id in grid_ids]
    total_data = get_data(filename, geometries, grid_ids, total_data)

total_data = pd.DataFrame(total_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 788/788 [06:42<00:00,  1.96it/s]


In [10]:
REQUIRED_BANDS = [
    'Latitude',
    'Longitude',
    'Aerosol_Optical_Depth',
    'Aerosol_Optical_Depth_Uncertainty',
    'Angstrom_Exponent_550_860nm',
    'Absorption_Aerosol_Optical_Depth',
    'Nonspherical_Aerosol_Optical_Depth',
    'Small_Mode_Aerosol_Optical_Depth',
    'Medium_Mode_Aerosol_Optical_Depth',
    'Large_Mode_Aerosol_Optical_Depth',
]
indices = list(range(len(train_data)))
total_train_data = defaultdict(list)

# For each label row, attach the statistics of the most recent granule at the
# same location that ended before the label timestamp plus one day.
for idx in tqdm(indices):
    el = train_data.iloc[idx]
    grid_id = el['grid_id']
    location = grid_data.loc[grid_data['grid_id'] == grid_id, 'location'].values[0]
    cur_satdata = satellite_data[satellite_data['location'] == LOCATION_MAP[location]]

    dt = datetime.datetime.strptime(el['datetime'], "%Y-%m-%dT%H:%M:%SZ")
    dt += datetime.timedelta(days=1)
    is_earlier = cur_satdata['time_end'] < dt
    possible = cur_satdata[is_earlier].sort_values('time_end', ascending=False).reset_index()

    if possible.empty:
        # No usable granule: keep the row aligned with NaN placeholders.
        for band in REQUIRED_BANDS:
            for stat in ('mean', 'var'):
                total_train_data[f"{band}_{stat}"].append(np.nan)
        total_train_data['filename'].append(np.nan)
        continue

    latest_granule = possible['granule_id'].iloc[0]
    filename = f"{latest_granule[:-3]}_{grid_id}"

    cur_data = total_data[total_data['file_id'] == filename]
    for key in REQUIRED_BANDS:
        for stat in ('mean', 'var'):
            column = f"{key}_{stat}"
            total_train_data[column].append(cur_data[column].values[0])
    total_train_data['filename'].append(filename)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34312/34312 [03:00<00:00, 190.19it/s]


In [11]:
total_train_data = pd.DataFrame(total_train_data)
# Create the output folder first so a missing directory cannot make to_csv
# fail after the long matching loop has already finished.
os.makedirs('../data/proc', exist_ok=True)
total_train_data.to_csv('../data/proc/train_misr.csv', index=False)

## Processing test data

In [16]:
import os
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import datetime
import gc;gc.enable()
from collections import defaultdict
from shapely.geometry import Point, Polygon, box
from tqdm import tqdm
warnings.simplefilter('ignore')
from pathlib import Path

# Configuration and input tables for the test split.
DATA_DIR = Path('../data/raw/proc_misr')
PRODUCT = 'misr'
SPLIT = 'test'
LOCATION_MAP = {
    'Taipei': 'tpe',
    'Delhi': 'dl',
    'Los Angeles (SoCAB)': 'la',
}

# EDIT: this holds the submission format (test rows). It keeps the name
# train_data because the matching cell further down reads that name; rename
# both together when cleaning up.
train_data = pd.read_csv('../data/submission_format.csv')
# Metadata restricted to this split and product, with time_end parsed into
# naive datetime objects (the literal "+00:00" suffix is part of the format).
satellite_data = (
    pd.read_csv('../data/pm25_satellite_metadata.csv')
    .loc[lambda frame: frame['split'] == SPLIT]
    .loc[lambda frame: frame['product'] == PRODUCT]
    .assign(
        time_end=lambda frame: frame['time_end'].apply(
            lambda stamp: datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S+00:00')
        )
    )
)
grid_data = pd.read_csv('../data/grid_metadata.csv')

In [17]:
from collections import defaultdict

# Accumulator for the per-granule, per-grid statistics.
total_data = defaultdict(list)
REQUIRED_BANDS = [
    'Latitude',
    'Longitude',
    'Aerosol_Optical_Depth',
    'Aerosol_Optical_Depth_Uncertainty',
    'Angstrom_Exponent_550_860nm',
    'Absorption_Aerosol_Optical_Depth',
    'Nonspherical_Aerosol_Optical_Depth',
    'Small_Mode_Aerosol_Optical_Depth',
    'Medium_Mode_Aerosol_Optical_Depth',
    'Large_Mode_Aerosol_Optical_Depth',
]
LOCATION_MAP = {'Taipei': 'tpe', 'Delhi': 'dl', 'Los Angeles (SoCAB)': 'la'}

# Short location code -> list of grid ids at that location.
LOC_GRIDS = {}
for location, short_code in LOCATION_MAP.items():
    at_location = grid_data['location'] == location
    LOC_GRIDS[short_code] = grid_data.loc[at_location, 'grid_id'].values.tolist()

# Grid id -> list of [x, y] pairs parsed from the first WKT string of that grid.
GRID_GEOMETRY = {}
for grid in grid_data['grid_id'].unique():
    wkt = grid_data.loc[grid_data['grid_id'] == grid, 'wkt'].values[0]
    # Strip the punctuation so only the leading tag and the numbers remain.
    for symbol in '(),':
        wkt = wkt.replace(symbol, '')
    numbers = [float(token) for token in wkt.split()[1:]]
    geometry = [numbers[i:i+2] for i in range(0, len(numbers), 2)]
    GRID_GEOMETRY[grid] = geometry

# EDIT: directory holding the converted .npz granules (rebinds DATA_DIR to a
# plain string).
DATA_DIR = '../data/raw/proc_misr'

def box_mask(filename, geometries):
    """Load one converted granule and keep pixels inside any grid bounding box.

    Parameters
    ----------
    filename : str
        Path of an ``.npz`` archive containing ``Latitude``, ``Longitude`` and
        every name in ``REQUIRED_BANDS``.
    geometries : iterable of coordinate lists
        Each entry is passed to ``get_bounds``; a pixel is kept when it lies
        inside at least one of the resulting boxes.

    Returns
    -------
    dict
        ``{band: 1-D array}`` for each band in ``REQUIRED_BANDS``, restricted
        to the selected pixels.
    """
    # Read only what is needed and close the archive immediately; arrays that
    # are stored in the file but not requested are never decompressed.
    needed = list(dict.fromkeys(['Latitude', 'Longitude', *REQUIRED_BANDS]))
    with np.load(filename) as data:
        assets = {key: data[key].ravel() for key in needed}

    latitude = assets['Latitude']
    longitude = assets['Longitude']
    # Size the mask from the coordinates rather than from an arbitrary key.
    indices = np.zeros(latitude.shape, dtype=bool)
    for geometry in geometries:
        longb, latb = get_bounds(geometry)
        indices |= (
            (latitude >= latb[0]) & (latitude <= latb[1])
            & (longitude >= longb[0]) & (longitude <= longb[1])
        )

    return {band: assets[band][indices] for band in REQUIRED_BANDS}

def poly_mask(assets, geometry):
    """Return the rows of ``assets`` inside the bounding box of ``geometry``.

    Despite the name, only the rectangular bounds from ``get_bounds`` are
    tested; no polygon containment check is performed.

    Parameters
    ----------
    assets : dict
        Band name -> 1-D array; must contain ``Latitude`` and ``Longitude``.
    geometry : sequence of [longitude, latitude] pairs

    Returns
    -------
    pandas.DataFrame
        One column per key of ``assets``, filtered to the selected rows.
    """
    longb, latb = get_bounds(geometry)
    lat, lon = assets['Latitude'], assets['Longitude']
    lat_ok = (lat >= latb[0]) & (lat <= latb[1])
    lon_ok = (lon >= longb[0]) & (lon <= longb[1])
    return pd.DataFrame(assets).loc[lat_ok & lon_ok]

def get_data(filename, geometries, grid_ids, data_dict):
    """Append per-grid band statistics for one granule to ``data_dict``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` granule archive.
    geometries, grid_ids : sequences of equal length
        Grid outlines and their ids, paired positionally.
    data_dict : mapping of str -> list
        Receives ``file_id`` plus ``<band>_mean`` and ``<band>_var`` entries,
        one per grid. Indexing a missing key must create a list (e.g. a
        ``defaultdict``).

    Returns
    -------
    The same ``data_dict`` object, for convenience.
    """
    assets = box_mask(filename, geometries)
    granule_stem = os.path.split(filename)[-1].replace('.npz', '')
    for geometry, grid_id in zip(geometries, grid_ids):
        cell_frame = poly_mask(assets, geometry)
        data_dict['file_id'].append(granule_stem + f"_{grid_id}")
        for key in REQUIRED_BANDS:
            samples = cell_frame[key].values
            # NaN fails both comparisons, so this keeps only the non-NaN
            # samples (non-positive values first, then positive ones).
            valid = np.concatenate((samples[samples <= 0], samples[samples > 0]))
            data_dict[f"{key}_mean"].append(valid.mean())
            data_dict[f"{key}_var"].append(valid.std() ** 2)

    return data_dict


# Compute statistics for every test granule over all grids of its location.
granule_ids = satellite_data['granule_id'].values
granule_locations = satellite_data['location'].values
for idx in tqdm(range(len(granule_ids))):
    location = granule_locations[idx]
    filename = os.path.join(DATA_DIR, granule_ids[idx].replace('.nc', '.npz'))

    grid_ids = LOC_GRIDS[location]
    geometries = [GRID_GEOMETRY[grid_id] for grid_id in grid_ids]
    total_data = get_data(filename, geometries, grid_ids, total_data)

total_data = pd.DataFrame(total_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [01:52<00:00,  2.04it/s]


In [18]:
REQUIRED_BANDS = [
    'Latitude',
    'Longitude',
    'Aerosol_Optical_Depth',
    'Aerosol_Optical_Depth_Uncertainty',
    'Angstrom_Exponent_550_860nm',
    'Absorption_Aerosol_Optical_Depth',
    'Nonspherical_Aerosol_Optical_Depth',
    'Small_Mode_Aerosol_Optical_Depth',
    'Medium_Mode_Aerosol_Optical_Depth',
    'Large_Mode_Aerosol_Optical_Depth',
]
# train_data holds the submission-format rows in this section.
indices = list(range(len(train_data)))
total_train_data = defaultdict(list)

# For each row, attach the statistics of the most recent granule at the same
# location that ended before the row timestamp plus one day.
for idx in tqdm(indices):
    el = train_data.iloc[idx]
    grid_id = el['grid_id']
    location = grid_data.loc[grid_data['grid_id'] == grid_id, 'location'].values[0]
    cur_satdata = satellite_data[satellite_data['location'] == LOCATION_MAP[location]]

    dt = datetime.datetime.strptime(el['datetime'], "%Y-%m-%dT%H:%M:%SZ")
    dt += datetime.timedelta(days=1)
    is_earlier = cur_satdata['time_end'] < dt
    possible = cur_satdata[is_earlier].sort_values('time_end', ascending=False).reset_index()

    if possible.empty:
        # No usable granule: keep the row aligned with NaN placeholders.
        for band in REQUIRED_BANDS:
            for stat in ('mean', 'var'):
                total_train_data[f"{band}_{stat}"].append(np.nan)
        total_train_data['filename'].append(np.nan)
        continue

    latest_granule = possible['granule_id'].iloc[0]
    filename = f"{latest_granule[:-3]}_{grid_id}"

    cur_data = total_data[total_data['file_id'] == filename]
    for key in REQUIRED_BANDS:
        for stat in ('mean', 'var'):
            column = f"{key}_{stat}"
            total_train_data[column].append(cur_data[column].values[0])
    total_train_data['filename'].append(filename)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13504/13504 [00:42<00:00, 319.40it/s]


In [19]:
total_train_data = pd.DataFrame(total_train_data)
# Create the output folder first so a missing directory cannot make to_csv
# fail after the matching loop has already finished.
os.makedirs('../data/proc', exist_ok=True)
total_train_data.to_csv('../data/proc/test_misr.csv', index=False)