In [1]:
import os
import scipy.io
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from functions import crism_to_mat, filter_bad_pixels, remove_spikes_column

In [2]:
# Read both MATLAB datasets from disk: `data_a` comes from the "bland unratioed"
# file and `data_b` from the "labeled pixels ratioed" file.
data_a, data_b = map(
    scipy.io.loadmat,
    (
        'datasets/CRISM_bland_unratioed.mat',
        'datasets/CRISM_labeled_pixels_ratioed.mat',
    ),
)

def get_id(x):
    """Return the first five items of the first element of ``x``.

    For a container whose first element is a string, this is that string's
    first five characters (fewer if the string is shorter).
    """
    first_entry = x[0]
    return first_entry[:5]
# Element-wise wrapper so get_id can be applied to every entry of an array.
vfunc = np.vectorize(get_id)

# --- Dataset A ---
# Numerical ID of the original image for every pixel.
pixims_a = data_a['pixims'].flatten()
# (x, y) point coordinates in the original image.
coordinates_a = data_a['pixcrds']
# Image names cut down by get_id, flattened to 1-D and upper-cased.
im_names_a = np.char.upper(vfunc(data_a['im_names']).flatten().astype(str))

# --- Dataset B ---
pixims_b = data_b['pixims'].flatten()
coordinates_b = data_b['pixcrds']
# Per-pixel label taken from the 'pixlabs' field.
labels_b = data_b['pixlabs'].flatten()
# Names here are used as stored (no get_id truncation), only flattened and upper-cased.
im_names_b = np.char.upper(np.array(data_b['im_names']).flatten().astype(str))

In [3]:
# numerical_id values are looked up as 1-based positions into im_names_a.
image_id_mapping = {
    position: name for position, name in enumerate(im_names_a, start=1)
}

# One row per pixel of dataset A: source-image number, coordinates, and the
# image name resolved through the mapping above.
bland_df_info = pd.DataFrame(
    {
        'numerical_id': pixims_a,
        'x': coordinates_a[:, 0],
        'y': coordinates_a[:, 1],
    }
).assign(image_id=lambda frame: frame['numerical_id'].map(image_id_mapping))
bland_df_info

Unnamed: 0,numerical_id,x,y,image_id
0,1,89,215,0289E
1,1,74,423,0289E
2,1,54,155,0289E
3,1,89,216,0289E
4,1,50,358,0289E
...,...,...,...,...
337612,340,95,27,39990
337613,340,56,34,39990
337614,340,48,32,39990
337615,340,33,26,39990


In [4]:
# numerical_id values are looked up as 1-based positions into im_names_b.
image_id_mapping = {
    position: name for position, name in enumerate(im_names_b, start=1)
}

# One row per pixel of dataset B: source-image number, label, coordinates, and
# the image name resolved through the mapping above.
mineral_df_info = pd.DataFrame(
    {
        'numerical_id': pixims_b,
        'label': labels_b,
        'x': coordinates_b[:, 0],
        'y': coordinates_b[:, 1],
    }
).assign(image_id=lambda frame: frame['numerical_id'].map(image_id_mapping))
mineral_df_info

Unnamed: 0,numerical_id,label,x,y,image_id
0,1,9,275,3,027E2
1,1,9,276,4,027E2
2,1,9,272,5,027E2
3,1,9,273,5,027E2
4,1,9,278,5,027E2
...,...,...,...,...,...
592408,70,39,0,0,21B59
592409,70,39,0,0,21B59
592410,70,39,0,0,21B59
592411,70,39,0,0,21B59


In [5]:
def create_image_paths(directory_path):
    """Map image IDs to the ``.hdr`` files found anywhere under a directory.

    The tree rooted at ``directory_path`` is walked recursively. Every file
    whose name ends in ``.hdr`` (case-insensitive) contributes one entry: the
    key is characters 6-10 of the file name, upper-cased, and the value is the
    file's path joined onto the folder it was found in.

    If several header files produce the same key, the one visited last wins.

    Returns:
        dict mapping the extracted ID string to the header file path.
    """
    header_files = (
        (filename, os.path.join(folder, filename))
        for folder, _subdirs, filenames in os.walk(directory_path)
        for filename in filenames
        if filename.lower().endswith('.hdr')
    )
    return {
        filename[6:11].upper(): full_path
        for filename, full_path in header_files
    }

In [6]:
# Directory holding the images for dataset A. The default is the original
# machine-specific location; set CRISM_BLAND_DIR to run the notebook elsewhere
# without editing this cell.
image_paths = create_image_paths(
    os.environ.get('CRISM_BLAND_DIR', '/Users/wj/Desktop/CRISM_DATA/A')
)

def process_image(image_path, id, group):
    """Collect the cleaned spectrum of every requested pixel of one image.

    The image is loaded with ``crism_to_mat``; its 'IF' entry is passed through
    ``filter_bad_pixels`` and then ``remove_spikes_column`` once, and the
    cleaned data is sampled at each coordinate listed in ``group``.

    Parameters:
        image_path: path handed to ``crism_to_mat``.
        id: image identifier copied into every result record. (The name
            shadows the builtin; it is kept so existing callers keep working.)
        group: DataFrame with 'x' and 'y' columns, one row per pixel to extract.

    Returns:
        A list with one dict per row of ``group``, with keys 'image_id', 'x',
        'y' and 'spectra'. 'spectra' is the flattened selection of the cleaned
        data at the positions where ``mat['x']``/``mat['y']`` equal the
        requested coordinates; it is empty when nothing matches.
    """
    mat = crism_to_mat(image_path)
    if_ = filter_bad_pixels(mat['IF'])
    # NOTE(review): the meaning of the literal 3 is defined by
    # remove_spikes_column, which is not visible here - confirm.
    if_ = remove_spikes_column(mat, if_, 3)

    results = []
    # Only 'x' and 'y' are read, so iterate those two columns directly instead
    # of materialising a Series per row with iterrows().
    for x, y in zip(group['x'], group['y']):
        spectra_index = np.where((mat['x'] == x) & (mat['y'] == y))[0]
        spectra = if_[spectra_index].flatten()
        results.append({'image_id': id, 'x': x, 'y': y, 'spectra': spectra})
    return results

grouped = bland_df_info.groupby('image_id')

# Schedule only images that have at least one pixel in bland_df_info:
# get_group raises KeyError for an image that exists on disk but has no rows
# in the table, which would abort the whole run before any work is done.
known_ids = grouped.groups
tasks = [
    (im_path, image_id, grouped.get_group(image_id))
    for image_id, im_path in image_paths.items()
    if image_id in known_ids
]
results = Parallel(n_jobs=-1, verbose=10)(delayed(process_image)(*task) for task in tasks)

# Flatten the per-image lists into one list of per-pixel records
flat_results = [item for sublist in results for item in sublist]

# Convert each 'spectra' ndarray to a plain list
for res in flat_results:
    res['spectra'] = res['spectra'].tolist()

# Convert the list of dictionaries to a DataFrame
bland_df_data = pd.DataFrame(flat_results)

# Expand the 'spectra' lists into one column per sample. Building the frame in
# a single call avoids creating one Series per row as .apply(pd.Series) does;
# shorter lists are padded with NaN in both cases.
spectra_columns = pd.DataFrame(bland_df_data['spectra'].tolist(), index=bland_df_data.index)
bland_df_data = pd.concat([bland_df_data.drop('spectra', axis=1), spectra_columns], axis=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

In [7]:
# Display the bland-pixel table built in the previous cell: image_id, x, y,
# followed by one integer-named column per element of each extracted spectrum.
bland_df_data

Unnamed: 0,image_id,x,y,0,1,2,3,4,5,6,...,428,429,430,431,432,433,434,435,436,437
0,129D1,42,234,0.183240,0.183240,0.177040,0.181723,0.176845,0.182095,0.176937,...,0.388368,0.390600,0.394394,0.398004,0.402821,0.183240,0.183240,0.183240,0.183240,0.0
1,129D1,33,156,0.183240,0.183240,0.179713,0.177953,0.174137,0.177956,0.171644,...,0.412477,0.415097,0.417611,0.421339,0.430918,0.183240,0.183240,0.183240,0.183240,0.0
2,129D1,78,78,0.183240,0.183240,0.177950,0.170932,0.168615,0.170199,0.166566,...,0.394005,0.401058,0.406532,0.407522,0.410457,0.419562,0.183240,0.183240,0.183240,0.0
3,129D1,42,147,0.183240,0.183240,0.179855,0.182989,0.178902,0.182606,0.177878,...,0.418683,0.423789,0.430713,0.436174,0.441805,0.183240,0.183240,0.183240,0.183240,0.0
4,129D1,42,235,0.183240,0.183240,0.172990,0.176540,0.171127,0.176303,0.171870,...,0.375548,0.377796,0.379615,0.383085,0.388312,0.183240,0.183240,0.183240,0.183240,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322895,20B25,104,14,0.206947,0.206947,0.248876,0.241999,0.238517,0.240431,0.236606,...,0.245979,0.245426,0.244673,0.244643,0.243589,0.243315,0.206947,0.206947,0.206947,0.0
322896,20B25,614,11,0.206947,0.206947,0.215776,0.212822,0.211004,0.212094,0.208439,...,0.189085,0.190287,0.190405,0.190339,0.191096,0.190989,0.206947,0.206947,0.206947,0.0
322897,20B25,166,12,0.206947,0.206947,0.245765,0.246467,0.238595,0.242520,0.242089,...,0.257548,0.261690,0.265692,0.268356,0.270828,0.265770,0.206947,0.206947,0.206947,0.0
322898,20B25,54,506,0.206947,0.206947,0.239994,0.233253,0.232624,0.233809,0.233071,...,0.201306,0.205241,0.208596,0.208917,0.205818,0.206947,0.206947,0.206947,0.206947,0.0


In [8]:
# Directory holding the images for dataset B. The default is the original
# machine-specific location; set CRISM_MINERAL_DIR to run the notebook
# elsewhere without editing this cell.
image_paths = create_image_paths(
    os.environ.get('CRISM_MINERAL_DIR', '/Users/wj/Desktop/CRISM_DATA/B')
)

def process_image(image_path, id, group, skip_label=39):
    """Collect the cleaned spectrum of every labeled pixel of one image.

    The image is loaded with ``crism_to_mat``; its 'IF' entry is passed through
    ``filter_bad_pixels`` and then ``remove_spikes_column`` once, and the
    cleaned data is sampled at each coordinate listed in ``group``. Rows whose
    'label' equals ``skip_label`` are left out.

    Parameters:
        image_path: path handed to ``crism_to_mat``.
        id: image identifier copied into every result record. (The name
            shadows the builtin; it is kept so existing callers keep working.)
        group: DataFrame with 'label', 'x' and 'y' columns, one row per pixel.
        skip_label: label value whose rows are ignored. Defaults to 39, the
            value previously hard-coded here.
            NOTE(review): what label 39 stands for is not visible in this
            notebook - confirm against the dataset documentation.

    Returns:
        A list with one dict per kept row, with keys 'image_id', 'x', 'y' and
        'spectra'. 'spectra' is the flattened selection of the cleaned data at
        the positions where ``mat['x']``/``mat['y']`` equal the requested
        coordinates; it is empty when nothing matches.
    """
    mat = crism_to_mat(image_path)
    if_ = filter_bad_pixels(mat['IF'])
    # NOTE(review): the meaning of the literal 3 is defined by
    # remove_spikes_column, which is not visible here - confirm.
    if_ = remove_spikes_column(mat, if_, 3)

    results = []
    # Only three columns are read, so iterate them directly instead of
    # materialising a Series per row with iterrows().
    for label, x, y in zip(group['label'], group['x'], group['y']):
        if label == skip_label:
            continue
        spectra_index = np.where((mat['x'] == x) & (mat['y'] == y))[0]
        spectra = if_[spectra_index].flatten()
        results.append({'image_id': id, 'x': x, 'y': y, 'spectra': spectra})
    return results

grouped = mineral_df_info.groupby('image_id')

# Schedule only images that have at least one pixel in mineral_df_info:
# get_group raises KeyError for an image that exists on disk but has no rows
# in the table, which would abort the whole run before any work is done.
known_ids = grouped.groups
tasks = [
    (im_path, image_id, grouped.get_group(image_id))
    for image_id, im_path in image_paths.items()
    if image_id in known_ids
]
results = Parallel(n_jobs=-1, verbose=10)(delayed(process_image)(*task) for task in tasks)

# Flatten the per-image lists into one list of per-pixel records
flat_results = [item for sublist in results for item in sublist]

# Convert each 'spectra' ndarray to a plain list
for res in flat_results:
    res['spectra'] = res['spectra'].tolist()

# Convert the list of dictionaries to a DataFrame
mineral_df_data = pd.DataFrame(flat_results)

# Expand the 'spectra' lists into one column per sample. Building the frame in
# a single call avoids creating one Series per row as .apply(pd.Series) does;
# shorter lists are padded with NaN in both cases.
spectra_columns = pd.DataFrame(mineral_df_data['spectra'].tolist(), index=mineral_df_data.index)
mineral_df_data = pd.concat([mineral_df_data.drop('spectra', axis=1), spectra_columns], axis=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  66 out of  77 | elapsed:  3.4min remaining:   34.3s
[Parallel(n_jobs=-1)]: Done  74 out of  77 | elapsed:  3.7min remaining:    9.1s
[Parallel(n_jobs=-1)]: Done  77 out of  77 | elapsed:  3.8min finished


In [9]:
# Display the mineral-pixel table built in the previous cell: image_id, x, y,
# followed by one integer-named column per element of each extracted spectrum.
mineral_df_data

Unnamed: 0,image_id,x,y,0,1,2,3,4,5,6,...,428,429,430,431,432,433,434,435,436,437
0,0454E,149,64,0.166843,0.166843,0.190084,0.189474,0.185760,0.186125,0.184457,...,0.341645,0.343397,0.348540,0.353838,0.355607,0.354750,0.166843,0.166843,0.166843,0.0
1,0454E,150,64,0.166843,0.166843,0.192429,0.191811,0.187877,0.189394,0.187766,...,0.326666,0.329100,0.334543,0.340443,0.343856,0.343535,0.166843,0.166843,0.166843,0.0
2,0454E,151,64,0.166843,0.166843,0.193785,0.193023,0.189565,0.190782,0.189236,...,0.334110,0.335957,0.341401,0.348759,0.355327,0.354001,0.166843,0.166843,0.166843,0.0
3,0454E,153,64,0.166843,0.166843,0.192751,0.191769,0.187534,0.188406,0.186801,...,0.324301,0.327111,0.333029,0.339610,0.343653,0.342047,0.166843,0.166843,0.166843,0.0
4,0454E,160,64,0.166843,0.166843,0.193163,0.192942,0.189197,0.189057,0.187697,...,0.312832,0.317719,0.325508,0.329753,0.329102,0.325061,0.166843,0.166843,0.166843,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586594,05814,59,147,0.161424,0.161424,0.165725,0.164123,0.161058,0.160869,0.160224,...,0.358412,0.358010,0.359653,0.364118,0.368421,0.161424,0.161424,0.161424,0.161424,0.0
586595,05814,52,148,0.161424,0.161424,0.172172,0.170590,0.168009,0.167722,0.166930,...,0.397253,0.395572,0.396083,0.399904,0.405905,0.161424,0.161424,0.161424,0.161424,0.0
586596,05814,57,148,0.161424,0.161424,0.166005,0.164945,0.161472,0.161788,0.160700,...,0.364930,0.364860,0.364178,0.365183,0.368164,0.161424,0.161424,0.161424,0.161424,0.0
586597,05814,57,149,0.161424,0.161424,0.161848,0.160963,0.157673,0.158131,0.156798,...,0.362305,0.362013,0.361641,0.362803,0.366092,0.161424,0.161424,0.161424,0.161424,0.0


In [10]:
# Binary target for the combined dataset: 1 marks rows from the bland table,
# 0 marks rows from the mineral table.
bland_df_data['label'] = 1
mineral_df_data['label'] = 0

# Each input frame carries its own 0..n-1 row index, so a plain concat would
# leave duplicate row labels in the result. Renumber the rows so every label
# in the saved frame identifies exactly one pixel.
df_data = pd.concat([bland_df_data, mineral_df_data], ignore_index=True)
df_data.to_pickle('df_data.pkl')