In [1]:
import pandas as pd
import numpy as np
import PIL
from PIL import Image
import dask

# Setting the limit to None switches off Pillow's oversized-image
# ("decompression bomb") check, so very large images open without a warning or error.
Image.MAX_IMAGE_PIXELS = None

In [2]:
! pip install dask



In [3]:
import dask.array as da
from dask import delayed


In [4]:
import os
import gzip
import zipfile
import requests
import matplotlib.pyplot as plt

## Data Download

In [5]:
# Endpoint requested in the next cell; the reply is later parsed as JSON and
# searched for the OSD-352 study's file list.
base_url = "https://osdr.nasa.gov/osdr/data/osd/files/352.7"
# Ask the server for a JSON response.
headers = {"Accept": "application/json"}

In [6]:
# A timeout keeps the notebook from hanging forever if the server never answers.
response = requests.get(base_url, headers=headers, timeout=30)

In [7]:
# Report the outcome either way so a failed request is not silently ignored.
if response.status_code == 200:
    print("Success")
else:
    print(f"Request failed with status code {response.status_code}")

Success


In [8]:
# Walk the JSON payload down to the file list of study OSD-352.
# Each .get() falls back to an empty container, so a missing key yields an
# empty study_files list instead of raising KeyError.
data = response.json()
studies = data.get("studies", {})
osd_study = studies.get("OSD-352", {})
study_files = osd_study.get("study_files", [])

In [None]:
def _download_study_files(study_files, extension, save_dir, timeout=60):
    """Download the study files whose name ends with ``extension`` into ``save_dir``.

    A file that already exists in ``save_dir`` is skipped, so re-running the
    cell only fetches what is still missing (for example after an interrupted
    run). Each download is streamed to a temporary ``.part`` file and renamed
    once complete, so a half-written file is never mistaken for a finished one.

    Args:
        study_files: List of dicts with ``file_name`` and ``remote_url`` keys.
        extension: File-name suffix to select, e.g. ``'.csv'``.
        save_dir: Directory that receives the downloaded files.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of files newly saved. Progress and failures are printed.
    """
    saved_count = 0

    for file in study_files:
        # Entries without a name are ignored instead of raising AttributeError.
        file_name = file.get('file_name') or ''
        if not file_name.endswith(extension):
            continue

        remote_url = file.get('remote_url')
        if not remote_url:
            print(f"Failed to download {file_name}: no remote_url in the file listing\n")
            continue

        # basename() keeps a server-supplied name from escaping save_dir.
        file_path = os.path.join(save_dir, os.path.basename(file_name))
        if os.path.exists(file_path):
            continue

        file_url = f"https://osdr.nasa.gov{remote_url}"
        partial_path = f"{file_path}.part"
        try:
            # stream=True avoids holding a whole (possibly very large) image in memory.
            with requests.get(file_url, stream=True, timeout=timeout) as file_response:
                if file_response.status_code != 200:
                    print(f"Failed to download {file_name}: {file_response.status_code}\n")
                    continue
                os.makedirs(save_dir, exist_ok=True)
                with open(partial_path, "wb") as f:
                    for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        except (requests.RequestException, OSError) as exc:
            print(f"Failed to download {file_name}: {exc}\n")
            if os.path.exists(partial_path):
                os.remove(partial_path)
            continue

        os.replace(partial_path, file_path)
        saved_count += 1
        print(f"Saved: {file_path}\n")

    return saved_count


# (file-name suffix, destination directory) for each kind of study file.
# JPGs go to 'rr3_dataset/input' because the preprocessing cells read the
# images from that folder.
download_targets = [
    ('.csv', 'rr3_dataset/csvs'),
    ('.jpg', 'rr3_dataset/input'),
    ('.tiff', 'rr3_dataset/tiff_input'),
]

newly_saved = 0
for extension, save_dir in download_targets:
    newly_saved += _download_study_files(study_files, extension, save_dir)

if newly_saved == 0:
    print("No new files were downloaded.")
else:
    print(f"Downloaded {newly_saved} new file(s).")


Directories missing. Running the download script.
GLDS-352_snATAC-Seq_CG9_per_barcode_metrics.csv
Downloading: GLDS-352_snATAC-Seq_CG9_per_barcode_metrics.csv
Saved: rr3_dataset/csvs/GLDS-352_snATAC-Seq_CG9_per_barcode_metrics.csv

GLDS-352_snATAC-Seq_CG8_per_barcode_metrics.csv
Downloading: GLDS-352_snATAC-Seq_CG8_per_barcode_metrics.csv
Saved: rr3_dataset/csvs/GLDS-352_snATAC-Seq_CG8_per_barcode_metrics.csv

GLDS-352_snATAC-Seq_CF7_per_barcode_metrics.csv
Downloading: GLDS-352_snATAC-Seq_CF7_per_barcode_metrics.csv
Saved: rr3_dataset/csvs/GLDS-352_snATAC-Seq_CF7_per_barcode_metrics.csv

GLDS-352_snATAC-Seq_CF2_per_barcode_metrics.csv
Downloading: GLDS-352_snATAC-Seq_CF2_per_barcode_metrics.csv
Saved: rr3_dataset/csvs/GLDS-352_snATAC-Seq_CF2_per_barcode_metrics.csv

GLDS-352_snATAC-Seq_CF1_per_barcode_metrics.csv
Downloading: GLDS-352_snATAC-Seq_CF1_per_barcode_metrics.csv
Saved: rr3_dataset/csvs/GLDS-352_snATAC-Seq_CF1_per_barcode_metrics.csv

GLDS-352_SpatialTranscriptomics_NASA-RR3

## Image Labeling

In [None]:
# Condition label for each spatial-transcriptomics JPG, keyed by bare file name.
# NOTE(review): Sample_159 is labelled 'Ground Control' for A1/B1 but
# 'Space Flight' for C1/D1, while samples 158 and 304 carry a single label —
# confirm against the study metadata.
rr3_label_dict = {
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_A1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_B1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_C1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_D1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_159_A1.jpg': 'Ground Control',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_159_B1.jpg': 'Ground Control',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_159_C1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_159_D1.jpg': 'Space Flight',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_304_A1.jpg': 'Ground Control',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_304_B1.jpg': 'Ground Control',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_304_C1.jpg': 'Ground Control',
    'GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_304_D1.jpg': 'Ground Control',
}

# Two-column table: dict keys become 'image_file_path', dict values become 'label'.
rr3_labels = pd.DataFrame({
    'image_file_path': list(rr3_label_dict.keys()),
    'label': list(rr3_label_dict.values()),
})

rr3_labels.head()

## Image Preprocessing

In [None]:
# Print the pixel dimensions of every JPG found in the input folder.
folder_path = 'rr3_dataset/input'

all_files = os.listdir(folder_path)
all_files.sort()

for file_name in all_files:
    # Anything that is not a .jpg (case-insensitive) is ignored.
    if not file_name.lower().endswith('.jpg'):
        continue

    file_path = os.path.join(folder_path, file_name)
    try:
        with Image.open(file_path) as img:
            width, height = img.size
            print(f"{file_name}, Dimensions: {width}x{height}")
    except Exception as e:
        # Keep going with the remaining files if one cannot be read.
        print(f"Error processing {file_name}: {e}")

In [None]:
# Show the first rows of the label table again before preprocessing.
rr3_labels.head()

In [None]:
# Folder that receives the preprocessed images saved as .npy arrays.
output_folder = "rr3_dataset/preprocessed_images"
# Folder the source JPGs are read from.
image_folder = "rr3_dataset/input"

In [None]:
# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Final size handed to Image.resize for every preprocessed image.
image_size = (224, 224)

In [None]:
# Example file: preview the two-step downscale on a single image.

image_path = 'rr3_dataset/input/GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_A1.jpg'

# Open through a context manager so the file handle is released as soon as
# the first resized copy exists (the original left the file open).
with Image.open(image_path) as img:
    # First shrink to a quarter of the original width and height...
    intermediate_size = (img.size[0] // 4, img.size[1] // 4)
    img_intermediate = img.resize(intermediate_size, Image.Resampling.NEAREST)

# ...then down to the final target size.
img_resized = img_intermediate.resize(image_size, Image.Resampling.NEAREST)

# Divide by 255 to turn the pixel values into floats.
img_array = np.array(img_resized) / 255.0

plt.imshow(img_array)
plt.show()

In [None]:
import cv2

image_data = []
image_labels = []

# One CLAHE operator serves every image; it does not need rebuilding per iteration.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) # ADAPTIVE HE

# `idx` is the DataFrame row label. The loop deliberately avoids binding `_`,
# which IPython reserves for the most recent cell output.
for idx, row in rr3_labels.iterrows():
    file_name = row["image_file_path"]
    label = row["label"]

    file_path = os.path.join(image_folder, file_name)
    print(file_path)
    if not os.path.exists(file_path):
        # Say so explicitly instead of skipping the row without a trace.
        print(f"File not found, skipping: {file_path}")
        continue

    try:
        with Image.open(file_path) as img:
            # Grayscale first: CLAHE is applied to a single-channel array.
            img_gray = img.convert("L")

            img_array_gray = np.array(img_gray)

            # equalized_img_array = cv2.equalizeHist(img_array_gray) # HISTOGRAM EQUALIZATION

            equalized_img_array = clahe.apply(img_array_gray)

            equalized_img = Image.fromarray(equalized_img_array)

            # Two-step downscale: a quarter of the original size, then the target size.
            intermediate_size = (equalized_img.size[0] // 4, equalized_img.size[1] // 4)
            img_intermediate = equalized_img.resize(intermediate_size, Image.Resampling.NEAREST)
            img_resized = img_intermediate.resize(image_size, Image.Resampling.NEAREST)

            # Divide by 255 to store the pixels as floats.
            img_array = np.array(img_resized) / 255.0

            output_image_path = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}.npy")
            np.save(output_image_path, img_array)

            # Record where the preprocessed array was written for this row.
            rr3_labels.loc[idx, "processed_file_path"] = output_image_path

        # Drop the per-image intermediates before the next iteration.
        del img_intermediate
        del img_resized
        del img_array

    except Exception as e:
        print(f"Error processing {file_name}: {e}")

rr3_dataset/input/GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_A1.jpg
rr3_dataset/input/GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_B1.jpg
rr3_dataset/input/GLDS-352_SpatialTranscriptomics_NASA-RR3_Sample_158_C1.jpg


In [None]:
# Alternate method: run resizing and normalization as dask.delayed tasks (no grayscale conversion or CLAHE in this variant)

def process_image(file_path, output_folder, image_size):
    """Resize one image, divide its pixels by 255 and save the result as ``.npy``.

    Args:
        file_path: Path of the image to open.
        output_folder: Folder that receives the ``.npy`` file.
        image_size: Size passed to ``Image.resize``.

    Returns:
        The path of the saved ``.npy`` file, or a string beginning with
        ``"Error processing"`` if any step raises an exception.
    """
    print(file_path)
    try:
        with Image.open(file_path) as source:
            shrunk = source.resize(image_size, Image.Resampling.NEAREST)
            scaled = np.array(shrunk) / 255.0

            # Output file keeps the input's base name with a .npy extension.
            stem = os.path.splitext(os.path.basename(file_path))[0]
            target_path = os.path.join(output_folder, f"{stem}.npy")

            np.save(target_path, scaled)
            return target_path
    except Exception as err:
        # Failures are reported through the return value rather than raised.
        return f"Error processing {file_path}: {err}"

# Build one delayed task per image that exists on disk. The DataFrame row label
# is recorded next to each task: rows whose file is missing get no task, so the
# position of a result in `results` is NOT a valid row label.
delayed_tasks = []
task_row_labels = []
for row_label, row in rr3_labels.iterrows():
    file_name = row["image_file_path"]
    file_path = os.path.join(image_folder, file_name)
    if os.path.exists(file_path):
        delayed_task = delayed(process_image)(file_path, output_folder, image_size)
        delayed_tasks.append(delayed_task)
        task_row_labels.append(row_label)

# Run all tasks; results come back in the same order as delayed_tasks.
results = dask.compute(*delayed_tasks)

# Write each output path back to the row it was computed for.
for row_label, result in zip(task_row_labels, results):
    if isinstance(result, str) and result.startswith("Error"):
        print(result)
    else:
        rr3_labels.loc[row_label, "processed_file_path"] = result