# Task
Analyze the spatial omics data using ontology mining techniques as described in the notebook "02_ontology_mining.ipynb".

## Data loading and preprocessing

### Subtask:
Load the spatial omics data and perform initial preprocessing steps. This may include normalization, scaling, or handling missing values, depending on the data format and the specific ontology mining techniques to be used.


**Reasoning**:
Load the spatial omics data from the specified file into an AnnData object and display its structure.



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Standard-library helper used for the directory listing below.
import os

# Root of the shared "Ovary" folder, reached through the shortcut inside
# the personal Drive mounted under /content/drive.
university_folder_path = '/content/drive/My Drive/Ovary'

# Sanity check: print the folder's entries to confirm that the mount and the
# shortcut both resolve.
print(os.listdir(university_folder_path))

['HBM539.JDPH.785', 'HBM853.LCNF.879']


In [None]:
import numpy as np
import pandas as pd
import os
import anndata as ad
import scanpy as sc
import squidpy as sq

# Record library versions in the cell output so the run can be reproduced.
sc.logging.print_header()
print(f"squidpy=={sq.__version__}")

# Folder of the sample to analyse, inside the mounted Drive.
# NOTE: this rebinds `university_folder_path` from the parent "Ovary" folder to
# one sample folder; later cells that build paths from it depend on which
# assignment ran last.
university_folder_path = '/content/drive/My Drive/Ovary/HBM539.JDPH.785'

# Assuming the spatial omics data is stored as 'expr.h5ad' in that folder;
# adjust the filename if it is different.
data_file_path = os.path.join(university_folder_path, 'expr.h5ad')

# Load the AnnData object. On failure, print a hint and then re-raise: swallowing
# the error would leave `adata` undefined (or stale from a previous run) and make
# every later cell fail in a confusing way.
try:
    adata = ad.read_h5ad(data_file_path)
except FileNotFoundError:
    print(f"Error: The file '{data_file_path}' was not found.")
    print("Please make sure the filename and path are correct.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise
else:
    # Show the structure (obs/var/uns/obsm/layers) of the loaded object.
    print(adata)

  return _bootstrap._gcd_import(name[level:], package, level)

stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.11/dist-packages/numba_cuda/numba/cuda/cudadrv/driver.py", line 314, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


squidpy==1.6.5
AnnData object with n_obs × n_vars = 4990 × 60286
    obs: 'Tissue Coverage Fraction'
    var: 'hugo_symbol'
    uns: 'X_spatial_units', 'spatial'
    obsm: 'X_spatial', 'X_spatial_gpr', 'spatial'
    layers: 'spliced', 'spliced_unspliced_sum', 'unspliced'


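Now that the data is loaded, we will perform the initial preprocessing steps with `scanpy`: filtering low-quality observations and rarely detected genes, normalizing total counts, log-transforming, and scaling. Missing-value handling is not applied by default; the cell below only describes how to add it if the matrix contains NaN or Inf values.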

In [None]:
# Preprocess `adata` in place: filter, normalize, log-transform, then scale.
# The thresholds are generic starting points and may need adjusting for the
# specific dataset.

# Remember the starting size so an over-aggressive filter can be reported clearly.
n_obs_before, n_vars_before = adata.shape

# Basic filtering: drop observations with fewer than 200 detected genes, then
# genes detected in fewer than 3 observations.
sc.pp.filter_cells(adata, min_genes=200)
if adata.n_obs == 0:
    # Fail fast: without this guard the remaining steps run on an empty matrix
    # and still report "Preprocessing complete.".
    raise ValueError(
        f"filter_cells(min_genes=200) removed all {n_obs_before} observations; "
        "check that adata.X holds the expected count matrix or lower min_genes."
    )

sc.pp.filter_genes(adata, min_cells=3)
if adata.n_vars == 0:
    raise ValueError(
        f"filter_genes(min_cells=3) removed all {n_vars_before} genes; "
        "check that adata.X holds the expected count matrix or lower min_cells."
    )

# Normalization: normalize total counts per observation to 1e4.
sc.pp.normalize_total(adata, target_sum=1e4)

# Logarithmize the data (log(1 + x)).
sc.pp.log1p(adata)

# Missing values: no NaN/Inf handling is applied here. If adata.X can contain
# NaN or Inf, replace them before scaling (for example NaN -> 0 and
# Inf -> the largest finite value in the matrix).

# Scaling: scale each gene to unit variance and zero mean, clipping values at 10.
# This is done after logarithmizing and before further analysis such as PCA.
sc.pp.scale(adata, max_value=10)

print("Preprocessing complete.")
print(adata)

Preprocessing complete.
AnnData object with n_obs × n_vars = 0 × 0
    obs: 'Tissue Coverage Fraction', 'n_genes'
    var: 'hugo_symbol', 'n_cells', 'mean', 'std'
    uns: 'X_spatial_units', 'spatial', 'log1p'
    obsm: 'X_spatial', 'X_spatial_gpr', 'spatial'
    layers: 'spliced', 'spliced_unspliced_sum', 'unspliced'


  return X.mean(axis=axis, dtype=dtype)


In [None]:
# Persist the processed AnnData object to a 'processed_data' folder on Drive.

# Build both paths up front: the target folder sits under the current
# `university_folder_path`, and the .h5ad file sits inside that folder.
output_directory_path = os.path.join(university_folder_path, 'processed_data')
processed_adata_path = os.path.join(output_directory_path, 'processed_expr.h5ad')

# Make sure the target folder exists (no error if it is already there).
os.makedirs(output_directory_path, exist_ok=True)

# Write the AnnData object and report where it went.
adata.write(processed_adata_path)
print(f"Processed AnnData saved to: {processed_adata_path}")

Processed AnnData saved to: /content/drive/My Drive/Ovary/processed_data/processed_expr.h5ad


In [None]:
# Force a fresh mount of Google Drive at /content/drive even if it is already
# mounted. Relies on `drive` having been imported in an earlier cell.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Alternatively, you can save the processed AnnData object to the local Colab environment. You can then download it from the Colab file explorer.

In [None]:
# Define the path to save the processed AnnData file in the local Colab environment
local_processed_adata_path = 'processed_expr.h5ad'

# Save the processed adata object to the local environment
adata.write(local_processed_adata_path)

print(f"Processed AnnData saved to: {local_processed_adata_path}")

Processed AnnData saved to: processed_expr.h5ad
