In [1]:
import sys
import json
import os
import pandas as pd
import anndata
import time
import numpy as np
from scipy import stats

## Find missing files

In [2]:
import os 

download_base = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/"

# Directory to scan; enable exactly one of the lines below.
#directory_path = f"{download_base}metadata/averages/20230830/subclass_mean/"
directory_path = f"{download_base}metadata/averages/20230830/subclass_median/"
#directory_path = f"{download_base}metadata/averages/20230830/supertype_median/"
#directory_path = f"{download_base}metadata/averages/20230830/supertype_mean/"


# Expected (inclusive) range of file numbers
start_number = 1
end_number = 338  # Adjust the range based on your subclass sequence
#end_number = 1201  # Adjust the range based on your supertype sequence


def leading_file_number(filename):
    """Return the integer formed by the leading run of ASCII digits in `filename`.

    The whole digit run is used rather than a fixed 3-character slice, so
    4-digit prefixes (e.g. "0082 ...") are parsed as 82 instead of being
    truncated to 8. Returns None when the name does not start with a digit.
    """
    digits = ""
    for char in filename:
        if char not in "0123456789":
            break
        digits += char
    return int(digits) if digits else None


# Numbers for which at least one file is present in the directory
file_exists = set()

for filename in os.listdir(directory_path):
    file_number = leading_file_number(filename)
    if file_number is not None:
        file_exists.add(file_number)

# Find and print the missing file numbers
missing_files = set(range(start_number, end_number + 1)) - file_exists
# Note: this is the count of distinct leading numbers found, not of directory entries
print("Total number of files: ", len(file_exists))
print("Missing file numbers:", sorted(missing_files))

Total number of files:  338
Missing file numbers: []


## Estimate calculation complexity

Check the size of the matrix

In [5]:
download_base = "/gpfs/bbp.cscs.ch/data/project/proj84/csaba/aibs_10x_mouse_wholebrain/"

# Read the 20230830 release manifest (a local JSON file despite the name `url`)
url = download_base + '/releases/20230830/manifest.json'
with open(url) as manifest_file:
    manifest = json.load(manifest_file)

# Keep only the metadata listing of the WMB-10X entry
metadata = manifest['file_listing']['WMB-10X']['metadata']

In [6]:
%%time
# Locate the annotated cell-metadata CSV through the manifest
rpath = metadata['cell_metadata_with_cluster_annotation']['files']['csv']['relative_path']
file = os.path.join(download_base, rpath)

# Parse `neurotransmitter` as text and key the table by cell label
cell = (
    pd.read_csv(file, dtype={"neurotransmitter": str})
    .set_index('cell_label')
)

CPU times: user 28.2 s, sys: 2.09 s, total: 30.3 s
Wall time: 30.6 s


In [7]:
# Non-null `library_label` entries per (dataset, feature matrix) pair
(
    cell
    .groupby(['dataset_label', 'feature_matrix_label'])[['library_label']]
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,library_label
dataset_label,feature_matrix_label,Unnamed: 2_level_1
WMB-10XMulti,WMB-10XMulti,1687
WMB-10Xv2,WMB-10Xv2-CTXsp,43985
WMB-10Xv2,WMB-10Xv2-HPF,207281
WMB-10Xv2,WMB-10Xv2-HY,99879
WMB-10Xv2,WMB-10Xv2-Isocortex-1,248776
WMB-10Xv2,WMB-10Xv2-Isocortex-2,249360
WMB-10Xv2,WMB-10Xv2-Isocortex-3,249356
WMB-10Xv2,WMB-10Xv2-Isocortex-4,248784
WMB-10Xv2,WMB-10Xv2-MB,29781
WMB-10Xv2,WMB-10Xv2-OLF,192182


In [8]:
# Same per-(dataset, feature matrix) count as above, kept in a one-column
# table named `cell_count`
matrices = (
    cell
    .groupby(['dataset_label', 'feature_matrix_label'])[['library_label']]
    .count()
    .rename(columns={'library_label': 'cell_count'})
)

In [9]:
# Sorted unique labels of the taxonomy level being analysed.
# The active line uses the supertype level; the alternatives are kept for switching.
#clusters = np.unique(cell['subclass'])
#clusters = np.unique(cell.cluster)
clusters = np.unique(cell.loc[:, 'supertype'])

In [14]:
# Full gene table: resolve its CSV through the manifest, read `comment` as
# text and index the rows by gene identifier
rpath = metadata['gene']['files']['csv']['relative_path']
file = os.path.join(download_base, rpath)
genes = (
    pd.read_csv(file, dtype={"comment": str})
    .set_index('gene_identifier')
)

In [15]:
# Drop the per-cell metadata columns that are not needed for the size estimate.
columns_to_remove = ['cell_barcode', 'barcoded_cell_sample_label', 'library_label',
       'feature_matrix_label', 'entity', 'brain_section_label',
       'library_method', 'region_of_interest_acronym', 'donor_label',
       'donor_genotype', 'donor_sex', 'dataset_label', 'x', 'y',
       'cluster_alias', 'neurotransmitter',
       'cluster', 'neurotransmitter_color', 'class_color', 'subclass_color',
       'supertype_color', 'cluster_color', 'region_of_interest_order',
       'region_of_interest_color']
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
# NOTE: this also silences typos in the list above, so check it when editing.
cell = cell.drop(columns=columns_to_remove, errors='ignore')

In [17]:
# Show the first row to confirm which columns remain
cell.head(n=1)

Unnamed: 0_level_0,class,subclass,supertype
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GCGAGAAGTTAAGGGC-410_B05,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3


In [18]:
# Count the cells of every supertype in one pass. The previous loop built a
# boolean mask over the whole table once per cluster, repeating a full scan
# for each of the labels in `clusters`.
supertype_sizes = cell['supertype'].value_counts()

# [label, size] pairs in the order of `clusters`; a label that does not occur
# in the table gets size 0, exactly as the per-cluster filter would give.
items_list = [
    [cluster, int(supertype_sizes.get(cluster, 0))]
    for cluster in clusters
]

# Create a DataFrame from the list of items
df = pd.DataFrame(items_list, columns=['Cluster', 'Size'])



In [19]:
# Order the clusters from largest to smallest
df = df.sort_values('Size', ascending=False)


In [20]:
# NOTE: the code seems to break if Size is larger than 122605
df.head(n=20)

Unnamed: 0,Cluster,Size
1183,1184 MOL NN_4,395554
1162,1163 Astro-TE NN_3,146806
1159,1160 Astro-NT NN_2,139205
29,0030 L2/3 IT CTX Glut_2,128897
1178,1179 OPC NN_1,121667
1154,1155 CB Granule Glut_2,116129
27,0028 L4/5 IT CTX Glut_6,90911
1192,1193 Endo NN_1,88011
1193,1194 Microglia NN_1,86232
22,0023 L4/5 IT CTX Glut_1,85830


## Generate list of numbers

In [14]:
# Emit every tenth index (9, 19, 29, ...) below len(clusters), each followed
# by a single space, on one line
print("".join(f"{idx} " for idx in range(9, len(clusters), 10)), end="")

9 19 29 39 49 59 69 79 89 99 109 119 129 139 149 159 169 179 189 199 209 219 229 239 249 259 269 279 289 299 309 319 329 339 349 359 369 379 389 399 409 419 429 439 449 459 469 479 489 499 509 519 529 539 549 559 569 579 589 599 609 619 629 639 649 659 669 679 689 699 709 719 729 739 749 759 769 779 789 799 809 819 829 839 849 859 869 879 889 899 909 919 929 939 949 959 969 979 989 999 1009 1019 1029 1039 1049 1059 1069 1079 1089 1099 1109 1119 1129 1139 1149 1159 1169 1179 1189 1199 

In [11]:
# Number of unique labels (`clusters` is the 1-D array returned by np.unique)
clusters.size

1201