In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as ss
from retina_to_connectome_funcs import get_synapse_df

In [9]:
from utils import synapses_to_matrix_and_dict


def create_basic_data(side=None):
    """Export the synaptic matrix and its root-id index map under ``adult_data/``.

    The synapse table for ``side`` is obtained from ``get_synapse_df`` and
    converted with ``synapses_to_matrix_and_dict``. Two files are written:
    ``<side>_synaptic_matrix.npz`` and ``<side>_root_id_to_index.csv``; when
    ``side`` is ``None`` the ``<side>_`` prefix is omitted.

    Args:
        side: value forwarded to ``get_synapse_df`` and used as file-name
            prefix (this notebook calls it with "left" and "right").
    """
    prefix = f"{side}_" if side is not None else ""
    matrix, root_id_to_index = synapses_to_matrix_and_dict(get_synapse_df(side))

    # Sparse matrix first, then the root_id -> index lookup as a two-column CSV.
    ss.save_npz(f"adult_data/{prefix}synaptic_matrix.npz", matrix)
    index_table = pd.DataFrame(root_id_to_index.items(), columns=["root_id", "index"])
    index_table.to_csv(f"adult_data/{prefix}root_id_to_index.csv", index=False)

In [10]:
# create_basic_data()  # call without a side is left disabled
# Write the matrix and index map once per side.
for side in ("left", "right"):
    create_basic_data(side)

In [67]:
# Load the raw connection table. The same (pre, post) pair can appear on
# several rows, so the rows of each pair are collapsed into one by summing.
# `numeric_only=True` is spelled out: the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name, so the previous
# `.sum("syn_count")` was only summing the numeric columns by accident.
connections = (
    pd.read_csv("adult_data/connections.csv")
    .groupby(["pre_root_id", "post_root_id"], as_index=False)
    .sum(numeric_only=True)
    # Root IDs are identifiers, not quantities: keep them as strings from here on.
    .astype({"pre_root_id": "string", "post_root_id": "string"})
)

In [116]:
# Classification table: root IDs as strings, restricted to rows whose side is "right".
cl = (
    pd.read_csv("adult_data/classification.csv")
    .astype({"root_id": "string"})
    .loc[lambda frame: frame["side"] == "right"]
)

In [117]:
# Keep a single classification row per root_id. When a root_id has several
# rows, the preferred cell type wins: R8 (1), then R7 (2), then R1-6 (3),
# then any other labelled type (4), and rows without a cell type last (5).
#
# The helper column is added with `assign` on a fresh frame instead of being
# written into `cl` (which is a filtered slice of the original table); this
# avoids pandas' SettingWithCopyWarning and leaves the input frame untouched.
cl = (
    cl.assign(
        priority=lambda frame: np.where(
            frame["cell_type"] == "R8",
            1,
            np.where(
                frame["cell_type"] == "R7",
                2,
                np.where(
                    frame["cell_type"] == "R1-6",
                    3,
                    np.where(frame["cell_type"].notnull(), 4, 5),
                ),
            ),
        )
    )
    # Lowest priority value first within each root_id, so keep="first" picks it.
    .sort_values(by=["root_id", "priority"])
    .drop_duplicates(subset="root_id", keep="first")
    .drop(columns="priority")
)

In [118]:
# Attach the classification row of the pre-synaptic neuron, then of the
# post-synaptic neuron. Columns shared by both joins get the "_pre" / "_post"
# suffixes. Both merges use pandas' default inner join, so a connection
# survives only if both of its endpoints are present in `cl`.
cl_connections = (
    connections
    .merge(cl, left_on="pre_root_id", right_on="root_id")
    .merge(cl, left_on="post_root_id", right_on="root_id", suffixes=("_pre", "_post"))
)

In [119]:
# Distinct root IDs in the classification table ...
unique_cl_ids = set(cl["root_id"])
# ... and on the pre- / post-synaptic side of the merged connection table
unique_pre_ids, unique_post_ids = (
    set(cl_connections[column]) for column in ("pre_root_id", "post_root_id")
)

In [120]:
# Classified neurons that appear on neither side of any merged connection
only_in_cl = unique_cl_ids.difference(unique_pre_ids, unique_post_ids)

In [121]:
# Drop the classification rows of neurons listed in only_in_cl
filtered_cl = cl.loc[~cl["root_id"].isin(only_in_cl)]


In [None]:

# Common identifiers: neurons in 'cl' that are also an endpoint (pre or post)
# of a row in 'cl_connections' (the unique_pre_ids / unique_post_ids sets are
# built from the merged table, not from the raw 'connections' frame).
common_ids = unique_cl_ids.intersection(unique_pre_ids | unique_post_ids)

# Classification rows restricted to those common identifiers
filtered_cl = cl.loc[cl["root_id"].isin(common_ids)]

# Connection rows for which at least one endpoint is a common identifier
filtered_connections = connections.loc[
    connections["pre_root_id"].isin(common_ids)
    | connections["post_root_id"].isin(common_ids)
]

# 'filtered_cl' and 'filtered_connections' now hold only the rows selected above.

In [2]:
import pandas as pd
# Reload the right-side connection and classification tables from disk.
cl_connections, filtered_cl = map(
    pd.read_csv,
    ("adult_data/right_connections.csv", "adult_data/right_classification.csv"),
)

In [3]:
# Distinct IDs on each side of the connection table and in the classification table
pre_ids_set, post_ids_set = (
    set(cl_connections[column].values) for column in ("pre_root_id", "post_root_id")
)
cl_ids_set = set(filtered_cl["root_id"].values)

# Classified neurons that are absent from the connection endpoints
missing_both = cl_ids_set.difference(pre_ids_set, post_ids_set)  # on neither side
missing_either = cl_ids_set.difference(pre_ids_set.intersection(post_ids_set))  # absent from at least one side

# Connection endpoints that have no classification row
missing_pre = pre_ids_set.difference(cl_ids_set)
missing_post = post_ids_set.difference(cl_ids_set)

# Sizes of the two "missing" groups
len_missing_both, len_missing_either = len(missing_both), len(missing_either)

In [139]:
# Map neuron root_ids to matrix indices (index order follows the iteration
# order of cl_ids_set).
root_id_to_index = {root_id: index for index, root_id in enumerate(cl_ids_set)}

# Convert root_ids in cl_connections to matrix indices
pre_indices = cl_connections["pre_root_id"].map(root_id_to_index)
post_indices = cl_connections["post_root_id"].map(root_id_to_index)

# An endpoint that is not a key of root_id_to_index maps to NaN, which would
# turn the index arrays into floats and only fail later inside scipy with an
# unrelated-looking message. Stop here with an explicit error instead.
if pre_indices.isna().any() or post_indices.isna().any():
    raise ValueError(
        "cl_connections contains root_ids that are missing from filtered_cl; "
        "see missing_pre / missing_post"
    )

pre_indices = pre_indices.to_numpy(dtype=np.int64)
post_indices = post_indices.to_numpy(dtype=np.int64)

# Use syn_count as the data for the non-zero elements of the matrix
data = cl_connections["syn_count"].values

# Create the sparse matrix (rows: pre-synaptic index, columns: post-synaptic index)
matrix = ss.coo_matrix(
    (data, (pre_indices, post_indices)),
    shape=(len(cl_ids_set), len(cl_ids_set)),
    dtype=np.int64,
)

In [140]:
# Persist the root_id -> matrix index lookup as a two-column CSV.
pd.DataFrame(
    {
        "root_id": list(root_id_to_index.keys()),
        "index": list(root_id_to_index.values()),
    }
).to_csv("adult_data/right_root_id_to_index.csv", index=False)

In [141]:
# Persist the sparse connectivity matrix next to its index map.
ss.save_npz(file="adult_data/right_synaptic_matrix.npz", matrix=matrix)

In [145]:
# Classification rows whose cell type is R8, R7 or R1-6
visual = filtered_cl.loc[filtered_cl["cell_type"].isin(["R8", "R7", "R1-6"])]

In [172]:
# Coordinate table; root_id is parsed as a string column.
coords = pd.read_csv("adult_data/coordinates_sep.csv", dtype=dict(root_id="string"))

In [174]:
# Keep only the coordinate rows whose 'root_id' is one of the common_ids
filtered_coords = coords.loc[coords["root_id"].isin(common_ids)]

In [175]:
# Display the coordinate rows kept by the root_id filter.
filtered_coords

Unnamed: 0,root_id,x,y,z,supervoxel_id
156,720575940638164864,689560,301620,74920,8.247684e+16
307,720575940630892544,788036,313228,97480,8.416583e+16
313,720575940629327872,723084,188368,139880,8.303787e+16
401,720575940628023296,714996,205072,147960,8.289740e+16
544,720575940630799104,494852,107572,45240,7.909584e+16
...,...,...,...,...,...
238687,720575940631190016,510136,218256,196440,7.937917e+16
238691,720575940631190016,532392,198908,188080,7.980104e+16
238692,720575940631190016,533616,177480,177880,7.980070e+16
238745,720575940640150912,283592,364620,243040,7.551137e+16


In [8]:
from scipy.sparse import load_npz
# Read the previously saved sparse matrix back from disk.
synaptic_matrix = load_npz(file="adult_data/good_synaptic_matrix.npz")

In [10]:
import numpy as np
# log(1 + value) of the explicitly stored entries of the sparse matrix.
np.log1p(synaptic_matrix.data)

array([1.79175947, 1.79175947, 1.79175947, ..., 1.79175947, 3.04452244,
       1.94591015])

In [1]:
import torch
# Report whether PyTorch can use a CUDA device in this environment.
torch.cuda.is_available()

True

In [4]:
import numpy as np
# add parent to sys path
import sys
sys.path.append("..")
from no_training import get_data
from scipy.sparse import coo_matrix

In [11]:
# get_data (imported from no_training) returns five objects; unpack them by position.
(
    connections,
    shuffled_connections,
    all_neurons,
    neuron_data,
    all_coords,
) = get_data()

In [28]:
def construct_filtered_synaptic_matrix(filtered_celltypes):
    """Build a sparse synapse-count matrix that leaves out the given cell types.

    Connections are read from ``adult_data/connections.csv`` and repeated
    (pre, post) pairs are summed. Neurons come from
    ``adult_data/classification_clean.csv`` joined with
    ``adult_data/root_id_to_index.csv``; missing values in that table are
    replaced by "Unknown". Neurons whose ``cell_type`` is in
    ``filtered_celltypes`` are removed, and only connections whose two
    endpoints are both retained end up in the matrix.

    Row/column ``k`` of the result is the ``k``-th retained neuron when the
    retained neurons are ordered by their original ``index_id``. When nothing
    is removed and ``index_id`` is already 0..n-1, this is the original
    numbering.

    Args:
        filtered_celltypes: iterable of ``cell_type`` labels to remove.

    Returns:
        scipy.sparse.coo_matrix: int32 matrix of shape ``(n, n)`` where ``n``
        is the number of retained neuron rows; rows are pre-synaptic indices,
        columns post-synaptic indices, values summed ``syn_count``.

    Raises:
        ValueError: if ``filtered_celltypes`` contains "R8", "R7", "R1-6" or
            any label found in the index of
            ``adult_data/rational_cell_types.csv``.
    """
    forbidden_celltypes = (
        ["R8", "R7", "R1-6"]
        + pd.read_csv("adult_data/rational_cell_types.csv", index_col=0).index.tolist()
    )

    if not set(filtered_celltypes).isdisjoint(forbidden_celltypes):
        raise ValueError(
            f"You can't filter out any of the following cell types: {forbidden_celltypes}"
        )

    # Sum repeated (pre, post) pairs. Only syn_count is needed below, so it is
    # selected explicitly (GroupBy.sum takes `numeric_only` as its first
    # positional argument, not a column name).
    connections = (
        pd.read_csv(
            "adult_data/connections.csv",
            dtype={
                "pre_root_id": "string",
                "post_root_id": "string",
                "syn_count": np.int32,
            },
        )
        .groupby(["pre_root_id", "post_root_id"], as_index=False)["syn_count"]
        .sum()
    )

    # Root IDs are parsed as strings everywhere so that the merge keys below
    # have the same dtype as pre_root_id / post_root_id; merging a string key
    # against an integer key either raises or silently matches nothing.
    root_id_dtype = {"root_id": "string"}
    right_root_ids = pd.read_csv("adult_data/root_id_to_index.csv", dtype=root_id_dtype)
    all_neurons = (
        pd.read_csv("adult_data/classification_clean.csv", dtype=root_id_dtype)
        .merge(right_root_ids, on="root_id")
        .fillna("Unknown")
    )

    # Remove neurons in the filtered celltypes
    all_neurons = all_neurons[~all_neurons["cell_type"].isin(filtered_celltypes)]

    # Removing neurons leaves gaps in index_id, while the matrix below is
    # sized by the number of retained neurons. Renumber the retained neurons
    # 0..n-1 (preserving their original order) so every index fits the shape.
    compact_index = pd.factorize(all_neurons["index_id"], sort=True)[0]
    neuron_index = all_neurons[["root_id"]].assign(index_id=compact_index)

    # Inner joins: a connection is kept only if both endpoints are retained.
    ix_conns = connections.merge(
        neuron_index, left_on="pre_root_id", right_on="root_id"
    ).merge(
        neuron_index, left_on="post_root_id", right_on="root_id", suffixes=("_pre", "_post")
    )

    n_neurons = len(all_neurons)
    return coo_matrix(
        (
            ix_conns["syn_count"].to_numpy(),
            (ix_conns["index_id_pre"].to_numpy(), ix_conns["index_id_post"].to_numpy()),
        ),
        shape=(n_neurons, n_neurons),
        dtype=np.int32,
    )

In [29]:
# NOTE(review): "R8", "R7" and "R1-6" are in the forbidden list of
# construct_filtered_synaptic_matrix as defined in this notebook, so with that
# definition this call raises ValueError. The shape recorded after this cell
# presumably comes from an earlier version of the function — confirm.
temp_matrix = construct_filtered_synaptic_matrix(["R8", "R7", "R1-6"])

In [30]:
# Dimensions (rows, columns) of temp_matrix.
temp_matrix.shape

(126649, 126649)

In [24]:
# NOTE(review): in the visible cells `ix_conns` is only assigned inside
# construct_filtered_synaptic_matrix, so this cell relies on a variable left in
# the kernel by an earlier run — confirm before re-running. The recorded
# result is a quoted string, so the maximum is presumably taken over string
# values rather than numbers.
ix_conns["root_id_pre"].max()

'720575940661337217'

In [6]:
import pandas as pd
# root_id -> index lookup table
right_root_ids = pd.read_csv("adult_data/root_id_to_index.csv")

# Cleaned classification joined with the lookup on root_id; any missing value
# in the joined table is replaced by the label "Unknown".
all_neurons = pd.merge(
    pd.read_csv("adult_data/classification_clean.csv"),
    right_root_ids,
    on="root_id",
).fillna("Unknown")

In [9]:
# Column dtypes and non-null counts of the joined neuron table.
all_neurons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134191 entries, 0 to 134190
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   root_id    134191 non-null  int64 
 1   cell_type  134191 non-null  object
 2   index_id   134191 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.1+ MB


In [12]:
# Column dtypes and memory footprint of the connections table.
connections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2701601 entries, 0 to 2701600
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   pre_root_id   string
 1   post_root_id  string
 2   syn_count     int32 
 3   weight        int64 
dtypes: int32(1), int64(1), string(2)
memory usage: 72.1 MB
