In [2]:
import pandas as pd
import scipy.sparse as ss

from utils import synapses_to_matrix_and_dict

# Synaptic matrix and root ID-to-index map

In [8]:
# Load the synapse table, keeping only the pre/post partner IDs and the
# connection score, under the column names used in the rest of this notebook.
# The mapping below drives both the column selection and the rename.
synapse_columns = {
    "pre_pt_root_id": "pre_root_id",
    "post_pt_root_id": "post_root_id",
    "connection_score": "syn_count",
}
df = pd.read_feather(
    "new_data/flywire_synapses_783.feather", columns=list(synapse_columns)
).rename(columns=synapse_columns)

# NOTE(review): `connection_score` is relabelled `syn_count` and cast to int;
# if the score is fractional the cast truncates it — confirm this is intended.
df["syn_count"] = df["syn_count"].astype(int)

In [19]:
# Convert the connection table into a matrix plus a lookup dict using the
# project helper from `utils`.
# NOTE(review): `mm` is presumably a SciPy sparse matrix (it is later written
# with `ss.save_npz`) and `synapse_dict` presumably maps root_id -> matrix
# index (it is later saved with those column names) — confirm against `utils`.
mm, synapse_dict = synapses_to_matrix_and_dict(df)

In [22]:
# Persist the matrix built from the raw (ungrouped) synapse table.
ss.save_npz("new_data/new_synaptic_matrix.npz", mm)

# Persist the companion lookup as a two-column CSV (root_id, index) so the
# matrix rows/columns can be mapped back to neuron IDs.
(
    pd.DataFrame(synapse_dict.items(), columns=["root_id", "index"])
    .to_csv("new_data/new_root_id_to_index.csv", index=False)
)

## Grouping connections

In [None]:
# Collapse repeated (pre_root_id, post_root_id) pairs into a single row whose
# syn_count is the sum over the repeats. `as_index=False` keeps the two key
# columns as regular columns, so no reset_index() is needed afterwards.
df = df.groupby(["pre_root_id", "post_root_id"], as_index=False).sum()

In [12]:
# Rebuild the matrix and lookup dict from `df` (grouped by the previous cell
# when the notebook is run top to bottom), then write both to disk.
mm, synapse_dict = synapses_to_matrix_and_dict(df)

ss.save_npz("new_data/new_grouped_synaptic_matrix.npz", mm)
(
    pd.DataFrame(synapse_dict.items(), columns=["root_id", "index"])
    .to_csv("new_data/new_grouped_root_id_to_index.csv", index=False)
)

## Only proofread connections

In [3]:
# Load the proofread connection table (partner IDs + syn_count), rename the
# ID columns to the notebook's naming, and sum syn_count over repeated
# (pre_root_id, post_root_id) pairs so each pair appears once.
pc = (
    pd.read_feather(
        "new_data/proofread_connections_783.feather",
        columns=["pre_pt_root_id", "post_pt_root_id", "syn_count"],
    )
    .rename(
        columns={
            "pre_pt_root_id": "pre_root_id",
            "post_pt_root_id": "post_root_id",
        }
    )
    .groupby(["pre_root_id", "post_root_id"], as_index=False)
    .sum()
)

In [5]:
# Export the aggregated proofread connections (pre_root_id, post_root_id,
# syn_count) as CSV; index=False omits the row index from the file.
pc.to_csv("new_data/connections.csv", index=False)

In [16]:

# Build the matrix and lookup dict from the aggregated proofread connections
# and write both to disk.
mm, synapse_dict = synapses_to_matrix_and_dict(pc)

ss.save_npz("new_data/proofread_synaptic_matrix.npz", mm)
(
    pd.DataFrame(synapse_dict.items(), columns=["root_id", "index"])
    .to_csv("new_data/proofread_root_id_to_index.csv", index=False)
)

# Refined proofread synaptic matrix

In [5]:
import numpy as np

# Load the full proofread connection table, including the neurotransmitter
# columns used below.
rpc = pd.read_feather("new_data/proofread_connections_783.feather")

# Sign convention, following https://www.nature.com/articles/s41586-024-07763-9:
#   "We assume GABAergic and glutamatergic neurons are inhibitory"
# and the rest are excitatory. A connection is treated as inhibitory when
# gaba_avg + glut_avg exceeds 0.5; its syn_count is then negated, otherwise
# syn_count is left unchanged.
is_inhibitory = (rpc["gaba_avg"] + rpc["glut_avg"]) > 0.5
rpc["syn_count"] = np.where(is_inhibitory, -1 * rpc["syn_count"], rpc["syn_count"])

In [7]:
# Reduce the signed table to the partner IDs and syn_count, rename the ID
# columns, and sum the signed syn_count over repeated (pre, post) pairs.
crpc = (
    rpc.loc[:, ["post_pt_root_id", "pre_pt_root_id", "syn_count"]]
    .rename(
        columns={
            "pre_pt_root_id": "pre_root_id",
            "post_pt_root_id": "post_root_id",
        }
    )
    .groupby(["pre_root_id", "post_root_id"], as_index=False)
    .sum()
)

In [9]:
# Export the aggregated, sign-adjusted connections as CSV; index=False omits
# the row index from the file.
crpc.to_csv("new_data/connections_refined.csv", index=False)

In [10]:
# Build the matrix and lookup dict from the sign-adjusted proofread
# connections and write both to disk.
mm, synapse_dict = synapses_to_matrix_and_dict(crpc)

ss.save_npz("new_data/proofread_refined_synaptic_matrix.npz", mm)
(
    pd.DataFrame(synapse_dict.items(), columns=["root_id", "index"])
    .to_csv("new_data/proofread_refined_root_id_to_index.csv", index=False)
)

# Use hemibrain type when cell type is missing

In [None]:
# Cell types flagged in adult_data/rational_cell_types.csv; used below to
# mark matching neurons.
rational_cell_types = pd.read_csv("adult_data/rational_cell_types.csv")
rational_labels = rational_cell_types["cell_type"].values.tolist()

# Load the neuron annotations and derive three columns, in this order
# (each step sees the result of the previous one):
#   1. cell_type       - missing values are filled from hemibrain_type
#   2. decision_making - 1 if the (filled) cell_type is in the list above, else 0
#   3. root_id         - cast to pandas "string" dtype
all_neurons = pd.read_table("new_data/neuron_annotations.tsv").assign(
    cell_type=lambda t: t["cell_type"].fillna(t["hemibrain_type"]),
    decision_making=lambda t: np.where(t["cell_type"].isin(rational_labels), 1, 0),
    root_id=lambda t: t["root_id"].astype("string"),
)

# Independent copy of the ID and position columns.
all_coords = all_neurons.loc[:, ["root_id", "pos_x", "pos_y", "pos_z"]].copy()

In [None]:
# Load the classification table, keeping only the columns needed below.
# root_id is read as pandas "string" dtype so it can be joined with other
# tables keyed on a string root_id.
# The path is a plain string, like every other path in this notebook; the
# previous os.path.join call relied on `os`, which is never imported here.
classification = pd.read_csv(
    "new_data/classification.csv",
    usecols=["root_id", "cell_type", "side"],
    dtype={"root_id": "string"},
)

# Raw neuron annotations. This line was indented, which raises an
# IndentationError at cell top level.
# NOTE(review): `nc` is not referenced in the visible cells; kept in case
# later cells rely on it.
nc = pd.read_table("new_data/neuron_annotations.tsv")


In [None]:
# Fill missing cell_type values in `classification` with the cell_type from
# `all_neurons`. The two tables do not have the same number of rows, so the
# values are matched by root_id with a left merge (every classification row
# is kept), the gaps are filled, and the temporary column is dropped again.
# NOTE(review): a left merge repeats a classification row for each matching
# row in `all_neurons` — presumably root_id is unique there; verify.
classification = (
    classification
    .merge(
        all_neurons[["root_id", "cell_type"]].rename(
            columns={"cell_type": "cell_type_clean"}
        ),
        on="root_id",
        how="left",
    )
    .assign(cell_type=lambda t: t["cell_type"].fillna(t["cell_type_clean"]))
    .drop(columns=["cell_type_clean"])
)

In [None]:
# Write the table back to new_data/classification.csv without the row index.
# This is the same path the table was loaded from, so the source file is
# overwritten.
classification.to_csv("new_data/classification.csv", index=False)

cell_type
R1-6       7932
Dm3        2545
T2a        1781
Tm3        1746
T4c        1692
           ... 
DNge154       1
CB2716        1
CB3693        1
DNp72         1
CB3428        1
Name: count, Length: 5634, dtype: int64

In [11]:
# Display the connection table as the cell output (pandas renders a
# truncated preview of the first and last rows).
df

Unnamed: 0,pre_root_id,post_root_id,syn_count
0,720575940379281722,720575940379283258,42
1,720575940379283482,720575940379282970,30
2,720575940379283719,720575940609623376,266
3,720575940379284367,720575940601751816,151
4,720575940379284367,720575940614275070,550
...,...,...,...
76460809,720575940661339009,720575940629746038,61
76460810,720575940661339777,720575940427732749,18
76460811,720575940661339777,720575940433957960,126
76460812,720575940661339777,720575940616982614,391
