-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
120 lines (83 loc) · 3.78 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas
import numpy as np
from ruffus import *
import sqlite3
import cPickle as pickle
import sklearn.metrics
# Exclusive upper bound on the area of a single contact row: the contact
# queries in this file only keep rows with area < MAX_CONTACT_AREA.
MAX_CONTACT_AREA = 5.0
# Take in the connectivity matrix and:
#   1. randomize the ordering of the cells
#   2. select the N cells we're going to use and take that data frame
#   3. return 3 distance matrices
#   4. return the connectivity matrix with real areas, to use as a
#      subsequent threshold
# If we are going to say "above size X is a synapse" then we need to be
# consistent with it.
# Create the Python structures we want / need.
def create_adj_mat(con,
                   area_contact_min,
                   cell_data):
    """
    Build the cell-by-cell matrix of summed contact areas.

    Rows of the `contacts` table whose area lies strictly between
    area_contact_min and MAX_CONTACT_AREA are grouped by (from_id, to_id)
    and their areas are summed.  Each sum is stored at [i, j], where i and j
    are the positions of from_id and to_id in cell_data.index; pairs that
    mention a cell absent from cell_data are skipped.

    con : DB connection that holds the `contacts` table
    area_contact_min : exclusive lower bound on the area of a single contact
    cell_data : DataFrame indexed by cell id; its row order defines the
        row/column order of the matrix

    Returns a (len(cell_data), len(cell_data)) float32 array.  Only the
    direction stored in the table is filled in; the matrix is not
    symmetrized here.
    """
    # read_frame was deprecated and then removed from pandas; read_sql is the
    # supported entry point for running a query into a DataFrame.
    query = ("select from_id, to_id, area, sum(area) as contact_area, "
             "count(area) as contact_count from contacts "
             "where area < %f and area > %f group by from_id, to_id"
             % (MAX_CONTACT_AREA, area_contact_min))
    df = pandas.io.sql.read_sql(query, con)

    cell_n = len(cell_data)
    id_to_pos = {cell_id: pos
                 for pos, cell_id in enumerate(cell_data.index.values)}
    area_mat = np.zeros((cell_n, cell_n), dtype=np.float32)

    # Walk only the columns that are needed instead of building a Series for
    # every row with iterrows().
    for from_id, to_id, contact_area in zip(df['from_id'],
                                            df['to_id'],
                                            df['contact_area']):
        row = id_to_pos.get(from_id)
        col = id_to_pos.get(to_id)
        if row is not None and col is not None:
            area_mat[row, col] = contact_area
    return area_mat
def create_contact_x_lists(con, area_thold_min):
    """
    For each cell, collect the x values of every contact it takes part in.

    Contacts are read from the `contacts` table, keeping rows whose area lies
    strictly between area_thold_min and MAX_CONTACT_AREA.  Every kept contact
    is attributed to both of its cells (from_id and to_id), and the `x`
    values are then gathered per cell.

    con : DB connection that holds the `contacts` table
    area_thold_min : exclusive lower bound on the area of a single contact

    Returns the per-cell result of the group-by, indexed by cell_id, with a
    'contact_x_list' entry holding that cell's x values as a list.

    Raises AssertionError if two different cells are stored in both
    orientations, i.e. both (A, B) and (B, A) appear in the table.
    """
    # read_frame was deprecated and then removed from pandas; read_sql is the
    # supported entry point for running a query into a DataFrame.
    contacts_df = pandas.io.sql.read_sql(
        "select * from contacts where area < %f and area > %f"
        % (MAX_CONTACT_AREA, area_thold_min),
        con, index_col='id')

    # Sanity check: there must be only ONE way of representing "cell A
    # contacts cell B".  The reversed pair is looked up for every row, so a
    # table that stores (A, B) and (B, A) once each is caught too.  Raised
    # explicitly so the check survives `python -O`.
    seen_pairs = set()
    for from_id, to_id in zip(contacts_df['from_id'], contacts_df['to_id']):
        if from_id != to_id and (to_id, from_id) in seen_pairs:
            raise AssertionError(
                "contacts between cells %r and %r are stored in both "
                "directions" % (from_id, to_id))
        seen_pairs.add((from_id, to_id))

    # Attribute every contact to both of its endpoints.
    as_source = contacts_df.copy()
    as_source['cell_id'] = contacts_df['from_id']
    as_target = contacts_df.copy()
    as_target['cell_id'] = contacts_df['to_id']
    both = pandas.concat([as_source, as_target])

    # Stable sort by from_id: each cell's x values end up ordered by the
    # contact's from_id, with ties kept in table order.
    order = np.argsort(both['from_id'].values, kind='mergesort')
    contacts_df_sym = both.iloc[order]

    def feature_extract(group):
        # one entry per cell: the list of x values of its contacts
        return pandas.Series({'contact_x_list': group['x'].tolist()})

    return contacts_df_sym.groupby('cell_id').apply(feature_extract)
def create_data(con, area_thold_min):
    """
    Assemble the per-cell table, connectivity matrix and distance matrices.

    con : DB connection holding the `cells`, `somapositions`, `types` and
        `contacts` tables
    area_thold_min : exclusive lower bound on the area of a single contact,
        passed on to create_contact_x_lists and create_adj_mat

    Returns a tuple (cells, conn_mat, dist_mats):
      cells     -- DataFrame indexed by cell_id, in random row order, with
                   the soma position (x, y, z), type_id, the coarse type and
                   the 'contact_x_list' joined in from
                   create_contact_x_lists
      conn_mat  -- uint8 matrix in the row order of `cells`; an entry is 1
                   when the pair has a positive summed contact area in
                   either direction
      dist_mats -- dict with keys 'x', 'y' and 'z', each the pairwise
                   distance matrix of that single coordinate

    Only cells whose soma x position is not null are included.  The row
    order comes from np.random.permutation, so it depends on numpy's global
    random state.
    """
    # read_frame was deprecated and then removed from pandas; read_sql is the
    # supported entry point for running a query into a DataFrame.
    cells = pandas.io.sql.read_sql(
        "select c.cell_id, s.x, s.z, s.y, c.type_id, t.coarse "
        "from cells as c "
        "join somapositions as s on c.cell_id = s.cell_id "
        "join types as t on c.type_id = t.type_id "
        "where s.x is not null",
        con, index_col='cell_id')

    # randomize the ordering of the cells
    cells = cells.reindex(np.random.permutation(cells.index))

    contact_x_lists = create_contact_x_lists(con, area_thold_min)
    cells = cells.join(contact_x_lists)

    # create_adj_mat lays the matrix out in the row order of `cells`, so it
    # lines up with the shuffled frame.
    adj_mat = create_adj_mat(con, area_thold_min, cells) > 0

    # A pair is connected when a contact exists in either direction.
    conn_mat = (adj_mat | adj_mat.T).astype(np.uint8)

    # distance matrices, one per coordinate
    dist_mats = {}
    for dim in ['x', 'y', 'z']:
        # pairwise_distances expects a 2-D (n_samples, n_features) array
        coords = np.array(cells[dim]).reshape(-1, 1)
        dist_mats[dim] = sklearn.metrics.pairwise.pairwise_distances(coords)

    return cells, conn_mat, dist_mats
if __name__ == "__main__":
    # Script entry point: run the preprocessing once against the mouse
    # retina database with a minimum contact area of 0.5.
    db = "../preprocess/mouseretina/mouseretina.db"
    conn = sqlite3.connect(db)
    try:
        create_data(conn, 0.5)
    finally:
        # release the database handle even if preprocessing fails
        conn.close()