# collect cell matching annotation results
On Friday, June 25, a group of 8 of us worked through an exercise where we evaluated the cell matching for all ROIs associated with ~15 cell_specimen_ids. 

The fundamental question was whether or not cells that were labeled as 'invalid' by the classifier were more any more likely to be unmatched.  Every roi was assigned to 2 users to allow us to also evaluate how often users disagreed and to focus analysis on cells for which both users agreed.

This notebook collects and summarizes results.

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## get sample cell dataframe

In [3]:
sample_cell_df_path = '/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/qc_plots/single_cell_plots/cell_matching_qc/sample_cell_df.csv'
sample_cell_df = pd.read_csv(sample_cell_df_path)
print(len(sample_cell_df))
users = sample_cell_df['user1'].unique()
sample_cell_df.sample(5)

341


Unnamed: 0,cell_specimen_id,user1,user2,cell_roi_id,ophys_experiment_id,x,y,width,height,valid_roi,...,behavior_session_id,ophys_container_id,project_code,imaging_depth,targeted_structure,date_of_acquisition,session_type,file_id,has_dff,valid_and_has_dff
130,1086626585,sean,pete,1080838078,881949066,292,131,19,14,True,...,881278000,1018028370,VisualBehaviorMultiscope,225,VISp,2019-06-05 09:12:12.735423,OPHYS_2_images_A_passive,1085673745,True,True
39,1086538621,alex,kate,1080883381,795952471,180,313,26,20,True,...,795431009,814796612,VisualBehavior,375,VISp,2018-12-14 16:14:12.000000,OPHYS_5_images_B_passive,859685777,True,True
63,1086595949,alex,kate,1080828895,886585123,129,236,18,18,True,...,886440437,1018027549,VisualBehaviorMultiscope,227,VISp,2019-06-13 12:21:05.000000,OPHYS_2_images_A_passive,1085674042,True,True
325,1086573756,doug,pete,1080695811,982343738,275,41,13,15,False,...,981954859,1018028212,VisualBehaviorMultiscope,158,VISl,2019-11-13 13:35:56.285576,OPHYS_6_images_B,1085394216,True,False
107,1086601353,doug,pete,1080884766,875045489,62,76,19,16,False,...,874533721,863992815,VisualBehavior,175,VISp,2019-05-23 17:21:26.000000,OPHYS_2_images_A_passive,875773963,True,False


## get individual users annotations

In [4]:
links = {
    'alex': 'https://www.dropbox.com/s/v2ddqiw60smfert/cell_matching_qc_inventory_alex.csv?dl=1',
    'doug': 'https://www.dropbox.com/s/cy9103pdnob0nqa/cell_matching_qc_inventory_doug.csv?dl=1',
    'farzaneh': 'https://www.dropbox.com/s/05930t8w8vvfwyw/cell_matching_qc_inventory_farzaneh.csv?dl=1',
    'iryna': 'https://www.dropbox.com/s/majf4cpkciraouw/cell_matching_qc_inventory_iryna.csv?dl=1',
    'kate': 'https://www.dropbox.com/s/cjvaabq9pn9c6uo/cell_matching_qc_inventory_kate.csv?dl=1',
    'marina': 'https://www.dropbox.com/s/eaobtflsco3swer/cell_matching_qc_inventory_marina.csv?dl=1',
    'pete': 'https://www.dropbox.com/s/sby1euajfvpaziy/cell_matching_qc_inventory_pete.csv?dl=1',
    'sean': 'https://www.dropbox.com/s/cdj6p049nmhwj3a/cell_matching_qc_inventory_sean.csv?dl=1',
}

In [5]:
path = '/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/qc_plots/single_cell_plots/cell_matching_qc'
inventories = []
cols = ['cell_specimen_id','ophys_experiment_id','cre_line','matched','user']
for user in links.keys():
#     df = pd.read_csv(os.path.join(path, user, 'cell_matching_qc_inventory_{}.csv'.format(user)))
    df = pd.read_csv(links[user])
    df['user'] = user
    inventories.append(df[cols])

## concatenate all inventories

In [6]:
inventory = pd.concat(inventories)
inventory

Unnamed: 0,cell_specimen_id,ophys_experiment_id,cre_line,matched,user
0,1086493808,826587940,Vip-IRES-Cre,True,alex
1,1086493808,830093338,Vip-IRES-Cre,True,alex
2,1086493808,833631914,Vip-IRES-Cre,True,alex
3,1086497277,993862620,Sst-IRES-Cre,True,alex
4,1086497277,994053903,Sst-IRES-Cre,True,alex
...,...,...,...,...,...
74,1086626585,891108758,Vip-IRES-Cre,True,sean
75,1086671126,949637002,Slc17a7-IRES2-Cre,True,sean
76,1086671126,950852107,Slc17a7-IRES2-Cre,True,sean
77,1086671126,956820482,Slc17a7-IRES2-Cre,True,sean


## get roi ID for each cell/experiment

In [7]:
def get_roi_id(row):
    cell_specimen_id = row['cell_specimen_id']
    ophys_experiment_id = row['ophys_experiment_id']

    return sample_cell_df.query('cell_specimen_id == @cell_specimen_id and ophys_experiment_id == @ophys_experiment_id')['cell_roi_id'].iloc[0]

inventory['cell_roi_id'] = inventory.apply(get_roi_id, axis=1)

## pivot so that every roi is a row and every user is a column

In [8]:
roi_inventory = inventory.pivot(index = 'cell_roi_id',columns='user',values='matched')
roi_inventory

user,alex,doug,farzaneh,iryna,kate,marina,pete,sean
cell_roi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1080632005,True,,,,,,,True
1080632041,,,,True,,,True,
1080634115,True,,,,,,,True
1080634144,,,,True,,,True,
1080636244,True,,,,,,,True
...,...,...,...,...,...,...,...,...
1081192984,,True,,,,,True,
1081960834,True,,,,,,,True
1081960837,,,,True,,,True,
1082436277,,True,,,,,,True


## get match values and number of matches for each roi. We expect 2 values (True or False) for each row

In [9]:
def get_match_values(row):
    return [v for v in row if pd.notnull(v)]

def number_of_matches(row):
    return len([v for v in row if pd.notnull(v)])

roi_inventory['match_values'] = roi_inventory[users].apply(get_match_values, axis = 1)
roi_inventory['number_of_matches'] = roi_inventory[users].apply(number_of_matches, axis = 1)

roi_inventory

user,alex,doug,farzaneh,iryna,kate,marina,pete,sean,match_values,number_of_matches
cell_roi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1080632005,True,,,,,,,True,"[True, True]",2
1080632041,,,,True,,,True,,"[True, True]",2
1080634115,True,,,,,,,True,"[True, True]",2
1080634144,,,,True,,,True,,"[True, True]",2
1080636244,True,,,,,,,True,"[True, True]",2
...,...,...,...,...,...,...,...,...,...,...
1081192984,,True,,,,,True,,"[True, True]",2
1081960834,True,,,,,,,True,"[True, True]",2
1081960837,,,,True,,,True,,"[True, True]",2
1082436277,,True,,,,,,True,"[True, True]",2


## what is the distrubution of `number of matches`
there are 4 with only 1. why? all should be 2. 

In [10]:
roi_inventory['number_of_matches'].value_counts()

2    331
1      4
Name: number_of_matches, dtype: int64

## of those with 2 match annotations, see if any have disagreement
7 have disagreement

In [11]:
cells_with_annotations = roi_inventory.query('number_of_matches == 2').copy()
cells_with_annotations['agreement'] = cells_with_annotations['match_values'].map(lambda l:l[0] == l[1])
cells_with_annotations['agreement'].value_counts()

True     324
False      7
Name: agreement, dtype: int64

## merge in the `valid_roi` label

In [12]:
cells_with_annotations = cells_with_annotations.merge(
    sample_cell_df[['cell_roi_id','valid_roi']],
    on = 'cell_roi_id'
)

In [13]:
cells_with_annotations['matched'] = cells_with_annotations['match_values'].map(lambda l:np.all(l))
cells_with_annotations['matched'].value_counts()

True     316
False     15
Name: matched, dtype: int64

## of those with `matched == False`, what is the distribution of  `valid_roi`?

In [14]:
cells_with_annotations.query('agreement and matched == False')['valid_roi']

225    False
239    False
242     True
243    False
289    False
290    False
313    False
320     True
Name: valid_roi, dtype: bool

In [15]:
cells_with_annotations.query('agreement == True and matched == False')

Unnamed: 0,cell_roi_id,alex,doug,farzaneh,iryna,kate,marina,pete,sean,match_values,number_of_matches,agreement,valid_roi,matched
225,1080800071,,False,,,,,,False,"[False, False]",2,True,False,False
239,1080822338,,False,,,,,,False,"[False, False]",2,True,False,False
242,1080824461,,False,,,,,,False,"[False, False]",2,True,True,False
243,1080824465,,False,,,,,,False,"[False, False]",2,True,False,False
289,1080846087,,False,,,,,False,,"[False, False]",2,True,False,False
290,1080846909,False,,,,,,,False,"[False, False]",2,True,False,False
313,1080884766,,False,,,,,False,,"[False, False]",2,True,False,False
320,1080912306,,False,,,,,False,,"[False, False]",2,True,True,False


## These are the cells with disagreeement. It might be worth looking more closely at these, but that there are so many makes it clear that these decisions are not straightforward.

In [16]:
cells_with_annotations.query('agreement == False')

Unnamed: 0,cell_roi_id,alex,doug,farzaneh,iryna,kate,marina,pete,sean,match_values,number_of_matches,agreement,valid_roi,matched
81,1080712558,True,,False,,,,,,"[False, True]",2,False,True,False
112,1080722988,,,,,True,False,,,"[False, True]",2,False,False,False
125,1080725713,,True,,,,False,,,"[True, False]",2,False,False,False
187,1080746457,,,False,,,True,,,"[False, True]",2,False,True,False
268,1080837459,True,,,,,,,False,"[True, False]",2,False,True,False
295,1080852333,,False,,,,True,,,"[False, True]",2,False,False,False
309,1080876856,,False,True,,,,,,"[True, False]",2,False,False,False


In [17]:
files = https://www.dropbox.com/sh/cth1kys5d8v4rhb/AADFIvYlpJg32pRHl2GxJ4nja?dl=1

SyntaxError: invalid syntax (<ipython-input-17-a5f78d9f083b>, line 1)