# Objective

Investigate precision and recall for all spots in all images in the persistence experiment.

# Takeaways

**Regarding the ROC curve at the bottom of this notebook:**

We would theoretically expect both sensitivity and specificity to decrease as the number of spots increases. The observed ROC curve does not show this trend cleanly, but all of the data points fall within a small region, so it is hard to gauge the significance of deviations from the expected behavior. The main takeaway is that at 225 spots of SNR 10 in one image, we seem to be quite far from the upper limit of spots that workers are willing to annotate.

In [1]:
import os, sys, math, random
sys.path.insert(0, '../../FISH-annotation/Annotation')
from SpotAnnotationAnalysis import SpotAnnotationAnalysis
from QuantiusAnnotation import QuantiusAnnotation
from sklearn.neighbors import KDTree
import util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Experiment grid for the precision/recall sweep.
num_iter = 10                            # random worker subsets drawn per (replicates, image) pair
num_replicates_list = range(1, 26)       # worker counts to evaluate: 1 through 25
num_spots_list = np.arange(50, 250, 25)  # spots per image: 50, 75, ..., 225
cwd = os.getcwd()                        # base directory that the data paths are built from

In [3]:
# Sweep over (number of replicates, image) pairs and record the average recall
# and precision of the clustered worker annotations against the reference
# spot coordinates.
#
# Results:
#   recall_list[r][s]    -- mean recall for num_replicates_list[r] workers on
#                           the image with num_spots_list[s] spots
#   precision_list[r][s] -- mean precision for the same pair
# An entry is None when the image was skipped by the worker-count guard below.
recall_list, precision_list = [], []

# Loop-invariant settings, defined once instead of on every iteration.
clustering_params = ['AffinityPropagation', -350]
declumping_params = ['KMeans', 2]
# Maximum centroid-to-reference distance that still counts as a match
# (presumably pixels -- confirm against the annotation coordinate system).
correctness_threshold = 4

print('Progress:')
for n in num_replicates_list:
    print('\n------- num_replicates = ' + str(n) + ' -------')

    recall, precision = [], []
    for num_spots in num_spots_list:

        json_filepath = cwd + '/annotations/' + 'snr10_' + str(num_spots) + '.json'
        img_filename = 'snr_10_numspots_' + str(num_spots) + '_spot_img.png'
        print('       num_spots = ' + str(num_spots))
        csv_filepath = cwd + '/spot_data/' + 'snr_10_numspots_' + str(num_spots) + '_coord_snr_list.csv'
        qa = QuantiusAnnotation(json_filepath, img_filename)
        sa = SpotAnnotationAnalysis(qa)
        anno_all = qa.df()

        worker_list = util.get_workers(anno_all)
        # NOTE(review): this also skips n == len(worker_list), although the
        # message below says "greater than" and random.sample accepts a sample
        # size equal to the population size -- confirm which one is intended.
        if n >= len(worker_list):
            precision.append(None)
            recall.append(None)
            print('              Desired num replicates is greater than num workers who annotated this image')
            continue

        # The reference spots depend only on the image, so load them and build
        # their KD-tree once per image rather than once per sampling iteration.
        ref_df = pd.read_csv(csv_filepath)
        ref_points = ref_df.loc[:, ['col', 'row']].values
        ref_kdt = KDTree(ref_points, leaf_size=2, metric='euclidean')

        TPR_list, FPR_list = [], []
        for _ in range(num_iter):

            # Draw a random subset of n workers and keep only their annotations.
            selected_workers = random.sample(list(worker_list), n)
            anno = anno_all[anno_all.worker_id.isin(selected_workers)]

            clusters = sa.get_clusters(anno, clustering_params)

            img_height = anno['height'].values[0]

            # Screen by cluster size
            cluster_size_threshold = util.get_cluster_size_threshold(clusters)
            small_clusters, large_clusters = util.sort_clusters_by_size(clusters, cluster_size_threshold)

            # Detect clumps
            clumpiness_threshold = util.get_clumpiness_threshold(large_clusters)
            clumpy_clusters, nonclumpy_clusters = util.sort_clusters_by_clumpiness(large_clusters, clumpiness_threshold)

            # Declump: split every clumpy cluster and merge the pieces with the
            # non-clumpy clusters. A dedicated index name avoids shadowing the
            # sampling-iteration loop variable.
            result_clusters = nonclumpy_clusters
            for clump_index in range(len(clumpy_clusters.index)):
                subclusters = util.declump(clumpy_clusters, clump_index, declumping_params)
                result_clusters = pd.concat([subclusters, result_clusters], ignore_index=True)

            # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
            result_points = result_clusters.loc[:, ['centroid_x', 'centroid_y']].values
            result_kdt = KDTree(result_points, leaf_size=2, metric='euclidean')

            # Recall (TPR): fraction of reference spots that have a detected
            # centroid within the threshold. The y coordinate is flipped
            # (img_height - y) before querying, as in the per-point version.
            flipped_ref = np.column_stack((ref_points[:, 0], img_height - ref_points[:, 1]))
            ref_dist, _ = result_kdt.query(flipped_ref, k=1)
            detected_spots = np.count_nonzero(ref_dist[:, 0] <= correctness_threshold)
            tpr = float(detected_spots) / num_spots
            TPR_list.append(tpr)

            # 1 - precision: fraction of detected centroids with no reference
            # spot within the threshold. The denominator is the number of
            # detected centroids (not num_spots), so that
            # precision = correct detections / all detections.
            flipped_result = np.column_stack((result_points[:, 0], img_height - result_points[:, 1]))
            result_dist, _ = ref_kdt.query(flipped_result, k=1)
            correct_centroids = np.count_nonzero(result_dist[:, 0] <= correctness_threshold)
            fpr = 1 - float(correct_centroids) / len(result_points)
            FPR_list.append(fpr)

        avg_TPR = np.mean(TPR_list)
        avg_FPR = np.mean(FPR_list)
        recall.append(avg_TPR)
        precision.append(1 - avg_FPR)

    recall_list.append(recall)
    precision_list.append(precision)

Progress:

------- num_replicates = 1 -------
       num_spots = 50
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 75
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 100
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 125
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 150
 

              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 125
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 150
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 175
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
              iter 7
              iter 8
              iter 9
       num_spots = 200
              iter 0
              iter 1
              iter 2
              iter 3
              iter 4
              iter 5
              iter 6
     

KeyboardInterrupt: 

In [None]:
# For every replicate count, scatter-plot recall and precision against the
# number of spots per image and print the underlying values.
for recall, precision, n in zip(recall_list, precision_list, num_replicates_list):

    print('\n------------------------ num_replicates = ' + str(n) + ' ------------------------')

    # Skipped images are stored as None. Test for None explicitly so that a
    # genuine score of 0.0 is not mistaken for "no data".
    if all(value is None for value in recall):
        print('For all images in this set, the desired number of replicates is greater than the number of workers who annotated the image')
        continue

    title_suffix = ' vs. number of spots, ' + str(n) + ' replicates, avg across ' + str(num_iter) + ' samples'
    metrics = (
        ('recall', 'recall = TPR', recall),
        ('precision', 'precision = 1 - FPR', precision),
    )
    for label, ylabel, scores in metrics:
        # Map None placeholders to NaN so the series is a float array and the
        # skipped images are simply left out of the scatter plot.
        plottable = np.array([np.nan if score is None else score for score in scores], dtype=float)
        plt.scatter(num_spots_list, plottable)
        plt.xlabel('num_spots')
        plt.ylabel(ylabel)
        plt.title(label.capitalize() + title_suffix)
        plt.show()
        print(label + ' = ' + str(scores))
    