In [1]:
import geojson
import os
import pandas as pd
import cv2
import numpy as np
from sklearn.metrics import cohen_kappa_score
from glob import glob

In [2]:
def json_load(json_dir, geofile1):
    """Parse one GeoJSON file and return its ``features`` entry.

    Parameters
    ----------
    json_dir : str
        Directory that contains the file.
    geofile1 : str
        File name, joined onto ``json_dir``.

    Returns
    -------
    The value stored under the ``'features'`` key of the parsed document.
    """
    geojson_path = os.path.join(json_dir, geofile1)
    with open(geojson_path) as handle:
        collection = geojson.load(handle)
    return collection['features']


def polygon_rater_data_load(json_dir, geofile1):
    """Load the unlocked Polygon annotations of one GeoJSON file.

    A feature is kept when its geometry type is ``"Polygon"`` and its
    properties do not contain an ``"isLocked"`` key.

    Parameters
    ----------
    json_dir : str
        Directory holding the GeoJSON file.
    geofile1 : str
        File name inside ``json_dir``.

    Returns
    -------
    tuple
        ``(coords, classes, names, frame)``.  The three lists are parallel,
        one entry per kept feature; a missing name, a missing classification
        or a classification without a ``"name"`` is reported as ``None``.
        ``frame`` is a DataFrame with the columns ``coordinates``, ``class``
        and ``name`` built from the same lists.
    """
    polygons = []
    for feature in json_load(json_dir, geofile1):
        # Both conditions are evaluated for every feature before combining them.
        is_polygon = feature["geometry"]["type"] == "Polygon"
        is_unlocked = "isLocked" not in feature["properties"].keys()
        if is_polygon and is_unlocked:
            polygons.append(feature)

    feature1_coords = [poly["geometry"]["coordinates"] for poly in polygons]
    feature1_name = [poly["properties"].get("name") for poly in polygons]
    # An absent classification behaves like an empty one: no "name" -> None.
    feature1_class = [
        poly["properties"].get("classification", {}).get("name")
        for poly in polygons
    ]

    feature1_df = pd.DataFrame(
        {"coordinates": feature1_coords, "class": feature1_class, "name": feature1_name}
    )
    return feature1_coords, feature1_class, feature1_name, feature1_df


def point_rater_data_load(json_dir,geofile2):
    """Load the Point annotations of one GeoJSON file.

    Parameters
    ----------
    json_dir : str
        Directory holding the GeoJSON file.
    geofile2 : str
        File name inside ``json_dir``.

    Returns
    -------
    tuple
        ``(coords, classes, names, frame)``.  The three lists are parallel,
        one entry per Point feature.  A missing name, a missing/null
        classification, or a classification without a ``"name"`` is reported
        as the string ``"None"``.  ``frame`` is a DataFrame with the columns
        ``coordinates``, ``class`` and ``name`` built from the same lists.
    """
    features2 = json_load(json_dir, geofile2)
    features2_points = [f for f in features2 if f["geometry"]["type"] == "Point"]
    feature2_coords = [f["geometry"]["coordinates"] for f in features2_points]

    feature2_class = []
    feature2_name = []
    for feature in features2_points:
        props = feature["properties"]
        # A classification may be absent, null, or present without a "name";
        # all three map to the "None" placeholder instead of raising.
        classification = props.get("classification") or {}
        feature2_class.append(classification.get("name", "None"))
        feature2_name.append(props.get("name", "None"))

    feature2_df = pd.DataFrame({"coordinates":feature2_coords,"class":feature2_class, "name":feature2_name})
    return feature2_coords, feature2_class, feature2_name, feature2_df

def match_main_point_rater(feature1_coords,feature1_class,feature1_name,feature2_coords,feature2_class,feature2_name, radius):
    """Match main-rater points with second-rater annotations that fall near them.

    Around every main-rater point an 8-vertex polygon is built: four vertices
    at distance ``radius`` along the axes and four diagonal vertices offset by
    ``0.5 * radius`` on each axis.  A second-rater annotation matches when its
    representative point lies strictly inside that polygon.

    Parameters
    ----------
    feature1_coords, feature1_class, feature1_name : sequence
        Parallel sequences for the main rater; each coordinate is ``[x, y]``.
    feature2_coords, feature2_class, feature2_name : sequence
        Parallel sequences for the second rater.  A coordinate is either a
        flat ``[x, y]`` pair or a list of rings, in which case the mean of
        the first ring's vertices is used as the representative point.
    radius : number
        Half-width of the polygon drawn around each main-rater point.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``main_rater_index``,
        ``main_rater_class``, ``main_rater_object_name``, ``rater1_index``,
        ``rater1_class``, ``rater1_object_name``, ``polygon_coords`` (the
        8 vertices) and ``point_coords`` (the representative point).
    """
    match = []
    half = 0.5 * radius
    for i, (c, class_var, name_var) in enumerate(zip(feature1_coords, feature1_class, feature1_name)):
        cx, cy = int(c[0]), int(c[1])
        # Vertices in order around the centre.  The seventh vertex used to
        # repeat [cx - radius, cy]; it must be the point straight below/above.
        contours = [
            [cx + radius, cy], [cx + half, cy + half],
            [cx, cy + radius], [cx - half, cy + half],
            [cx - radius, cy], [cx - half, cy - half],
            [cx, cy - radius], [cx + half, cy - half],
        ]
        # The polygon only depends on the main-rater point: convert it once.
        contour_arr = np.int32(np.array(contours).round())
        for j, (cnt, class_var1, name1) in enumerate(zip(feature2_coords, feature2_class, feature2_name)):
            # Ring-style coordinates are reduced to the mean of the first
            # ring; flat [x, y] pairs are used as they are.
            if isinstance(cnt[0], (list, tuple, np.ndarray)):
                cnt = np.mean(cnt[0], axis=0)
            # With measureDist=False the result is +1 inside, 0 on the edge
            # and -1 outside, so only strictly-inside points match.
            dist = cv2.pointPolygonTest(contour_arr, (int(cnt[0]), int(cnt[1])), False)
            if dist >= 1:
                match.append([i, class_var, name_var, j, class_var1, name1, contours, cnt])
    df = pd.DataFrame(match, columns=["main_rater_index","main_rater_class","main_rater_object_name","rater1_index","rater1_class","rater1_object_name","polygon_coords","point_coords"])
    return df



def match_point_rater(feature1_coords,feature1_class,feature1_name,feature2_coords,feature2_class,feature2_name):
    """Pair every main-rater polygon with the second-rater points inside it.

    Parameters
    ----------
    feature1_coords, feature1_class, feature1_name : sequence
        Parallel sequences for the main rater.  Each coordinate entry is a
        GeoJSON-style polygon: a list of rings, the first being the exterior.
    feature2_coords, feature2_class, feature2_name : sequence
        Parallel sequences for the second rater; each coordinate is ``[x, y]``.

    Returns
    -------
    pandas.DataFrame
        One row per (polygon, point) pair where the point lies strictly
        inside the polygon's exterior ring, with the columns
        ``main_rater_index``, ``main_rater_class``, ``main_rater_object_name``,
        ``rater1_index``, ``rater1_class``, ``rater1_object_name``,
        ``polygon_coords`` and ``point_coords``.
    """
    match = []
    for i, (contours, class_var, name_var) in enumerate(zip(feature1_coords, feature1_class, feature1_name)):
        # Only the exterior ring is tested.  Stacking all rings into one array
        # fails as soon as a polygon has a hole whose ring has a different
        # length.  The conversion does not depend on the point, so do it once.
        exterior = np.int32(np.asarray(contours[0]).round())
        for j, (c, class_var1, name1) in enumerate(zip(feature2_coords, feature2_class, feature2_name)):
            # With measureDist=False the result is +1 inside, 0 on the edge
            # and -1 outside, so points on the boundary do not match.
            dist = cv2.pointPolygonTest(exterior, (int(c[0]), int(c[1])), False)
            if dist >= 1:
                match.append([i, class_var, name_var, j, class_var1, name1, contours, c])
    df = pd.DataFrame(match, columns=["main_rater_index","main_rater_class","main_rater_object_name","rater1_index","rater1_class","rater1_object_name","polygon_coords","point_coords"])
    return df




def compute_kappa_score(df, rater1_column, rater2_column):
    """Print the row count of ``df`` and return the kappa score of two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one label column per rater.
    rater1_column, rater2_column : str
        Names of the two label columns to compare.

    Returns
    -------
    The value returned by ``cohen_kappa_score`` for the two columns.
    """
    print(len(df))
    return cohen_kappa_score(df[rater1_column], df[rater2_column])

def find_missing_main_rater(df, feature1_coords,feature1_class,feature1_name):
    """List the main-rater objects that have no row in the match table.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table; only its ``main_rater_index`` column is read.
    feature1_coords, feature1_class, feature1_name : sequence
        Parallel sequences describing every main-rater object.

    Returns
    -------
    pandas.DataFrame
        One row per unmatched object, using the match-table columns; the
        three ``rater1_*`` columns and ``point_coords`` are ``None``.
        Each unmatched object is also printed (index, name, coordinates).
    """
    # Build the lookup once instead of rescanning the column per object.
    matched_indices = set(df["main_rater_index"].tolist())
    missing_main_rater_list = []
    for i in range(len(feature1_coords)):
        if i not in matched_indices:
            print(i,feature1_name[i], feature1_coords[i])
            missing_main_rater_list.append([i,feature1_class[i],feature1_name[i],None, None,None, feature1_coords[i],None])
    missing_main_rater = pd.DataFrame(missing_main_rater_list, columns=["main_rater_index","main_rater_class","main_rater_object_name","rater1_index","rater1_class","rater1_object_name","polygon_coords","point_coords"])
    return missing_main_rater

def find_missing_rater1(df, feature2_coords,feature2_class,feature2_name):
    """List the second-rater objects that have no row in the match table.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table; only its ``rater1_index`` column is read.
    feature2_coords, feature2_class, feature2_name : sequence
        Parallel sequences describing every second-rater object.

    Returns
    -------
    pandas.DataFrame
        One row per unmatched object, using the match-table columns; the
        three ``main_rater_*`` columns and ``polygon_coords`` are ``None``.
        Each unmatched object is also printed (index, class, coordinates).
    """
    # Build the lookup once instead of rescanning the column per object.
    matched_indices = set(df["rater1_index"].tolist())
    missing_rater1_list = []
    for i in range(len(feature2_class)):
        if i not in matched_indices:
            print(i,feature2_class[i],feature2_coords[i] )
            missing_rater1_list.append([None, None,None,i,feature2_class[i],feature2_name[i], None, feature2_coords[i]])
    missing_rater1 = pd.DataFrame(missing_rater1_list,columns=["main_rater_index","main_rater_class","main_rater_object_name","rater1_index","rater1_class","rater1_object_name","polygon_coords","point_coords"])
    return missing_rater1

def map_class(l):
    """Return the parenthesised abbreviation for a class label.

    Known labels are ``Cored``, ``Diffuse``, ``Mature``, ``Pre``, ``Ghost``
    and ``Coarse-Grained``; any other value yields ``None``.
    """
    abbreviations = (
        ("Cored", "(C)"),
        ("Diffuse", "(D)"),
        ("Mature", "(M)"),
        ("Pre", "(P)"),
        ("Ghost", "(G)"),
        ("Coarse-Grained", "(CG)"),
    )
    for label, abbreviation in abbreviations:
        if l == label:
            return abbreviation
    return None


def _annotation_label(prefix, class_name):
    """Build an annotation string such as ``"Polygon(C)"`` for one class label."""
    if class_name == 'None':
        return ""
    abbreviation = map_class(class_name)
    if abbreviation is None:
        # Labels map_class does not know keep their raw name rather than
        # raising on ``str + None``.
        abbreviation = "(" + str(class_name) + ")"
    return prefix + abbreviation


def find_match(json_dir1,json_dir2, geojsons_names,radius):
    """Match main-rater polygons with second-rater points for a set of files.

    For every file name, polygons are loaded from ``json_dir1`` and points
    from ``json_dir2``.  Each polygon is paired with the points inside it, and
    polygons without any point are appended with empty second-rater fields.
    Points that fall in no polygon are not reported.

    Parameters
    ----------
    json_dir1 : str
        Directory with the main rater's GeoJSON files (polygons).
    json_dir2 : str
        Directory with the second rater's GeoJSON files (points).
    geojsons_names : iterable of str
        File names expected in both directories.
    radius : number
        Accepted for interface compatibility; the polygon/point pipeline
        used here does not read it.

    Returns
    -------
    pandas.DataFrame
        All per-file rows concatenated.  Besides the match-table columns it
        holds ``main_rater_annotation``, ``rater1_annotation`` and
        ``geojson_file``; missing classes are the string ``"None"``.
        An empty DataFrame is returned when ``geojsons_names`` is empty.
    """
    per_file_frames = []
    for geofile1 in geojsons_names:
        print("------------------",geofile1,"---------------------")
        feature1_coords, feature1_class, feature1_name, _ = polygon_rater_data_load(json_dir1, geofile1)
        print(len(feature1_coords))
        feature2_coords, feature2_class, feature2_name, _ = point_rater_data_load(json_dir2, geofile1)
        print(len(feature2_coords))

        df = match_point_rater(feature1_coords, feature1_class, feature1_name,
                               feature2_coords, feature2_class, feature2_name)
        missing_main_rater = find_missing_main_rater(df, feature1_coords, feature1_class, feature1_name)
        df_final = pd.concat([df, missing_main_rater], axis=0, ignore_index=True)

        # Normalise missing classes to the "None" placeholder used downstream.
        df_final["main_rater_class"] = np.where(df_final["main_rater_class"].isna(),"None",df_final["main_rater_class"])
        df_final["rater1_class"] = np.where(df_final["rater1_class"].isna(),"None",df_final["rater1_class"])
        df_final["main_rater_annotation"] = df_final["main_rater_class"].apply(
            lambda l: _annotation_label("Polygon", l))
        df_final["rater1_annotation"] = df_final["rater1_class"].apply(
            lambda l: _annotation_label("Point", l))
        df_final["geojson_file"] = geofile1
        per_file_frames.append(df_final)

    # Concatenate once at the end instead of growing the frame per file.
    non_empty = [frame for frame in per_file_frames if len(frame)]
    if non_empty:
        return pd.concat(non_empty, ignore_index=True)
    return per_file_frames[-1] if per_file_frames else pd.DataFrame()
        

In [3]:
# NOTE(review): both directories are absolute, machine-specific paths.
# json_dir2 is passed to find_match() as the directory whose Point features are loaded.
json_dir2= "/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/interrater-study/interrater_geojson_Max/Brittany"
# json_dir1 is passed to find_match() as the directory whose Polygon features are loaded
# (the "main rater" columns of the result).
json_dir1= "/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/interrater-study/interrater_geojson_Max/Max"

In [4]:
# Every GeoJSON file found in the main rater's directory.
# NOTE(review): glob() yields matches in a filesystem-dependent order; wrap the
# call in sorted() if a stable row order in the results matters.
geojsons_main_rater = glob(os.path.join(json_dir1, "*.geojson"))
# Keep only the file names; basename() honours the platform's path separator,
# unlike splitting on "/".
geojsons_names = [os.path.basename(path) for path in geojsons_main_rater]

In [5]:
# Show the file names that will be handed to find_match().
geojsons_names

['22650_7_AmyB_1_ceren.mrxs.geojson',
 '25144_1_AmyB_1_ceren.mrxs.geojson',
 '22640_1_AmyB_1_ceren.mrxs.geojson',
 '30414_6_AmyB_1_ceren.mrxs.geojson',
 '420418_6_AmyB_1_ceren.mrxs.geojson',
 '354049_6_AmyB_1_ceren.mrxs.geojson']

In [6]:
# Match the polygons in json_dir1 against the points in json_dir2 for every file.
# NOTE(review): the last argument (250) is find_match()'s `radius`, which its
# active code path does not read — only a commented-out call there takes it.
max_output = find_match(json_dir1,json_dir2, geojsons_names,250)

------------------ 22650_7_AmyB_1_ceren.mrxs.geojson ---------------------
15
12
11 5:Diffuse [[[72989, 139886], [72983, 139887], [72981, 139887], [72979, 139888], [72977, 139889], [72974, 139890], [72971, 139891], [72969, 139891], [72966, 139892], [72963, 139893], [72961, 139894], [72958, 139896], [72955, 139897], [72953, 139899], [72950, 139901], [72947, 139901], [72946, 139902], [72943, 139904], [72941, 139906], [72938, 139909], [72935, 139911], [72934, 139913], [72932, 139915], [72931, 139915], [72929, 139917], [72928, 139919], [72926, 139921], [72925, 139923], [72924, 139923], [72922, 139925], [72921, 139927], [72920, 139927], [72918, 139929], [72917, 139931], [72916, 139932], [72915, 139933], [72914, 139935], [72912, 139937], [72911, 139938], [72911, 139940], [72909, 139943], [72908, 139943], [72906, 139946], [72905, 139948], [72904, 139948], [72903, 139950], [72902, 139951], [72900, 139952], [72898, 139953], [72897, 139955], [72895, 139956], [72893, 139957], [72890, 139957], [72

In [7]:
# Number of rows whose rater1_class is the "None" placeholder: find_match() writes it
# where the class is missing, and the point loader uses it for points without a classification.
len(max_output[max_output["rater1_class"]=="None"])

89

In [8]:
def find_equiv_diameter(cnt):
    """Return the diameter of the circle whose area equals the contour's area.

    ``cnt`` is passed unchanged to ``cv2.contourArea``.
    """
    enclosed_area = cv2.contourArea(cnt)
    return np.sqrt(4 * enclosed_area / np.pi)

In [9]:
def _rounded_polygon_area(coords):
    """Area reported by cv2.contourArea for coordinates rounded to int32."""
    return cv2.contourArea(np.int32(np.array(coords).round()))


# Add the area of every row's polygon as a new column.
max_output["area"] = max_output["polygon_coords"].map(_rounded_polygon_area)

In [10]:
# Equivalent diameter: diameter of the circle with the same area, sqrt(4 * area / pi).
max_output["diameter"] = np.sqrt(4 * max_output["area"] / np.pi)

In [11]:
# Label each row with the first size range whose upper bound its diameter
# does not exceed; anything above the last bound falls in "f. >500".
diameter_values = max_output["diameter"].to_numpy()
size_bracket_bounds = [
    (10, "a. <=10"),
    (50, "b. 10-50"),
    (100, "c. 50-100"),
    (250, "d. 100-250"),
    (500, "e. 250-500"),
]
max_output["size_bracket"] = np.select(
    [diameter_values <= bound for bound, _ in size_bracket_bounds],
    [label for _, label in size_bracket_bounds],
    default="f. >500",
)

In [12]:
# List the columns now present, including the derived area, diameter and size_bracket.
max_output.columns

Index(['main_rater_index', 'main_rater_class', 'main_rater_object_name',
       'rater1_index', 'rater1_class', 'rater1_object_name', 'polygon_coords',
       'point_coords', 'main_rater_annotation', 'rater1_annotation',
       'geojson_file', 'area', 'diameter', 'size_bracket'],
      dtype='object')

In [13]:
# Count rows with a non-null main_rater_index per (main_rater_class, size_bracket) pair.
max_output.groupby(["main_rater_class","size_bracket"])["main_rater_index"].count().reset_index()

Unnamed: 0,main_rater_class,size_bracket,main_rater_index
0,Coarse-Grained,c. 50-100,8
1,Coarse-Grained,d. 100-250,57
2,Coarse-Grained,e. 250-500,18
3,Coarse-Grained,f. >500,3
4,Cored,c. 50-100,1
5,Cored,d. 100-250,28
6,Cored,e. 250-500,25
7,Cored,f. >500,1
8,Diffuse,c. 50-100,7
9,Diffuse,d. 100-250,57


In [14]:
# Kappa over every row, including rows where either class is the "None" placeholder.
# compute_kappa_score() also prints the number of rows it used.
compute_kappa_score(max_output, "main_rater_class", "rater1_class")

266


0.04126335346028798

In [15]:
# Kappa restricted to rows where rater1_class is not the "None" placeholder.
compute_kappa_score(max_output[max_output["rater1_class"]!="None"], "main_rater_class", "rater1_class")

177


0.23512782197811077

In [16]:
# Rows where both raters' classes are equal, counted per main-rater class.
max_output[max_output["main_rater_class"]==max_output["rater1_class"]]["main_rater_class"].value_counts()

Diffuse           32
Cored             28
Coarse-Grained    10
None               2
Name: main_rater_class, dtype: int64

In [17]:
# Rows where the two raters' classes differ, counted per main-rater class.
max_output[max_output["main_rater_class"]!=max_output["rater1_class"]]["main_rater_class"].value_counts()

Coarse-Grained    76
None              51
Diffuse           40
Cored             27
Name: main_rater_class, dtype: int64

In [18]:
# Write the full match table to CSV. With to_csv()'s defaults the row index is
# written as an unnamed first column.
# NOTE(review): the destination is an absolute, machine-specific path.
max_output.to_csv("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/interrater-study/Kappa_compute/max_brittany_objects_full.csv")