In [1]:
import pandas as pd
from scipy.spatial import distance
import numpy as np

In [2]:
# Load the results of the t-SNE dimensionality reduction. The preview below
# shows the coordinate columns "0" and "1" next to the "label" column.
tsne_results_path = "asteraceae_tsne_astera_50_checkpoint-1300.csv"
df = pd.read_csv(tsne_results_path)
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,label
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,26.651928,13.341015,Asteraceae Ageratina jucunda
1,/projectnb/herbdl/data/kaggle-herbaria/herbari...,18.60073,-18.657986,Asteraceae Carphephorus paniculatus
2,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-29.582312,-21.055666,Asteraceae Crepis acuminata
3,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.445475,26.835894,Asteraceae Bidens vulgata
4,/projectnb/herbdl/data/kaggle-herbaria/herbari...,3.919897,8.282666,Asteraceae Arnica sororia


In [3]:
# Get the list of distinct labels (species), one entry per group.
# Missing labels are dropped first: `unique()` keeps NaN, and NaN never
# compares equal to anything, so filtering `df` by such a label would select
# zero rows and produce an empty group that cannot have a centroid.
species_names = df["label"].dropna().unique()

In [4]:
# For each label, collect the rows (points) of `df` that carry that label.
# Every group is stored as an independent copy: new per-point columns are
# written into these frames further down, and assigning into a frame that is
# still a filtered selection of `df` triggers pandas' SettingWithCopyWarning.
result_dict = {name: df[df["label"] == name].copy() for name in species_names}

In [31]:
# calculate the centroid for all points in the cluster
# return the centroid for the cluster
def calculate_centroid(results_df: pd.DataFrame) -> tuple:
    """Return the centroid (mean position) of the points in one group.

    Parameters
    ----------
    results_df : pd.DataFrame
        Points of a single group, with coordinates in columns "0" and "1".

    Returns
    -------
    tuple
        ``(center_x, center_y)``, the arithmetic mean of each coordinate
        column. A NaN in a column makes the corresponding mean NaN.

    Raises
    ------
    KeyError
        If column "0" or "1" is missing.
    ZeroDivisionError
        If ``results_df`` has no rows.
    """
    x_s = results_df["0"]
    y_s = results_df["1"]

    if len(x_s) == 0:
        # A mean over zero points is undefined. The exception type matches
        # what a plain sum/len division raises, so existing handlers still work.
        raise ZeroDivisionError("cannot compute the centroid of an empty group")

    # Vectorised column means instead of summing element by element in Python.
    # skipna=False lets NaN propagate instead of being silently ignored.
    center_x = x_s.mean(skipna=False)
    center_y = y_s.mean(skipna=False)
    return (center_x, center_y)

In [35]:
# calculate the distance to the centroid for each point in the cluster
# points_df: the DataFrame containing the points in columns '0' and '1'
# centroid: the calculated centroid of the cluster/group
# return the distances for each point in the group as a list
def calculate_distances(points_df: pd.DataFrame, centroid: tuple) -> list:
    """Return the Euclidean distance from every point to ``centroid``.

    Parameters
    ----------
    points_df : pd.DataFrame
        Points of one group, with coordinates in columns "0" and "1".
    centroid : tuple
        ``(x, y)`` position the distances are measured to.

    Returns
    -------
    list
        One float per row of ``points_df``, in row order. Empty when
        ``points_df`` has no rows.
    """
    center_x, center_y = centroid

    # Operate on whole columns at once. This avoids a label lookup per row,
    # which is slow and fails when the index contains duplicate labels
    # (``.loc[idx]`` then returns several rows instead of one point).
    x_offsets = points_df["0"].to_numpy(dtype=float) - center_x
    y_offsets = points_df["1"].to_numpy(dtype=float) - center_y

    # hypot(dx, dy) == sqrt(dx**2 + dy**2), i.e. the Euclidean distance in 2-D.
    return np.hypot(x_offsets, y_offsets).tolist()

In [43]:
# calculate mean +/- num_std standard deviations of the distances (default 2)
# return the top threshold and bottom threshold
def calculate_thresholds(group_distances: list, num_std: float = 2.0) -> tuple:
    """Return the upper and lower distance thresholds for a group.

    Parameters
    ----------
    group_distances : list
        Distances of the points in one group.
    num_std : float, optional
        Half-width of the band, in standard deviations. Defaults to 2.

    Returns
    -------
    tuple
        ``(top_thresh, bot_thresh)``, equal to
        ``mean + num_std * std`` and ``mean - num_std * std``, where ``std``
        is the population standard deviation (numpy's default, ``ddof=0``).
    """
    # Compute each statistic once and reuse it for both bounds.
    mean_distance = np.mean(group_distances)
    band_half_width = np.std(group_distances) * num_std

    top_thresh = mean_distance + band_half_width
    bot_thresh = mean_distance - band_half_width
    return top_thresh, bot_thresh

In [55]:
# calculate outliers for each species
# centroids_dict maps each species name to the centroid of its points;
# result_dict[name] gains the columns "eucl_distance" and "outlier".
centroids_dict = {}
for name in species_names:
    print(f"calculating outlier for group {name}")

    # Work on an independent copy of the group so the new columns are never
    # written into a filtered selection of another frame (which raises
    # SettingWithCopyWarning), then store the finished frame back.
    group_df = result_dict[name].copy()

    center = calculate_centroid(group_df)
    distances = calculate_distances(group_df, center)
    h_thresh, l_thresh = calculate_thresholds(distances)

    # save the centroids
    centroids_dict[name] = center

    # a point is an outlier when its distance lies outside the
    # [l_thresh, h_thresh] band; the comparison already yields a boolean
    outliers = [bool(d > h_thresh or d < l_thresh) for d in distances]

    # save the results to the dataframe
    group_df["eucl_distance"] = distances
    group_df["outlier"] = outliers
    result_dict[name] = group_df

calculating outlier for group Asteraceae Ageratina jucunda
calculating outlier for group Asteraceae Carphephorus paniculatus
calculating outlier for group Asteraceae Crepis acuminata
calculating outlier for group Asteraceae Bidens vulgata
calculating outlier for group Asteraceae Arnica sororia
calculating outlier for group Asteraceae Brickellia longifolia
calculating outlier for group Asteraceae Liatris pilosa
calculating outlier for group Asteraceae Bidens beckii
calculating outlier for group Asteraceae Heterotheca pumila
calculating outlier for group Asteraceae Erigeron melanocephalus
calculating outlier for group Asteraceae Packera crocata
calculating outlier for group Asteraceae Erigeron pulcherrimus
calculating outlier for group Asteraceae Pterocaulon pycnostachyum
calculating outlier for group Asteraceae Garberia heterophylla
calculating outlier for group Asteraceae Eurybia divaricata
calculating outlier for group Asteraceae Chrysopsis subulata
calculating outlier for group Aster