In [None]:
import pandas as pd
from scipy.spatial import distance
import numpy as np
import plotly.express as px

: 

In [9]:
# results from pca dimensionality reduction 
df = pd.read_csv("../asteraceae_pca_astera_50_checkpoint-1300.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,label
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.106315,0.102141,Asteraceae Ageratina jucunda
1,/projectnb/herbdl/data/kaggle-herbaria/herbari...,10.195288,-11.720784,Asteraceae Carphephorus paniculatus
2,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-5.752719,-7.106326,Asteraceae Crepis acuminata
3,/projectnb/herbdl/data/kaggle-herbaria/herbari...,3.561659,14.624518,Asteraceae Bidens vulgata
4,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-12.460205,-4.621024,Asteraceae Arnica sororia


In [10]:
# get list of labels - species 
species_names = df["label"].unique()

In [11]:
# for each label filter the df for the points 
result_dict = {}
for name in species_names:
    result_dict[name] = df[df["label"] == name]

In [12]:
# calculate the centroid for all points in the cluster 
# return the centroid for the cluster
def calculate_centroid(results_df: pd.DataFrame) -> tuple:
    x_s = results_df["0"]
    y_s = results_df["1"]
    group_size = len(x_s)
    center_x = sum(x_s)/group_size
    center_y = sum(y_s)/group_size 
    centroid = (center_x, center_y)
    return centroid

In [13]:
# calculate the distance to the centroid for each point in the cluster
# points_df: the DataFrame containing the points in columns '0' and '1' 
# centroid: the calculated centroid the cluster/group
# return the distances for the each point in the group as a list
def calculate_distances(points_df: pd.DataFrame, centroid: tuple) -> list:
    indexes = points_df.index 
    group_distances = []
    
    for idx in indexes:
        point = [points_df.loc[idx]["0"], points_df.loc[idx]["1"]]
        eucl_distance = distance.euclidean(point, centroid)
        group_distances.append(eucl_distance) 

    return group_distances

In [14]:
# write the distances to the original dataframe 
# calculate +/- 2 standard deviations 
# return the top threshold and bottom threshold
def calculate_thresholds(group_distances: list):
    top_thresh = np.mean(group_distances) + (np.std(group_distances)*2) 
    bot_thresh = np.mean(group_distances) - (np.std(group_distances)*2)
    return top_thresh, bot_thresh

In [15]:
# calculate outliers for each species 
centroids_dict = {}
for name in species_names:
    print(f"calculating outlier for group {name}")
    center = calculate_centroid(result_dict[name])
    distances = calculate_distances(result_dict[name], center)
    h_thresh, l_thresh = calculate_thresholds(distances) 

    # save the centroids
    centroids_dict[name] = center
    
    # get the outliers 
    outliers = [True if d > h_thresh or d < l_thresh else False for d in distances]
    
    # save the results to the dataframe 
    result_dict[name].loc[:, "eucl_distance"] = distances 
    result_dict[name].loc[:, "outlier"] = outliers

calculating outlier for group Asteraceae Ageratina jucunda
calculating outlier for group Asteraceae Carphephorus paniculatus
calculating outlier for group Asteraceae Crepis acuminata
calculating outlier for group Asteraceae Bidens vulgata
calculating outlier for group Asteraceae Arnica sororia
calculating outlier for group Asteraceae Brickellia longifolia
calculating outlier for group Asteraceae Liatris pilosa
calculating outlier for group Asteraceae Bidens beckii
calculating outlier for group Asteraceae Heterotheca pumila
calculating outlier for group Asteraceae Erigeron melanocephalus
calculating outlier for group Asteraceae Packera crocata
calculating outlier for group Asteraceae Erigeron pulcherrimus
calculating outlier for group Asteraceae Pterocaulon pycnostachyum
calculating outlier for group Asteraceae Garberia heterophylla
calculating outlier for group Asteraceae Eurybia divaricata
calculating outlier for group Asteraceae Chrysopsis subulata
calculating outlier for group Aster

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dict[name].loc[:, "eucl_distance"] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dict[name].loc[:, "outlier"] = outliers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_dict[name].loc[:, "eucl_distance"] = distances
A value is trying to be set on a copy of a slice f

In [10]:
# recombine results to one dataframe 
df_combined = pd.concat(result_dict.values(), ignore_index=False)

In [76]:
df_combined.to_csv("pca_2d_outliers.csv")

In [22]:
df_combined.head()

Unnamed: 0.1,Unnamed: 0,0,1,label,eucl_distance,outlier
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.106315,0.102141,Asteraceae Ageratina jucunda,2.892152,False
164,/projectnb/herbdl/data/kaggle-herbaria/herbari...,14.531756,1.737664,Asteraceae Ageratina jucunda,2.912305,False
203,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.630124,3.463129,Asteraceae Ageratina jucunda,2.7875,False
205,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.757186,5.011175,Asteraceae Ageratina jucunda,3.961636,False
248,/projectnb/herbdl/data/kaggle-herbaria/herbari...,12.025526,7.481657,Asteraceae Ageratina jucunda,5.975991,True
266,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.476758,-0.772726,Asteraceae Ageratina jucunda,2.944615,False
320,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.369989,-1.148309,Asteraceae Ageratina jucunda,3.185871,False
404,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.432225,-0.663595,Asteraceae Ageratina jucunda,2.831921,False
417,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.725654,-3.259612,Asteraceae Ageratina jucunda,5.218803,False
495,/projectnb/herbdl/data/kaggle-herbaria/herbari...,11.765627,2.068515,Asteraceae Ageratina jucunda,0.56664,False


In [17]:
df_tsne = pd.read_csv("../asteraceae_tsne_astera_50_checkpoint-1300.csv")

In [20]:
df_tsne["pca_outlier"] = df_combined["outlier"]

In [24]:
df_tsne.head()

Unnamed: 0.1,Unnamed: 0,0,1,label,pca_outlier
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,26.651928,13.341015,Asteraceae Ageratina jucunda,False
1,/projectnb/herbdl/data/kaggle-herbaria/herbari...,18.60073,-18.657986,Asteraceae Carphephorus paniculatus,False
2,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-29.582312,-21.055666,Asteraceae Crepis acuminata,False
3,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.445475,26.835894,Asteraceae Bidens vulgata,False
4,/projectnb/herbdl/data/kaggle-herbaria/herbari...,3.919897,8.282666,Asteraceae Arnica sororia,False


In [25]:
df_tsne.to_csv("../tsne_with_2d_pca_outlier.csv")

# Outliers PCA 

In [58]:
df_pca = pd.read_csv("../asteraceae_pca_10_astera_50_checkpoint-1300.csv")
df_pca.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.105993,0.101899,4.631554,-11.614634,-1.79182,-7.718238,2.456629,4.603161,1.909699,-7.485959,Asteraceae Ageratina jucunda
1,/projectnb/herbdl/data/kaggle-herbaria/herbari...,10.195303,-11.720765,5.667974,-0.047585,-0.236367,3.540408,-15.955474,-2.131101,3.381002,-2.250144,Asteraceae Carphephorus paniculatus
2,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-5.75244,-7.106267,-7.364646,2.605586,-4.203525,14.398073,1.85742,0.202585,-0.226751,-1.229941,Asteraceae Crepis acuminata
3,/projectnb/herbdl/data/kaggle-herbaria/herbari...,3.561296,14.624349,-7.597177,-10.574511,4.946481,9.6189,-0.34104,-8.264402,-1.0486,5.325108,Asteraceae Bidens vulgata
4,/projectnb/herbdl/data/kaggle-herbaria/herbari...,-12.460394,-4.621159,4.782201,-7.747055,-8.316726,10.124874,2.436418,-10.581471,-6.922583,5.466442,Asteraceae Arnica sororia


In [59]:
# get list of labels - species 
pca_species_names = df_pca["label"].unique()

In [60]:
# for each label filter the df for the points 
pca_result_dict = {}
for name in pca_species_names:
    pca_result_dict[name] = df_pca[df_pca["label"] == name]

In [67]:
# calculate outliers for each species 
pca_centroids_dict = {}
for name in pca_species_names:
    print(f"calculating outlier for group {name}")
    temp = pca_result_dict[name].iloc[:, 1:11] # get only the components 
    X_pca = temp.to_numpy(dtype=float) # shape (n_samples, n_components)
    
    # calculate center of cluster and covariance matrix
    centroid = np.mean(temp, axis=0) 
    cov_matrix = np.cov(X_pca, rowvar=False)

    # calculate the inverse covariance matrix
    inv_cov_matrix = np.linalg.inv(cov_matrix)

    # redo this part
    group_distances = []
    for pt in X_pca:
        #eucl_distance = distance.euclidean(pt, centroid)
        maha_distance = distance.mahalanobis(pt, centroid, inv_cov_matrix)
        group_distances.append(maha_distance)
    
    h_thresh, l_thresh = calculate_thresholds(group_distances) 

    # save the centroids
    pca_centroids_dict[name] = centroid
    
    # get the outliers 
    outliers = [True if d > h_thresh or d < l_thresh else False for d in group_distances]
    
    # save the results to the dataframe 
    pca_result_dict[name].loc[:, "maha_distance"] = group_distances 
    pca_result_dict[name].loc[:, "outlier"] = outliers

calculating outlier for group Asteraceae Ageratina jucunda
calculating outlier for group Asteraceae Carphephorus paniculatus
calculating outlier for group Asteraceae Crepis acuminata
calculating outlier for group Asteraceae Bidens vulgata
calculating outlier for group Asteraceae Arnica sororia
calculating outlier for group Asteraceae Brickellia longifolia
calculating outlier for group Asteraceae Liatris pilosa
calculating outlier for group Asteraceae Bidens beckii
calculating outlier for group Asteraceae Heterotheca pumila
calculating outlier for group Asteraceae Erigeron melanocephalus
calculating outlier for group Asteraceae Packera crocata
calculating outlier for group Asteraceae Erigeron pulcherrimus
calculating outlier for group Asteraceae Pterocaulon pycnostachyum
calculating outlier for group Asteraceae Garberia heterophylla
calculating outlier for group Asteraceae Eurybia divaricata
calculating outlier for group Asteraceae Chrysopsis subulata
calculating outlier for group Aster

In [68]:
df_combined_pca = pd.concat(pca_result_dict.values(), ignore_index=False)

In [69]:
df_combined_pca.head(50)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label,maha_distance,outlier
0,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.105993,0.101899,4.631554,-11.614634,-1.79182,-7.718238,2.456629,4.603161,1.909699,-7.485959,Asteraceae Ageratina jucunda,3.014349,False
164,/projectnb/herbdl/data/kaggle-herbaria/herbari...,14.53129,1.737274,2.208296,0.261971,-7.681929,-5.674283,-2.300909,7.452234,-3.681906,-13.978932,Asteraceae Ageratina jucunda,3.320379,False
203,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.629799,3.462915,3.150657,-6.120223,-4.265068,-3.707499,6.683343,9.018858,-1.678997,-11.147399,Asteraceae Ageratina jucunda,2.654208,False
205,/projectnb/herbdl/data/kaggle-herbaria/herbari...,9.756797,5.010805,-3.678381,-7.961831,6.554263,-5.719462,-2.087889,-1.835466,4.922622,-9.003279,Asteraceae Ageratina jucunda,3.407011,False
248,/projectnb/herbdl/data/kaggle-herbaria/herbari...,12.025302,7.48152,-1.269873,-5.655677,0.544229,-3.490135,5.052476,11.573939,-2.148967,-8.5731,Asteraceae Ageratina jucunda,3.135845,False
266,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.476419,-0.772997,4.463445,-8.031266,-2.07578,-3.834005,0.51226,12.575368,-2.271918,-14.036239,Asteraceae Ageratina jucunda,3.37423,False
320,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.369639,-1.148592,4.527994,-5.282699,-5.790909,-7.725666,0.94964,9.733892,-3.085157,-12.066175,Asteraceae Ageratina jucunda,2.108603,True
404,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.431916,-0.663786,4.506845,-5.768451,-3.705016,-7.338467,3.173555,9.956849,1.502397,-11.9649,Asteraceae Ageratina jucunda,3.244262,False
417,/projectnb/herbdl/data/kaggle-herbaria/herbari...,13.725321,-3.259818,3.660504,-6.659841,-7.324079,-1.819031,-0.586893,1.187135,1.517619,-13.315825,Asteraceae Ageratina jucunda,3.330308,False
495,/projectnb/herbdl/data/kaggle-herbaria/herbari...,11.765604,2.068518,3.296372,-5.834346,-2.16084,-5.343658,4.350477,7.984479,-2.697385,-7.916287,Asteraceae Ageratina jucunda,2.416135,False


In [70]:
df_combined_pca.to_csv("pca_10_outliers.csv")