In [19]:
import pandas as pd
import json
import copy

In [20]:
def from_json_to_df(json_file_path):
    """
    Convert the classification json file into a simple and usable dataframe. 
    We remove the key "species count" because it is useless here.
    Then to extract specific information of classification you can do df['000001.jpg']['track_info'] and then select the image
    """
    with open(json_file_path) as f:
        d = json.load(f)
        del d['species_count'] 
    df = pd.DataFrame.from_dict(d,orient='index').reset_index(level=0).rename(columns={"index": "source_image_cropped"})
    return df

In [21]:
csv_file_path = "../../0_database/tracking/inat_validation_data_with_sources_and_scores-20230204.csv" # contains the ground truth result for each cropped image
json_file_path = '../../0_database/tracking/set2_maxim-kent.json' # contains the classification result for each cropped image
df_gt = pd.read_csv(csv_file_path)
df_classification = from_json_to_df(json_file_path)

In [22]:
df_classification.head(3)

Unnamed: 0,source_image_cropped,region,date,track_id,prediction,track_info
0,000001.jpg,Quebec,2022_07_31,2.0,"[[Choristoneura parallela, 29], [Argyrotaenia ...","[[20220731233507-00-91.jpg, 2432, 1485, 2566, ..."
1,000002.jpg,Quebec,2022_07_31,3.0,"[[Grammia virguncula, 33], [Martyringa latipen...","[[20220731233507-00-91.jpg, 3389, 1064, 3522, ..."
2,000003.jpg,Quebec,2022_07_31,4.0,"[[Idia denticulalis, 31], [Metalectra discalis...","[[20220731233507-00-91.jpg, 600, 287, 751, 449..."


# Extract ground truth data

In [23]:
def extract_ground_truth_label(df_gt,df_classification):
    df = df_classification.merge(df_gt[['source_image_cropped','taxon_rank','taxon_name']],how='left',on='source_image_cropped')
    df.rename(columns={'taxon_rank':'taxon_rank_ground_truth',
                       'taxon_name':'taxon_name_ground_truth'}, inplace = True)
    return df

In [24]:
df_with_ground_truth = extract_ground_truth_label(df_gt,df_classification)

In [25]:
df_with_ground_truth

Unnamed: 0,source_image_cropped,region,date,track_id,prediction,track_info,taxon_rank_ground_truth,taxon_name_ground_truth
0,000001.jpg,Quebec,2022_07_31,2.0,"[[Choristoneura parallela, 29], [Argyrotaenia ...","[[20220731233507-00-91.jpg, 2432, 1485, 2566, ...",genus,Pandemis
1,000002.jpg,Quebec,2022_07_31,3.0,"[[Grammia virguncula, 33], [Martyringa latipen...","[[20220731233507-00-91.jpg, 3389, 1064, 3522, ...",genus,Olethreutes
2,000003.jpg,Quebec,2022_07_31,4.0,"[[Idia denticulalis, 31], [Metalectra discalis...","[[20220731233507-00-91.jpg, 600, 287, 751, 449...",genus,Idia
3,000004.jpg,Quebec,2022_07_31,6.0,"[[Agonopterix robiniella, 23], [Crambidia pall...","[[20220731233507-00-91.jpg, 1689, 740, 1780, 8...",tribe,Sparganothini
4,000005.jpg,Quebec,2022_07_31,7.0,"[[Archips cerasivorana, 25], [Rhynchagrotis cu...","[[20220731233507-00-91.jpg, 3482, 2052, 3649, ...",genus,Apotomis
...,...,...,...,...,...,...,...,...
995,000996.jpg,Quebec,2022_08_02,132.0,"[[Dioryctria disclusa, 21], [Acleris albicoman...","[[20220802024729-00-52.jpg, 3737, 1790, 3859, ...",genus,Acrobasis
996,000997.jpg,Quebec,2022_08_02,131.0,"[[Phaeoura quernaria, 38], [Thyris maculata, 1...","[[20220802024801-00-52.jpg, 1896, 978, 2237, 1...",species,Blepharomastix ranalis
997,000998.jpg,Quebec,2022_08_02,134.0,"[[Macaria granitata, 26], [Harrisimemna trisig...","[[20220802024638-00-52.jpg, 1295, 1285, 1543, ...",genus,Macaria
998,000999.jpg,Quebec,2022_08_02,137.0,"[[Eufidonia discospilata, 15], [Epelis truncat...","[[20220802024638-00-52.jpg, 430, 247, 537, 365...",tribe,Sparganothini


# Create df for evaluation and apply smoothing strategies

In [28]:
def smoothing_max_value(df):
    # ex: [['20220731233507-00-91.jpg', 2432, 1485, 2566, 1642, 'Choristoneura parallela', 29], ['20220731232026-00-79.jpg', 1650, 1277, 1768, 1448, 'Archips argyrospila', 39]]
    #display(df)
    max_confidence_score = df["confidence"].max()
    max_confidence_index = df["confidence"].idxmax()
    new_label = df.iloc[max_confidence_index, df.columns.get_loc("label")]
    return max_confidence_score, new_label

def smoothing_most_frequent(df):
    #display(df)
    if df.shape[0]>1:
        new_label = df.label.mode()[0]
        max_confidence_score = df[df['label'] == new_label]['confidence'].max()
        #print('here >',new_label)
        #print('here >',max_confidence_score)
    else:
        max_confidence_score, new_label = smoothing_max_value(df)
        #print('here >',new_label)
        #print('here >',max_confidence_score)
    return max_confidence_score, new_label

def smoothing_avg_max(df):
    return 1,'test'

def create_df_evaluation(df_raw):
    # test = df_evaluation.loc['000001.jpg']['track_info']
    # smoothing_max_value(test)
    df = df_raw[df_raw['taxon_rank_ground_truth']=='species']
    smoothing_labels_max = []
    scores_max = []
    smoothing_labels_most_frequent = []
    scores_most_frequent = []
    smoothing_labels_avg_max = []
    scores_avg_max = []
    for row in df['track_info']:
        df_row = pd.DataFrame(row, columns = ['img', 'x1', 'x2', 'y1', 'y2', 'label', 'confidence'])
        score_max, label_max = smoothing_max_value(df_row)
        scores_max.append(score_max)
        smoothing_labels_max.append(label_max)
        ##
        score_most_frequent, label_most_frequent = smoothing_most_frequent(df_row)
        scores_most_frequent.append(score_most_frequent)
        smoothing_labels_most_frequent.append(label_most_frequent)
        ##
        score_avg_max, label_avg_max = smoothing_avg_max(df_row)
        scores_avg_max.append(score_avg_max)
        smoothing_labels_avg_max.append(label_avg_max)
    df['smoothing_label_max'] = smoothing_labels_max
    df['score_max'] = scores_max
    df['smoothing_label_most_frequent'] = smoothing_labels_most_frequent
    df['score_most_frequent'] = scores_most_frequent
    df['smoothing_label_avg_max'] = smoothing_labels_avg_max
    df['score_avg_max'] = scores_avg_max
    return df

In [29]:
df_evaluation = create_df_evaluation(df_with_ground_truth)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['smoothing_label_max'] = smoothing_labels_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['score_max'] = scores_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['smoothing_label_most_frequent'] = smoothing_labels_most_frequent
A value is trying to be set on a copy of a slice from a Da

In [30]:
df_evaluation.head(3)

Unnamed: 0,source_image_cropped,region,date,track_id,prediction,track_info,taxon_rank_ground_truth,taxon_name_ground_truth,smoothing_label_max,score_max,smoothing_label_most_frequent,score_most_frequent,smoothing_label_avg_max,score_avg_max
15,000016.jpg,Quebec,2022_07_31,23.0,"[[Catocala lineella, 39], [Eacles imperialis, ...","[[20220731232026-00-79.jpg, 2527, 776, 2838, 1...",species,Lymantria dispar,Drepana arcuata,99,Catocala lineella,39,test,1
19,000020.jpg,Quebec,2022_07_31,28.0,"[[Syngrapha rectangula, 25], [Neoerastria apic...","[[20220731235027-00-100.jpg, 135, 481, 292, 62...",species,Pseudeustrotia carneola,Syngrapha rectangula,25,Syngrapha rectangula,25,test,1
26,000027.jpg,Quebec,2022_07_31,38.0,"[[Horisme intestinata, 4], [Amphipyra pyramido...","[[20220731235615-00-102.jpg, 3987, 1968, 4094,...",species,Macaria pustularia,Endothenia hebesana,50,Endothenia hebesana,50,test,1


# Count for eval

In [31]:
def count_for_eval(df_evaluation):
    # I need to compare taxon_name_ground_truth and smoothing_label_max and smoothing_label_most_frequent
    evaluation_max = []
    evaluation_most_frequent = []
    for index, row in df_evaluation.iterrows():
        if row.taxon_name_ground_truth in row.smoothing_label_max:
            evaluation_max.append(1)
        else:
            evaluation_max.append(0)
        if row.taxon_name_ground_truth in row.smoothing_label_most_frequent:
            evaluation_most_frequent.append(1)
        else:
            evaluation_most_frequent.append(0)
    df_evaluation['evaluation_max'] = evaluation_max
    df_evaluation['evaluation_most_frequent'] = evaluation_most_frequent   
    return df_evaluation

In [32]:
df_evaluation_with_count = count_for_eval(df_evaluation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_evaluation['evaluation_max'] = evaluation_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_evaluation['evaluation_most_frequent'] = evaluation_most_frequent


In [33]:
df_evaluation_with_count

Unnamed: 0,source_image_cropped,region,date,track_id,prediction,track_info,taxon_rank_ground_truth,taxon_name_ground_truth,smoothing_label_max,score_max,smoothing_label_most_frequent,score_most_frequent,smoothing_label_avg_max,score_avg_max,evaluation_max,evaluation_most_frequent
15,000016.jpg,Quebec,2022_07_31,23.0,"[[Catocala lineella, 39], [Eacles imperialis, ...","[[20220731232026-00-79.jpg, 2527, 776, 2838, 1...",species,Lymantria dispar,Drepana arcuata,99,Catocala lineella,39,test,1,0,0
19,000020.jpg,Quebec,2022_07_31,28.0,"[[Syngrapha rectangula, 25], [Neoerastria apic...","[[20220731235027-00-100.jpg, 135, 481, 292, 62...",species,Pseudeustrotia carneola,Syngrapha rectangula,25,Syngrapha rectangula,25,test,1,0,0
26,000027.jpg,Quebec,2022_07_31,38.0,"[[Horisme intestinata, 4], [Amphipyra pyramido...","[[20220731235615-00-102.jpg, 3987, 1968, 4094,...",species,Macaria pustularia,Endothenia hebesana,50,Endothenia hebesana,50,test,1,0,0
45,000046.jpg,Quebec,2022_07_31,53.0,"[[Eacles imperialis, 33], [Catocala lineella, ...","[[20220731230344-00-65.jpg, 2533, 772, 2833, 1...",species,Lymantria dispar,Drepana arcuata,99,Eacles imperialis,64,test,1,0,0
47,000048.jpg,Quebec,2022_07_31,61.0,"[[Archips argyrospila, 29], [Polygrammate hebr...","[[20220731232932-00-85.jpg, 1492, 1684, 1566, ...",species,Platynota exasperatana,Aglossa cuprina,46,Aglossa cuprina,46,test,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,000988.jpg,Quebec,2022_08_02,115.0,"[[Condica vecors, 16], [Idia aemula, 8], [Disc...","[[20220802024937-00-52.jpg, 2122, 418, 2331, 5...",species,Idia aemula,Rheumaptera prunivorata,43,Agriphila vulgivagellus,25,test,1,0,0
991,000992.jpg,Quebec,2022_08_02,116.0,"[[Crambidia pallida, 8], [Cabera variolaria, 4...","[[20220802024757-00-52.jpg, 1662, 1450, 1861, ...",species,Anticlea multiferata,Macaria granitata,86,Crambidia pallida,8,test,1,0,0
992,000993.jpg,Quebec,2022_08_02,125.0,"[[Adoxophyes negundana, 30], [Argyrotaenia que...","[[20220802024650-00-52.jpg, 451, 602, 571, 757...",species,Pandemis lamprosana,Adoxophyes negundana,30,Adoxophyes negundana,30,test,1,0,0
996,000997.jpg,Quebec,2022_08_02,131.0,"[[Phaeoura quernaria, 38], [Thyris maculata, 1...","[[20220802024801-00-52.jpg, 1896, 978, 2237, 1...",species,Blepharomastix ranalis,Phaeoura quernaria,38,Eumacaria madopata,28,test,1,0,0


In [34]:
df_evaluation_with_count.drop(columns=['score_avg_max', 'smoothing_label_avg_max']).to_csv("evaluation_species.csv")

In [37]:
print(df_evaluation_with_count.evaluation_max.sum())
print(df_evaluation_with_count.evaluation_most_frequent.sum())
print(len(df_evaluation_with_count))

77
54
344
