In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as tqdm
import shutil

In [2]:
# Root directory; every other path in this cell is joined onto it.
data_root = "/raid/elhamod/Fish"

# Dataset whose metadata is being cleaned (relative to data_root).
destination = "Curated3/Easy"

# Collections whose cleaned metadata is searched for the destination's images
# (each relative to data_root).
sources = [
    "cropped2/INHS",
    "cropped2/FMNH",
    "cropped2/JFBM",
    "cropped2/OSUM",
    "cropped2/UWZM",
]

# Names of the tab-separated metadata tables: the uncleaned one read from the
# destination, and the cleaned one read from each source and written back to
# the destination.
fine_csv_fileName = "metadata.csv"
cleaned_fine_csv_fileName = "cleaned_metadata.csv"

In [3]:
# Column names of the metadata tables.
fine_csv_fileName_header = "fileName"              # image file name (used as the index)
fine_csv_scientificName_header = "scientificName"  # fine-grained label
fine_csv_Coarse_header = "Genus"                   # coarse-grained label
fine_csv_Family_header = "Family"

# The only columns loaded when a metadata table is read.
fine_csv_usedColumns = [
    fine_csv_fileName_header,
    fine_csv_scientificName_header,
    fine_csv_Coarse_header,
    fine_csv_Family_header,
]

In [4]:
# Build the destination's cleaned metadata table.
#
# Every image listed in the destination's uncleaned metadata is looked up, by
# case-insensitive file name, in the cleaned metadata of each source. Matching
# source rows (source file name + source labels) are collected and written to
# the destination as its cleaned metadata. Destination images that no source
# matched are written to "notfound.csv".

# Label columns carried over into the output tables, in output order.
label_columns = [fine_csv_scientificName_header,
                 fine_csv_Coarse_header,
                 fine_csv_Family_header]

# Get destination dataframe uncleaned metadata
destination_metadata_fileName_full_path = os.path.join(data_root, destination, fine_csv_fileName)
destination_metadata = pd.read_csv(destination_metadata_fileName_full_path,
                                   delimiter='\t',
                                   index_col=fine_csv_fileName_header,
                                   usecols=fine_csv_usedColumns)

# File names are compared case-insensitively. The lower-cased keys live in
# their own Index (computed once) instead of a helper column, so the metadata
# frames themselves are never modified.
destination_lower = destination_metadata.index.str.lower()

# For each source, keep the rows that belong to the destination.
matched_per_source = []
for source in sources:
    # get metadata
    source_metadata_fileName_full_path = os.path.join(data_root, source, cleaned_fine_csv_fileName)
    source_metadata = pd.read_csv(source_metadata_fileName_full_path,
                                  delimiter='\t',
                                  index_col=fine_csv_fileName_header,
                                  usecols=fine_csv_usedColumns)

    # get only the intersection: source rows whose file name also appears in
    # the destination. File name and labels come from the source table.
    in_destination = source_metadata.index.str.lower().isin(destination_lower)
    destination_cleaned_metadata = source_metadata.loc[in_destination, label_columns]
    matched_per_source.append(destination_cleaned_metadata)

# Concatenate once, after the loop, rather than growing a frame per iteration.
destination_cleaned_metadata_final = pd.concat(matched_per_source)

# A file name may have been collected more than once; keep its first occurrence.
destination_cleaned_metadata_final = destination_cleaned_metadata_final.loc[
    ~destination_cleaned_metadata_final.index.duplicated(keep='first')]

# save final metadata
destination_cleaned_metadata_final.to_csv(os.path.join(data_root, destination, cleaned_fine_csv_fileName), sep='\t')


# Report the images that were not found: destination rows whose lower-cased
# file name matches nothing in the final table. Rows stay in the order they
# have in the destination metadata and keep the destination's labels; the
# report is indexed by the lower-cased file name.
found_lower = destination_cleaned_metadata_final.index.str.lower()
not_found = ~destination_lower.isin(found_lower)
df3 = destination_metadata.loc[not_found, label_columns].copy()
df3.index = pd.Index(destination_lower[not_found], name=fine_csv_fileName_header)
print('missing', df3)
df3.to_csv(os.path.join(data_root, destination, "notfound.csv"), sep='\t')
    

missing                            scientificName     Genus         Family
fileName                                                          
inhs_fish_26586.jpg     Lepomis cyanellus   Lepomis  Centrarchidae
inhs_fish_85748.jpg      Lepomis gibbosus   Lepomis  Centrarchidae
inhs_fish_50859.jpg       Cyprinus carpio  Cyprinus     Cyprinidae
inhs_fish_62239.jpg  Notropis percobromus  Notropis     Cyprinidae
inhs_fish_22926.jpg       Morone chrysops    Morone      Moronidae
