# QC taxa

In [1]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

import scripts.normalize_taxa as nt


In [2]:
# Folders of cleaned LIMS micropal CSV files (four numbered batches under
# CLEAN_DATA_DIR). Not referenced by the cells visible in this notebook.
clean_data_paths = [
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_1', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_2', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_3', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_4', 
]

# Path to Micropal_changes.csv in the metadata output folder.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 


# `date` is rebound before each group of paths below. Every f-string captures
# the value assigned immediately above it, so the statement order matters.
date = '2022-04-28'
# LIMS taxa list and taxa crosswalk CSVs for the date above.
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"

date = '2021-11-29'
noaa_taxa_crosswalk_file = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f"taxa_crosswalk_{date}.csv"
# NOTE(review): this NOAA taxa list path points into the 'LIMS' folder, while
# the NOAA crosswalk on the previous line lives under 'draft'/'NOAA' --
# confirm that this is intentional and not a copy/paste slip.
noaa_taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"

date = '2022-04-28'
# PI-processed input files read from RAW_DATA_DIR.
input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'
input_4_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS Micropal_CSV_4_taxa_{date}.csv'

# The date is hard-coded in this file name rather than taken from `date`.
pi_file = OUTPUT_DIR/'taxa'/'LIMS'/'PI_normalized_taxa_list_with_pbdb_2022-04-28.csv'

# The f prefix has no effect here: the string contains no placeholders.
input_noaa_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_2021-11-29.csv'

add_taxa_file = RAW_DATA_DIR/'PI_processed_files'/'LIMS_Micropal_CSV_4_taxa_ADDTL_TAXA.csv'

In [3]:
def log_df(df, row_count=5):
    """Print the shape of `df` and return its leading rows.

    Args:
        df: DataFrame to summarise.
        row_count: number of leading rows to return (default 5).

    Returns:
        ``df.head(row_count)``. When the call is the last expression of a
        cell, the notebook renders it as a preview table.
    """
    rows_and_columns = df.shape
    print(rows_and_columns)
    preview = df.head(row_count)
    return preview


# compare taxa list and PI taxa list

Check whether the taxa that were approved by the researchers in the Google Sheet match the taxa in the LIMS taxa_list.csv.

In [4]:
normalized_df = pd.read_csv(input_file, skiprows = 9)
normalized_df.shape

(4754, 32)

In [5]:
normalized_4_df = pd.read_csv(input_4_file)
normalized_4_df.shape

(681, 33)

In [6]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)
taxa_df.shape

(5281, 18)

In [7]:
normalized_names = set(normalized_df['verbatim_name'])
normalized_4_names = set(normalized_4_df['verbatim_name'])
all_normalized_names = normalized_names.union(normalized_4_names)

taxa_names = set(taxa_df['verbatim_name'])

In [8]:
len(all_normalized_names - taxa_names)

30

In [9]:
all_normalized_names - taxa_names

{'ADDITIONAL SPECIES',
 'Bathymetry',
 'Dextral:Sinistral _N. acostaensis_',
 'Dextral:Sinistral _P. finalis_',
 'Dextral:Sinistral _P. obliquiloculata_',
 'Dextral:Sinistral _P. praecursor_',
 'Dextral:Sinistral _P. praespectabilis_',
 'Dextral:Sinistral _P. primalis_',
 'Dextral:Sinistral _P. spectabilis_',
 'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
 'Diatom Zone (Yanagisawa and Akiba, 1998)',
 'Exotic',
 'Gen. et sp. indet',
 'Marine',
 'Martini (1971) Zone',
 'Organic matter',
 'Planktic foraminiferal %',
 'Planktic foraminiferal (%)',
 'Preservation palynofacies',
 'Pyrite',
 'RESOLVED NAMES BY TAXONOMIC GROUP',
 'Radiolarian zone',
 'Radiolarian zone/subzone',
 'Silicoflagellate Zone in Ling (1992)',
 'Terrestrial organic matter',
 'Tintinids',
 'Zone in Ling (1992)',
 'fossil',
 'fossil_group',
 nan}

In [10]:
len(taxa_names - all_normalized_names)

34

In [12]:
taxa_names - all_normalized_names

{'Dextral N. acostaensis',
 'Dextral P. finalis',
 'Dextral P. obliquiloculata',
 'Dextral P. praecursor',
 'Dextral P. praespectabilis',
 'Dextral P. primalis',
 'Dextral P. spectabilis',
 'Sinistral N. acostaensis',
 'Sinistral P. finalis',
 'Sinistral P. obliquiloculata',
 'Sinistral P. praecursor',
 'Sinistral P. praespectabilis',
 'Sinistral P. primalis',
 'Sinistral P. spectabilis',
 'additional species: Big Lithomelissa sp. A (Antarctissa like)',
 'additional species: Cypassis irregularis',
 'additional species: Eucyrtidium teuscheri',
 'additional species: Gondwanaria dogieli',
 'additional species: Grobolotalia spp.',
 'additional species: Hantkenina australis',
 'additional species: Larcopyle buetschlii (R)',
 'additional species: Litharachnium tentorium',
 'additional species: Lithostrobus cuspidatus',
 'additional species: Lonchosphaera spicata',
 'additional species: N. incompta',
 'additional species: Neogloboquadrina pachyderma',
 'additional species: Parasubbotina varia

# check that a taxon with multiple taxon groups is imported

'Globigerinoides ruber (white)' has both benthic_forams; planktic_forams

In [16]:
# Look up every sample/taxon link for 'Globigerinoides ruber (white)' that was
# imported from a micropal CSV, then reduce the result to one row per
# (taxon name, taxon group) pair to see which taxon groups were imported.
# The query has no placeholders, so a plain string is used (no f prefix).
sql = """select 
taxa.name as taxon_name,
taxa.taxon_group,
 samples.data_source_notes
from samples
join samples_taxa on samples_taxa.sample_id = samples.id
join taxa on samples_taxa.taxon_id = taxa.id
where  samples.data_source_type = 'micropal csv'
and taxa.name = 'Globigerinoides ruber (white)';"""

rows = db.fetch_all(sql)
data = [
    {
        'verbatim_name': row['taxon_name'],
        'taxon_group': row['taxon_group'],
        'data_source_notes': row['data_source_notes'],
    }
    for row in rows
]

# Name the columns explicitly: if the query returns no rows, a bare
# pd.DataFrame([]) has no columns and drop_duplicates(subset=...) raises
# KeyError instead of showing an empty result.
result_columns = ['verbatim_name', 'taxon_group', 'data_source_notes']
db_df = pd.DataFrame(data, columns=result_columns)

# Keep the first row seen for each (name, group) pair; the original row labels
# are preserved, so the displayed index shows where each group first appears.
db_df = db_df.drop_duplicates(subset=['verbatim_name', 'taxon_group'])
db_df

Unnamed: 0,verbatim_name,taxon_group,data_source_notes
0,Globigerinoides ruber (white),planktic_forams,LIMS/Micropal_CSV_2/350_U1437D_planktic_forams...
1128,Globigerinoides ruber (white),benthic_forams,LIMS/Micropal_CSV_2/356-U1463B_benthic_forams.csv


# check dextral/sinistral import

In [99]:
file = 'LIMS/Micropal_CSV_1/363-U1482A-planktic_forams.csv'

In [100]:

# Pull every code stored for the file under test, keyed by sample and by the
# verbatim (as-written) taxon name from the crosswalk table.
sql = f"""select samples.name as sample_name,  samples.eodp_id,
taxa_crosswalk.verbatim_name as verbatim_name,
taxa.name as taxon_name,
samples_taxa.code
from samples_taxa
join samples on samples.id = samples_taxa.sample_id
join taxa on samples_taxa.taxon_id = taxa.id

join taxa_crosswalk on taxa_crosswalk.id = samples_taxa.original_taxon_id

where samples_taxa.data_source_notes = '{file}';"""

rows = db.fetch_all(sql)

# One record per database row; the normalized taxon_name column is selected
# by the query but not carried into the frame.
data = [
    {
        'Sample': record['sample_name'],
        'verbatim_name': record['verbatim_name'],
        'code': record['code'],
        'eodp_id': record['eodp_id'],
    }
    for record in rows
]

db_df = pd.DataFrame(data)
log_df(db_df)

(478, 4)


Unnamed: 0,Sample,verbatim_name,code,eodp_id
0,363-U1482A-1H-CC-PAL-FORAM,Globigerinella calida _B_,R,581cf8d872bc892b3d7664f3c868745f
1,363-U1482A-1H-CC-PAL-FORAM,Pulleniatina finalis _B,P,581cf8d872bc892b3d7664f3c868745f
2,363-U1482A-1H-CC-PAL-FORAM,Globorotalia truncatulinoides _B_,R,581cf8d872bc892b3d7664f3c868745f
3,363-U1482A-1H-CC-PAL-FORAM,Dextral P. obliquiloculata,49,581cf8d872bc892b3d7664f3c868745f
4,363-U1482A-1H-CC-PAL-FORAM,Sinistral P. obliquiloculata,1,581cf8d872bc892b3d7664f3c868745f


In [101]:
# Reshape the long (sample, taxon, code) records into the wide layout of the
# source CSV: one row per sample, one column per verbatim taxon name.
sample_keys = ['Sample', 'eodp_id']
db_pivot = (
    db_df
    .pivot(index=sample_keys, columns='verbatim_name', values='code')
    .reset_index()
    .sort_values(sample_keys)
)

log_df(db_pivot)

(71, 73)


verbatim_name,Sample,eodp_id,"""Globigerina"" angulisuturalis _T_",Candeina nitida,Candeina praenitida,Cassigerinella chipolensis _T,"Dentoglobigerina ""conglomerata""",Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina baroemoenensis,Dentoglobigerina binaensis _T,...,Sinistral P. praespectabilis,Sinistral P. primalis,Sphaeroidinella dehiscens sensu lato _B_,Sphaeroidinella excavata,Sphaeroidinellopsis kochi _T,Sphaeroidinellopsis paenedehiscens,Sphaeroidinellopsis seminulina _T_ _PL4_,Trilobatus bisphericus,Trilobatus sacculifer,Trilobatus trilobus _B_
0,363-U1482A-10H-CC-PAL-FORAM,f7fa5407f5c0964abf1fd3da02fbd441,,,,,,,,,...,,,,,,,,,,
1,363-U1482A-11H-CC-PAL-FORAM,d3af6afade68362574996c493140a750,,,,,,,,,...,,,,,,,,,,
2,363-U1482A-12H-CC-PAL-FORAM,70b431e20011c6451b591d4b525b6a6d,,,,,,,,,...,,,,,,,,,,
3,363-U1482A-13H-CC-PAL-FORAM,eea9bab5737b68b7b5029aa30781645d,,,,,,,,,...,,,,,,,,,,
4,363-U1482A-14H-CC-PAL-FORAM,ee887c246f7318a6f17a0d83d927004d,,,,,,,,,...,,,,,,,,,,


In [102]:
file_df = pd.read_csv(CLEAN_DATA_DIR/file, dtype=str)
log_df(file_df, 2)

(71, 251)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name (short),Zone name,Additional zone name (short),Additional zone name,Preservation,...,Dextral P. obliquiloculata,Sinistral P. obliquiloculata,Dextral P. primalis,Sinistral P. primalis,Dextral P. praecursor,Sinistral P. praecursor,Dextral P. spectabilis,Sinistral P. spectabilis,Dextral P. finalis,Sinistral P. finalis
0,363-U1482A-1H-CC-PAL-FORAM,0,5,2.93,2.98,PT1b,PT1b - Globorotalia truncatulinoides Partial-r...,,,E [P46],...,49,1,,,,,,,,
1,363-U1482A-2H-CC-PAL-FORAM,0,5,12.7,12.75,PT1b,PT1b - Globorotalia truncatulinoides Partial-r...,,,E [P46],...,15,0,,,,,,,,


In [103]:
# Build the file-side frame that should equal db_pivot.
#
# Taxon columns are every pivot column except the two sample identifiers.
# 'eodp_id' has to be excluded together with 'Sample': it is an identifier that
# is filled in for the rows of this file, so leaving it in the subset meant the
# how='all' row filter below could never drop a sample without taxon codes.
id_cols = ['Sample', 'eodp_id']
taxa_cols = [col for col in db_pivot.columns if col not in id_cols]

filter_df = file_df[db_pivot.columns].copy()
# Drop samples with no taxon code at all; the database query only returns
# samples that have at least one samples_taxa row.
filter_df.dropna(axis=0, how='all', inplace=True, subset=taxa_cols)
# Drop columns that are empty for every remaining sample.
filter_df.dropna(axis=1, how='all', inplace=True)

# Same ordering as db_pivot so the two frames can be compared row by row.
filter_df.sort_values(id_cols, inplace=True)

log_df(filter_df, 2)

(71, 73)


Unnamed: 0,Sample,eodp_id,"""Globigerina"" angulisuturalis _T_",Candeina nitida,Candeina praenitida,Cassigerinella chipolensis _T,"Dentoglobigerina ""conglomerata""",Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina baroemoenensis,Dentoglobigerina binaensis _T,...,Sinistral P. praespectabilis,Sinistral P. primalis,Sphaeroidinella dehiscens sensu lato _B_,Sphaeroidinella excavata,Sphaeroidinellopsis kochi _T,Sphaeroidinellopsis paenedehiscens,Sphaeroidinellopsis seminulina _T_ _PL4_,Trilobatus bisphericus,Trilobatus sacculifer,Trilobatus trilobus _B_
9,363-U1482A-10H-CC-PAL-FORAM,f7fa5407f5c0964abf1fd3da02fbd441,,,,,,,,,...,,,,,,,,,,
10,363-U1482A-11H-CC-PAL-FORAM,d3af6afade68362574996c493140a750,,,,,,,,,...,,,,,,,,,,


In [104]:
db_pivot.to_csv(OUTPUT_DIR/'tmp'/'db_pivot.csv', index=False)
filter_df.to_csv(OUTPUT_DIR/'tmp'/'filter_df.csv', index=False)

In [105]:
dfa = pd.read_csv(OUTPUT_DIR/'tmp'/'db_pivot.csv', dtype=str)
dfb = pd.read_csv(OUTPUT_DIR/'tmp'/'filter_df.csv', dtype=str)

dfa.equals(dfb)

True

# check that files with multiple taxon groups are imported

'Globigerinoides ruber (white)' has both benthic_forams; planktic_forams

The file checked below contains both benthic_forams and planktic_forams taxa.

In [33]:
file = 'LIMS/Micropal_CSV_2/356-U1463B_benthic_forams.csv'

In [34]:

# Pull every code stored for the file under test, keyed by sample and by the
# verbatim taxon name taken from the crosswalk table.
sql = f"""select samples.name as sample_name, 
taxa_crosswalk.verbatim_name as taxon_name,
 samples_taxa.code, samples_taxa.data_source_notes,
taxa.taxon_group
from samples_taxa
join samples on samples.id = samples_taxa.sample_id
join taxa_crosswalk on taxa_crosswalk.id = samples_taxa.original_taxon_id
join taxa on samples_taxa.taxon_id = taxa.id

where samples_taxa.data_source_notes = '{file}';"""

rows = db.fetch_all(sql)

# Only sample, verbatim name and code are kept; data_source_notes and
# taxon_group are selected by the query but not used here.
data = [
    {
        'Sample': record['sample_name'],
        'verbatim_name': record['taxon_name'],
        'code': record['code'],
    }
    for record in rows
]

db_df = pd.DataFrame(data)
db_df.head()

Unnamed: 0,Sample,verbatim_name,code
0,356-U1463B-1H-CC-PAL-FORAM-150-2000,Globigerinoides ruber s.s.,F
1,356-U1463B-1H-CC-PAL-FORAM-150-2000,Globigerinella calida,P
2,356-U1463B-1H-CC-PAL-FORAM-150-2000,Neogloboquadrina dutertrei,R
3,356-U1463B-1H-CC-PAL-FORAM-150-2000,Globorotalia tumida,R
4,356-U1463B-1H-CC-PAL-FORAM-150-2000,Globigerinoides subquadratus,P


In [35]:
# Reshape the long (sample, taxon, code) records into the wide layout of the
# source CSV: one row per sample, one column per verbatim taxon name.
db_pivot = (
    db_df
    .pivot(index='Sample', columns='verbatim_name', values='code')
    .reset_index()
    .sort_values('Sample')
)

log_df(db_pivot)

(24, 48)


verbatim_name,Sample,Candeina nitida,Dentoglobigerina altispira,Dentoglobigerina baroemoenensis,Globigerina bulloides,Globigerina rubescens,Globigerinella calida,Globigerinella siphonifera,Globigerinita glutinata,Globigerinoides bulloideus,...,Neogloboquadrina acostaensis,Neogloboquadrina dutertrei,Neogloboquadrina humerosa,Orbulina bilobata,Orbulina universa,Pulleniatina obliquiloculata,Pulleniatina primalis,Sphaeroidinella dehiscens,Sphaeroidinellopsis kochi,Sphaeroidinellopsis seminulina
0,356-U1463B-13H-CC-PAL-FORAM-150 2000,,,,,,,P,,,...,,P,,,P,F,,,,
1,356-U1463B-15H-CC-PAL-FORAM-150-2000,,,,P,,,P,,,...,,P,,,P,P,,,,
2,356-U1463B-17H-CC-PAL-FORAM-150 2000,,,,P,,,,P,,...,,P,,,,P,,,,
3,356-U1463B-19H-6-PAL-FORAM-150-2000,,,,P,,P,P,,,...,,A,,,P,P,,,,
4,356-U1463B-1H-CC-PAL-FORAM-150-2000,P,,,,R,P,R,,,...,,R,,,R,F,,P,,


In [36]:
file_df = pd.read_csv(CLEAN_DATA_DIR/file)
log_df(file_df, 2)

(32, 174)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Datum name,Datum name generic,Datum comment,Datum age average [Ma],Zone name (short),...,Shore File Links,File Data,Exp,Site,Hole,Core,Type,Section,A/W,eodp_id
0,356-U1463B-1H-CC-PAL-FORAM-150-2000,0,10,7.78,7.88,,,,,,...,,,356,U1463,B,1,H,CC,PAL,8adfb3c6f33088d72a886ed17287a589
1,356-U1463B-3H-CC-PAL-FORAM-150-2000,0,10,27.1,27.2,,,,,,...,,,356,U1463,B,3,H,CC,PAL,93cc7d07dca008920dac6892a7489d30


In [37]:
# Taxon columns are every pivot column except the sample name.
taxa_cols = set(db_pivot.columns).difference({'Sample'})

# File-side frame restricted to the pivot's columns, without samples that have
# no taxon code, without columns that are empty for every remaining sample,
# and ordered by sample name like db_pivot.
filter_df = (
    file_df[db_pivot.columns]
    .dropna(axis=0, how='all', subset=taxa_cols)
    .dropna(axis=1, how='all')
    .sort_values('Sample')
)

log_df(filter_df, 2)

(24, 48)


Unnamed: 0,Sample,Candeina nitida,Dentoglobigerina altispira,Dentoglobigerina baroemoenensis,Globigerina bulloides,Globigerina rubescens,Globigerinella calida,Globigerinella siphonifera,Globigerinita glutinata,Globigerinoides bulloideus,...,Neogloboquadrina acostaensis,Neogloboquadrina dutertrei,Neogloboquadrina humerosa,Orbulina bilobata,Orbulina universa,Pulleniatina obliquiloculata,Pulleniatina primalis,Sphaeroidinella dehiscens,Sphaeroidinellopsis kochi,Sphaeroidinellopsis seminulina
5,356-U1463B-13H-CC-PAL-FORAM-150 2000,,,,,,,P,,,...,,P,,,P,F,,,,
6,356-U1463B-15H-CC-PAL-FORAM-150-2000,,,,P,,,P,,,...,,P,,,P,P,,,,


In [38]:
db_pivot.to_csv(OUTPUT_DIR/'tmp'/'db_pivot.csv', index=False)
filter_df.to_csv(OUTPUT_DIR/'tmp'/'filter_df.csv', index=False)

In [39]:
dfa = pd.read_csv(OUTPUT_DIR/'tmp'/'db_pivot.csv', dtype=str)
dfb = pd.read_csv(OUTPUT_DIR/'tmp'/'filter_df.csv', dtype=str)

dfa.equals(dfb)

True

# check taxa files have unique values

In [40]:
df = pd.read_csv(taxa_list_file, usecols=['normalized_name', 'taxon_group'])
df.shape
# 4675

(4675, 2)

In [41]:
df[df.duplicated(subset=['normalized_name', 'taxon_group'])]

Unnamed: 0,normalized_name,taxon_group


In [42]:
sql = """
SELECT count(*) FROM taxa

"""
db.fetch_one(sql)

[4675]

In [43]:
df2 = pd.read_csv(taxa_crosswalk_list_file, usecols=['normalized_name', 'taxon_group', 'verbatim_name', 'eodp_id' ])
df2.shape
# 5281

(5281, 4)

In [44]:
df2[df2.duplicated(subset=['normalized_name', 'taxon_group', 'verbatim_name'])]

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id


In [45]:
sql = """
SELECT * FROM taxa_crosswalk 
JOIN taxa on taxa.id = taxa_crosswalk.taxon_id 

"""
rows = db.fetch_all(sql)
data = []
for row in rows:
    data.append({
        'normalized_name': row['name'], 
        'taxon_group': row['taxon_group'],
        'verbatim_name': row['verbatim_name'],
        'eodp_id': row['eodp_id']
        
    })


len(data)

5281

In [46]:
db_df = pd.DataFrame(data)
db_df.head()

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id
0,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,0
1,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,1
2,Foraminifera indet.,benthic_forams,Others,2
3,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,3
4,Ostracoda indet.,benthic_forams,Ostracoda spp.,4


In [47]:
set(df2['eodp_id']) - set (db_df['eodp_id'])

set()

In [48]:
set(db_df['eodp_id']) - set (df2['eodp_id'])

set()

# check additional species are imported

In [49]:
path = CLEAN_DATA_DIR/'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv'
df = pd.read_csv(path, dtype=str)
df = df.dropna(axis=1, how="all")

log_df(df)

(31, 77)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name,Preservation,Group Abundance,Antarctissa cylindrica,Cycladophora pliocenica,...,additional species: Gondwanaria dogieli,additional species: Tetraplecta pinigera/Euscenium corynephorum,additional species: Litharachnium tentorium,additional species: Streblacantha circumtexta,additional species: Eucyrtidium teuscheri,additional species: Verticillata hexacantha,additional species: Cypassis irregularis,additional species: Prunopyle antarctica,additional species: Big Lithomelissa sp. A (Antarctissa like),additional species: Larcopyle buetschlii (R)
0,374-U1525A-1H-1-IW_MUDLINE,0,0,0.0,0.0,,G,A,,,...,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,,
1,374-U1525A-1H-CC-PAL-RADS,0,5,8.57,8.62,,P,B,,,...,,,,,,,,,,
2,374-U1525A-2H-CC-PAL-RADS,0,5,18.72,18.77,,P,Tr,,,...,,,,,,,,,,
3,374-U1525A-3H-CC-PAL-RADS,0,5,28.16,28.21,,P,Tr,,,...,,,,,,,,,,
4,374-U1525A-4H-CC-PAL-RADS,0,5,28.86,28.91,> 0.65 (LAD A. cylindrica),M,Tr,X,,...,,,,,,,,,,


In [50]:
df.columns

Index(['Sample', 'Top [cm]', 'Bottom [cm]', 'Top Depth [m]',
       'Bottom Depth [m]', 'Zone name', 'Preservation', 'Group Abundance',
       'Antarctissa cylindrica', 'Cycladophora pliocenica',
       'Triceraspyris antarctica', 'Eucyrtidium calvertense',
       'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
       'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
       'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
       'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
       'Antarctissa denticulata', 'Antarctissa strelkovi',
       'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
       'Cycladophora bicornis', 'Druppatractus hastatus',
       'Enneaphormis rotula', 'Eucyrtidium inflatum',
       'Hexacontium pachydermum', 'Larcopyle pylomaticus',
       'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
       'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    

In [58]:
cols = [
    'Antarctissa cylindrica', 'Cycladophora pliocenica',
    'Triceraspyris antarctica', 'Eucyrtidium calvertense',
    'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
    'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
    'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
    'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
    'Antarctissa denticulata', 'Antarctissa strelkovi',
    'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
    'Cycladophora bicornis', 'Druppatractus hastatus',
    'Enneaphormis rotula', 'Eucyrtidium inflatum',
    'Hexacontium pachydermum', 'Larcopyle pylomaticus',
    'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
    'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    'Phormacantha hystrix/Plectacantha oikiskos group',
    'Phormostichoartus corbula', 'Pseudodictyophimus gracilipes',
    'Saccospyris antarctica', 'Saccospyris conithorax',
    'Saccospyris praeantarctica', 'Spongopyle osculosa',
    'Spongotrochus glacialis', 'Spongotrochus sp. A Abelmann',
    'Sphaeropyle robusta', 'Prunopyle tetrapila', 'Stylatractus neptunus',
    'Stylochlamidium  venustum', 'Stylodictya spp.', 'Trisulcus nana',
    # 'ADDITIONAL SPECIES', 
]

add_cols = [
    'additional species: Lonchosphaera spicata',
    'additional species: Poulpus spp.-like',
    'additional species: Lithostrobus cuspidatus',
    'additional species: Gondwanaria dogieli',
    'additional species: Tetraplecta pinigera/Euscenium corynephorum',
    'additional species: Litharachnium tentorium',
    'additional species: Streblacantha circumtexta',
    'additional species: Eucyrtidium teuscheri',
    'additional species: Verticillata hexacantha',
    'additional species: Cypassis irregularis',
    'additional species: Prunopyle antarctica',
    'additional species: Big Lithomelissa sp. A (Antarctissa like)',
    'additional species: Larcopyle buetschlii (R)'
]

In [59]:
# One record for every non-empty cell in the regular taxon columns; the count
# is compared with the number of rows imported into the database.
data = [
    {'Sample': sample_row['Sample'], 'code': sample_row[taxon], 'taxon': taxon}
    for _, sample_row in df.iterrows()
    for taxon in cols
    if pd.notna(sample_row[taxon])
]

len(data)

114

In [60]:
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code != 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[114]

In [62]:
# One record for every non-empty cell in the 'additional species: ...'
# columns; the count is compared with the database rows whose code is
# 'Indeterminate due to data source'.
data = [
    {'Sample': sample_row['Sample'], 'code': sample_row[taxon], 'taxon': taxon}
    for _, sample_row in df.iterrows()
    for taxon in add_cols
    if pd.notna(sample_row[taxon])
]

len(data)

17

In [63]:
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code = 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[17]

# check non taxa and taxa headers

In [64]:
all_columns_file = OUTPUT_DIR/'tmp/all_LIMS_taxa_columns_2020-02-23.csv'
df = pd.read_csv(all_columns_file, dtype=str)

log_df(df)
# 60254

(60254, 3)


Unnamed: 0.1,Unnamed: 0,path,column
0,0,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Sample
1,1,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Top [cm]
2,2,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Bottom [cm]
3,3,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Top Depth [m]
4,4,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Bottom Depth [m]


In [65]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)
log_df(taxa_df)
# 5281

(5281, 18)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments,eodp_id
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group,0
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group,1
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera,2
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,,3
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",,4


In [66]:
noaa_taxa_df = pd.read_csv(input_noaa_file)
log_df(noaa_taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Corrections to pbdb_taxon_rank
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762.0,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774.0,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774.0,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788.0,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788.0,Alabamina,genus,


In [67]:
add_taxa_df = pd.read_csv(add_taxa_file)
log_df(add_taxa_df)

(30, 36)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lonchosphaera spicata,,,,,,...,,,,,,,,,,
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,,
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lithostrobus cuspidatus,,,,,,...,,,,,,,,,,
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Gondwanaria dogieli,,,,,,...,,,,,,,,,,
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Tetraplecta pinigera/Euscenium corynephorum,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,,


In [68]:
cols = ['normalized_name', 'verbatim_name', 'non-taxa descriptor']
dex_sin_df = pd.read_csv(taxa_crosswalk_list_file, dtype=str, usecols=cols)

dex_sin_df = dex_sin_df[dex_sin_df['non-taxa descriptor'].isin(['sinistral', 'dextral'])]
log_df(dex_sin_df)

(80, 3)


Unnamed: 0,non-taxa descriptor,normalized_name,verbatim_name
3092,dextral,Neogloboquadrina acostaensis (dextral),Dextral N. acostaensis
3093,sinistral,Neogloboquadrina acostaensis (sinistral),Sinistral N. acostaensis
3094,dextral,Pulleniatina finalis (dextral),Dextral P. finalis
3095,sinistral,Pulleniatina finalis (sinistral),Sinistral P. finalis
3096,dextral,Pulleniatina obliquiloculata (dextral),Dextral P. obliquiloculata


In [69]:
dex_sin_taxa = set(dex_sin_df['normalized_name'])
dex_sin_taxa

{'Globoconella miotumida (dextral)',
 'Globoconella miotumida (sinistral)',
 'Globorotalia hirsuta (dextral)',
 'Globorotalia hirsuta (sinistral)',
 'Globorotalia menardii (dextral)',
 'Globorotalia menardii (sinistral)',
 'Globorotalia s.l. crassaformis (dextral)',
 'Globorotalia s.l. crassaformis (sinistral)',
 'Globorotalia scitula (dextral)',
 'Globorotalia scitula (sinistral)',
 'Globorotalia truncatulinoides (dextral)',
 'Globorotalia truncatulinoides (sinistral)',
 'Hirsutella hirsuta (dextral)',
 'Neogloboquadrina acostaensis (dextral)',
 'Neogloboquadrina acostaensis (sinistral)',
 'Neogloboquadrina atlantica (dextral)',
 'Neogloboquadrina atlantica (sinistral)',
 'Neogloboquadrina dutertrei (dextral)',
 'Neogloboquadrina dutertrei (sinistral)',
 'Neogloboquadrina incompta (dextral)',
 'Neogloboquadrina incompta (sinistral)',
 'Neogloboquadrina pachyderma (dextral)',
 'Neogloboquadrina pachyderma (sinistral)',
 'Paragloborotalia mayeri (dextral)',
 'Paragloborotalia mayeri (si

In [70]:
tmp = set(df['column']) - set(taxa_df['verbatim_name']) 
# tmp

In [71]:
nontaxa = {
'342-U1408A-2H-2-W 100/102-FORAM',
'A/W',
'ADDITIONAL SPECIES',
'Abundance',
'Abundance (%)',
'Abundances',
'Additional zone name',
'Additional zone name (short)',
'Age',
'Age:',
'Aspect comment (etching)',
'BF Group abundance',
'BF Preservation',
'BF comment',
'BF preservation',
'Bathymetry',
'Biozone name',
'Biozone name (short)',
'Bottom Depth [m]',
'Bottom Offset (cm) on Parent Sample',
'Bottom [cm]',
'COMMENTS',
'Comment',
'Comment (general)',
'Comments',
'Core',
'Core Type - Section',
'Core,    section',
'Core, Section',
'Core, Section, Interval',
'Core, Type, Section',
'Core, section',
'Core, section, interval',
'Core, section, interval (cm)',
'Core-Sect',
'Datum age average (Ma)',
'Datum age average [Ma]',
'Datum age maximum [Ma]',
'Datum age minimum [Ma]',
'Datum author year',
'Datum comment',
'Datum group',
'Datum group code',
'Datum name',
'Datum name generic',
'Datum region',
'Datum type',
'Depth (cm)',
'Depth Method',
'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
'Diatom Zone (Yanagisawa and Akiba, 1998)',
'Exotic',
'Exp',
'Expedition, site, hole, core, section, interval (cm):',
'Extra Sample ID Data',
'File Data',
'Fragmentation',
'Fragmentation rank [auto-pop]',
'General comment',
'Genus/species (upper zone)',
'Genus/species lower zone)',
'Group Abundance',
'Group abundance',
'Group abundance (%)',
'Group preservation',
'Half',
'Hole',
'Hole, Core, Section',
'IRD',
'Labl ID',
'Lower boundary age av. [Ma]',
'Martini (1971) Zone',
'Miscellaneous',
'Mixing',
'No. specimens/tray',
'Oberservations',
'Observations',
'Original Bottom Depth (m)',
'Original Top Depth (m)',
'Other observations',
'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
'PF Group Abundance',
'PF Preservation',
'PF Zone',
'PF group abundance',
'PF preservation',
'Piece',
'Preservation',
'Presevation',
'Pyrite',
'REMARKS',
'Remarks',
'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
'Sample',
'Sample comment',
'Sample preparation comment',
'Section',
'Section Half',
'Secton Half',
'Ship File Links',
'Shore File Links',
'Silicoflagellate Zone in Ling (1992)',
'Site',
'Temperature Range',
'Top Depth [m]',
'Top Offset (cm) on Parent Sample',
'Top [cm]',
'Type',
'Type (lower zone)',
'Type (upper zone)',
'Unnamed: 13',
'Unnamed: 14',
'Unnamed: 179',
'Unnamed: 21',
'Unnamed: 81',
'Upper boundary age av. [Ma]',
'Upper boundary age min [Ma]',
'XBroken',
'XCorroded',
'XCrumpled',
'Zone',
'Zone author (year)',
'Zone comment',
'Zone group',
'Zone in Ling (1992)',
'Zone name',
'Zone name (short)',
'Zone name [short]',
'Zone/Subzone',
'comments',
'constituent',
'core, section',
'count',
'count_type',
'dupes and comments',
'eodp_id',
'fossil',
'fossil_group',
'interval (cm)',
'mean depth (mbsf)',
'pc_abundance_name_mode',
'pc_fossil_group',
'pc_fossil_name',
'physical_constituent_name',
'preservation'
}

misc = {
'% Planktic Foraminifera within whole sample',
'Benthic abundance',
'Chrysophyte cyst group abundance',
'Diatom abundance',
'Diatom preservation - pyritization2',
'Diatom preservation dissolution',
'Diatom preservation fragmentation',
'Diatoms and siliceous plankton comment',
'Diatoms group abundance',
'Ebridian group abundance', 
'Foram abundance',
'Foraminferal preservation',
'Foraminiferal abundance',
'Foraminiferal preservation',
'Large Benthic Forams [%]',
'Marine',
'Nannofossil Zone',
'Nannofossil abundance',
'Nannofossil comment',
'Organic matter',
'Other fossil material',
'Other taxa',
'Percentage of benthic forams in total foram assemblage [%]',
'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
'Percentage of planktic forams in total foram assemblage [%]',
'Planktic foraminiferal %',
'Planktic foraminiferal (%)',
'Planktonic Benthic ratio (P:B)',
'Radiolarian zone',
'Radiolarian zone/subzone',
'Silicoflagellates group abundance',
'Sillicoflagellate abundance',
'Terrestrial organic matter',
'Total in situ dinocysts',
'Total pollen',
'Total radiolarians',   
}

In [72]:
tmp - nontaxa - misc - dex_sin_taxa

{'Actinocyclus ingens nodus',
 'Actinocyclus senarius',
 'Actinocyclus vulgaris',
 'Actinoptychus bipunctatus',
 'Argilloecia sp.',
 'Asteromphalus brookei',
 'Asteromphalus hyalinus',
 'Aulacoseira sp.',
 'Bitectatodinium tepikiense',
 'Bolivina albatrossi',
 'Braarudosphaera sp.',
 'Bradleya sp.',
 'Brigantedinium simplex',
 'Bulimina exilis',
 'Chiasmolithus sp.',
 'Cibicidoides parki',
 'Coccolithus streckeri',
 'Cocconeis placentula',
 'Cocconeis vitrea',
 'Crenalithus doronicoides',
 'Cricolithus jonesii',
 'Cyclotella sp.',
 'Cytheropteron sp.',
 'Detonula confervacea',
 'Dextral:Sinistral _N. acostaensis_',
 'Dextral:Sinistral _P. finalis_',
 'Dextral:Sinistral _P. obliquiloculata_',
 'Dextral:Sinistral _P. praecursor_',
 'Dextral:Sinistral _P. praespectabilis_',
 'Dextral:Sinistral _P. primalis_',
 'Dextral:Sinistral _P. spectabilis_',
 'Dictyocha subarctios',
 'Diploneis interrupta',
 'Discoaster spp. (six-rayed)',
 'Distephanus boliviensis',
 'Distephanus boliviensis bolivie

# check for taxa in multiple taxon groups

In [73]:
# https://stackoverflow.com/questions/38127209/how-to-use-groupby-to-concatenate-strings-in-python-pandas

def check_taxa_by_groups(path, name_col):
    """Report names that are listed under more than one taxon group.

    Reads the CSV at ``path``, hands the frame to
    ``nt.add_normalized_name_column``, and joins the distinct ``taxon_group``
    values of every ``name_col`` value with ``'; '``.

    Parameters
    ----------
    path : path of a CSV file that has a ``taxon_group`` column.
    name_col : column whose values are grouped, e.g. ``'verbatim_name'``.

    Returns
    -------
    pandas.DataFrame with the columns ``name_col`` and ``taxon_group``,
    limited to names whose joined group string contains ``'; '``.
    """
    source_df = pd.read_csv(path)
    # presumably adds a 'normalized_name' column in place -- confirm in scripts.normalize_taxa
    nt.add_normalized_name_column(source_df)

    # One row per (name, group) pair so a group is not repeated in the join.
    unique_pairs = source_df.drop_duplicates(subset=[name_col,  'taxon_group'])
    joined_groups = unique_pairs.groupby([name_col])['taxon_group'].apply('; '.join)

    # The separator only appears when at least two groups were joined.
    has_several_groups = joined_groups.str.contains('; ')
    return joined_groups[has_several_groups].to_frame().reset_index()


In [74]:
# Verbatim names in the PI file that are listed under more than one taxon group.
check_taxa_by_groups(pi_file, 'verbatim_name')
# 6 -- presumably the row count noted by the author.
# NOTE(review): the saved output of this cell lists 7 rows; re-run and update the count.

Unnamed: 0,verbatim_name,taxon_group
0,ADDITIONAL SPECIES,planktic_forams; radiolarians
1,Amorphous organic matter,dinoflagellates; palynology
2,Black phytoclasts,dinoflagellates; palynology
3,Brown phytoclasts,dinoflagellates; palynology
4,Dinocysts,dinoflagellates; palynology
5,Globigerinoides ruber (white),benthic_forams; planktic_forams
6,Neogloboquadrina humerosa,benthic_forams; planktic_forams


In [75]:
# Verbatim names in the taxa crosswalk file that are listed under more than one taxon group.
check_taxa_by_groups(taxa_crosswalk_list_file, 'verbatim_name')
# 11 -- presumably the row count noted by the author.
# NOTE(review): the saved output lists 10 rows; confirm whether the display is truncated.

Unnamed: 0,verbatim_name,taxon_group
0,Amorphous organic matter,dinoflagellates; palynology
1,Black phytoclasts,dinoflagellates; palynology
2,Brown phytoclasts,dinoflagellates; palynology
3,Dinocysts,dinoflagellates; palynology
4,Echinoid plate fragments,other; planktic_forams
5,Echinoid spines,other; planktic_forams
6,Fish teeth,other; planktic_forams
7,Globigerinoides ruber (white),benthic_forams; planktic_forams
8,Neogloboquadrina humerosa,benthic_forams; planktic_forams
9,Ostracods,other; planktic_forams


In [76]:
# Normalized names in the PI file that are listed under more than one taxon group.
check_taxa_by_groups(pi_file, 'normalized_name')
# 10 -- presumably the row count noted by the author; the saved output lists 10 rows.

Unnamed: 0,normalized_name,taxon_group
0,,benthic_forams; palynology; planktic_forams; r...
1,"""Amorphous organic matter""",dinoflagellates; palynology
2,"""Black phytoclasts""",dinoflagellates; palynology
3,"""Brown phytoclasts""",dinoflagellates; palynology
4,Brigantedinium spp.,dinoflagellates; palynology
5,Dinoflagellata indet.,dinoflagellates; nannofossils; palynology
6,Foraminifera indet.,benthic_forams; dinoflagellates; palynology; p...
7,Globigerinoides ruber (white),benthic_forams; planktic_forams
8,Lejeunecysta sp.,dinoflagellates; palynology
9,Neogloboquadrina humerosa,benthic_forams; planktic_forams


In [77]:
# Normalized names in the taxa list file that are listed under more than one taxon group.
check_taxa_by_groups(taxa_list_file, 'normalized_name')
# 17 -- presumably the row count noted by the author.
# NOTE(review): the saved output lists 10 rows; confirm whether the display is truncated.

Unnamed: 0,normalized_name,taxon_group
0,"""Amorphous organic matter""",dinoflagellates; palynology
1,"""Black phytoclasts""",dinoflagellates; palynology
2,"""Brown phytoclasts""",dinoflagellates; palynology
3,"""Fish teeth""",other; planktic_forams
4,"""Otoliths""",other; planktic_forams
5,Brigantedinium spp.,dinoflagellates; palynology
6,Bryozoa indet.,other; planktic_forams
7,Dinoflagellata indet.,dinoflagellates; nannofossils; palynology
8,Echinoidea indet.,other; planktic_forams
9,Foraminifera indet.,benthic_forams; dinoflagellates; palynology; p...


# check if taxon group matches file name

In [78]:
# Load the per-file metadata table; its 'path' and 'taxon_groups' columns are
# read by the taxon-group checks further down in this notebook.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [79]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)

# One row per verbatim name with its taxon groups joined by '; '.
# Repeated (name, group) pairs are dropped first -- as check_taxa_by_groups
# does -- so a duplicated row cannot produce 'x; x', which would look like
# several groups to any later "';' in taxon_group" test.
grouped_df = (
    taxa_df
    .drop_duplicates(subset=['verbatim_name', 'taxon_group'])
    .groupby(['verbatim_name'])['taxon_group']
    .apply('; '.join)
    .to_frame()
    .reset_index()
)

log_df(grouped_df)

(5269, 2)


Unnamed: 0,verbatim_name,taxon_group
0,"""Globigerina"" angulisuturalis _T",planktic_forams
1,"""Globigerina"" angulisuturalis _T_",planktic_forams
2,"""Globigerina"" ciperoensis _T",planktic_forams
3,"""Globigerina"" ciperoensis _T_",planktic_forams
4,"""Skeletonema"" utriculosa",diatoms


In [80]:
# Spot check: a name that the saved output shows under two taxon groups.
grouped_df[grouped_df['verbatim_name']=='Globigerinoides ruber (white)']

Unnamed: 0,verbatim_name,taxon_group
2124,Globigerinoides ruber (white),benthic_forams; planktic_forams


In [81]:
# Spot check: a name that the saved output shows under a single taxon group.
grouped_df[grouped_df['verbatim_name']=='Tenuitella gemma']

Unnamed: 0,verbatim_name,taxon_group
4680,Tenuitella gemma,planktic_forams


In [82]:
# Map each verbatim name to its joined taxon-group string.
# setdefault keeps the first value seen for a name, as the explicit
# "not in" guard would.
taxa_dict = {}
for verbatim_name, joined_groups in zip(grouped_df['verbatim_name'], grouped_df['taxon_group']):
    taxa_dict.setdefault(verbatim_name, joined_groups)

In [83]:
# Spot check: dictionary value for a name listed under two taxon groups.
taxa_dict['Globigerinoides ruber (white)']

'benthic_forams; planktic_forams'

In [84]:
# Spot check: dictionary value for a name listed under a single taxon group.
taxa_dict['Tenuitella gemma']

'planktic_forams'

Check whether each file's taxon group is missing from the PI-vetted taxon group(s) of the taxa found in that file.

In [85]:
# Collect every (file, taxon) pair where the file's taxon group does not occur
# in the taxon-group string recorded for that taxon in taxa_dict.
data = []
all_taxa = taxa_df['verbatim_name']

# First normalized name recorded for each verbatim name, built once instead of
# re-filtering taxa_df for every mismatch.
normalized_name_by_verbatim = (
    taxa_df
    .drop_duplicates(subset='verbatim_name')
    .set_index('verbatim_name')['normalized_name']
    .to_dict()
)

for rel_path, file_taxon_group in zip(metadata['path'], metadata['taxon_groups']):
    df = pd.read_csv(CLEAN_DATA_DIR/rel_path)
    # Columns without a single value are ignored.
    df = df.dropna(how='all', axis=1)

    # Sorted so the rows are appended in the same order on every run; the
    # iteration order of a set of strings can change between kernel sessions.
    taxa = sorted(set(df.columns).intersection(all_taxa))
    for taxon in taxa:
        # Substring test against the '; '-joined group string.
        if file_taxon_group not in taxa_dict[taxon]:
            data.append({
                "verbatim_name": taxon,
                "name": normalized_name_by_verbatim[taxon],
                "file_taxon_group": file_taxon_group,
                "taxon_group": taxa_dict[taxon],
                "path": rel_path
            })

print(len(data))

219


In [86]:
# Turn the collected mismatches into a frame ordered by normalized name and
# then by taxon group.
report_df = pd.DataFrame(data).sort_values(by=['name', 'taxon_group'])
log_df(report_df)

# 219

(219, 5)


Unnamed: 0,verbatim_name,name,file_taxon_group,taxon_group,path
118,Challengeria spp.,"""Challengeria spp.""",benthic_forams,radiolarians,LIMS/Micropal_CSV_2/356-U1462C_benthic_forams.csv
187,Challengeria spp.,"""Challengeria spp.""",benthic_forams,radiolarians,LIMS/Micropal_CSV_2/356-U1462A_benthic_forams.csv
100,Acritarchs,Acritarcha indet.,palynology,dinoflagellates,LIMS/Micropal_CSV_2/374_U1522A_palynology.csv
102,Acritarchs,Acritarcha indet.,palynology,dinoflagellates,LIMS/Micropal_CSV_2/374_U1523E_palynology.csv
106,Acritarchs,Acritarcha indet.,palynology,dinoflagellates,LIMS/Micropal_CSV_2/374_U1525A_palynology.csv


In [87]:
cols = ['verbatim_name', 'name', 'file_taxon_group', 'taxon_group']
# Keep one row per combination; rows that differ only in `path` collapse to
# the first one encountered.
report_df = report_df.drop_duplicates(subset=cols)
log_df(report_df)

# 160

(160, 5)


Unnamed: 0,verbatim_name,name,file_taxon_group,taxon_group,path
118,Challengeria spp.,"""Challengeria spp.""",benthic_forams,radiolarians,LIMS/Micropal_CSV_2/356-U1462C_benthic_forams.csv
100,Acritarchs,Acritarcha indet.,palynology,dinoflagellates,LIMS/Micropal_CSV_2/374_U1522A_palynology.csv
97,Actiniscus pentasterias,Actiniscus pentasterias,diatoms,dinoflagellates,LIMS/Micropal_CSV_2/374_U1525A_diatoms.csv
88,Amaurolithus delicatus,Amaurolithus delicatus,diatoms,nannofossils,LIMS/Micropal_CSV_2/368_U1505D_diatoms.csv
26,Amaurolithus primus,Amaurolithus primus,diatoms,nannofossils,LIMS/Micropal_CSV_2/368_U1505D_diatoms.csv


In [67]:
# report_df.to_csv(OUTPUT_DIR/'tmp'/'nonmatching_taxon_groups.csv', index=False)

Check whether each file's taxon group is missing from the PI-vetted taxon groups, restricted to taxa that are assigned to more than one taxon group.

In [88]:
# Collect every (file, taxon) pair where the taxon is recorded under more than
# one taxon group and none of that group string contains the file's group.
data = []
all_taxa = taxa_df['verbatim_name']

# First normalized name recorded for each verbatim name, built once instead of
# re-filtering taxa_df for every mismatch.
normalized_name_by_verbatim = (
    taxa_df
    .drop_duplicates(subset='verbatim_name')
    .set_index('verbatim_name')['normalized_name']
    .to_dict()
)

for rel_path, file_taxon_group in zip(metadata['path'], metadata['taxon_groups']):
    df = pd.read_csv(CLEAN_DATA_DIR/rel_path)
    # Columns without a single value are ignored.
    df = df.dropna(how='all', axis=1)

    # Sorted so the rows are appended in the same order on every run; the
    # iteration order of a set of strings can change between kernel sessions.
    taxa = sorted(set(df.columns).intersection(all_taxa))
    for taxon in taxa:
        vetted_groups = taxa_dict[taxon]
        # Count distinct groups so a repeated entry such as 'a; a' is not
        # mistaken for a taxon that belongs to several groups.
        has_multiple_groups = len(set(vetted_groups.split('; '))) > 1
        # Substring test against the '; '-joined group string.
        if has_multiple_groups and file_taxon_group not in vetted_groups:
            data.append({
                "verbatim_name": taxon,
                "name": normalized_name_by_verbatim[taxon],
                "file_taxon_group": file_taxon_group,
                "taxon_group": vetted_groups,
                "path": rel_path
            })

print(len(data))

0
