# QC taxa

In [1]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [2]:
clean_data_paths = [
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_1', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_2', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_3', 
    CLEAN_DATA_DIR /'LIMS/Micropal_CSV_4', 
]

metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 


date = '2022-04-28'
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"

date = '2021-11-29'
noaa_taxa_crosswalk_file = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f"taxa_crosswalk_{date}.csv"
noaa_taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"

date = '2022-04-28'
input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'
input_4_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS Micropal_CSV_4_taxa_{date}.csv'

input_noaa_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_2021-11-29.csv'

add_taxa_file = RAW_DATA_DIR/'PI_processed_files'/'LIMS_Micropal_CSV_4_taxa_ADDTL_TAXA.csv'

In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


# compare taxa list and PI taxa list

Check if the taxa that were aproved by the researchers from the google sheet matches the taxa from the LIMS taxa_list.csv.

In [21]:
normalized_df = pd.read_csv(input_file, skiprows = 9)
normalized_df.shape

(4754, 32)

In [22]:
normalized_4_df = pd.read_csv(input_4_file)
normalized_4_df.shape

(681, 33)

In [23]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)
taxa_df.shape

(5264, 19)

In [24]:
normalized_names = set(normalized_df['verbatim_name'])
normalized_4_names = set(normalized_4_df['verbatim_name'])
all_normalized_names = normalized_names.union(normalized_4_names)

taxa_names = set(taxa_df['verbatim_name'])

In [25]:
len(all_normalized_names - taxa_names)

23

In [26]:
all_normalized_names - taxa_names

{'ADDITIONAL SPECIES',
 'Bathymetry',
 'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
 'Diatom Zone (Yanagisawa and Akiba, 1998)',
 'Exotic',
 'Gen. et sp. indet',
 'Marine',
 'Martini (1971) Zone',
 'Organic matter',
 'Planktic foraminiferal %',
 'Planktic foraminiferal (%)',
 'Preservation palynofacies',
 'Pyrite',
 'RESOLVED NAMES BY TAXONOMIC GROUP',
 'Radiolarian zone',
 'Radiolarian zone/subzone',
 'Silicoflagellate Zone in Ling (1992)',
 'Terrestrial organic matter',
 'Tintinids',
 'Zone in Ling (1992)',
 'fossil',
 'fossil_group',
 nan}

In [27]:
len(taxa_names - all_normalized_names)

14

In [28]:
taxa_names - all_normalized_names

{'Cypassis irregularis',
 'Eucyrtidium teuscheri',
 'Globorotalia spp.',
 'Gondwanaria dogieli',
 'Hantkenina australis',
 'Litharachnium tentorium',
 'Lithomelissa sp. A',
 'Lithostrobus cuspidatus',
 'Lonchosphaera spicata',
 'Plagiacanthidae indet.',
 'Poulpus spp.',
 'Prunopyle antarctica',
 'Streblacantha circumtexta',
 'Verticillata hexacantha'}

# check taxa files have unique values

In [29]:
df = pd.read_csv(taxa_list_file, usecols=['normalized_name', 'taxon_group'])
df.shape
# 4676

(4676, 2)

In [30]:
df[df.duplicated(subset=['normalized_name', 'taxon_group'])]

Unnamed: 0,normalized_name,taxon_group


In [31]:
df2 = pd.read_csv(taxa_crosswalk_list_file, usecols=['normalized_name', 'taxon_group', 'verbatim_name','eodp_id'])
df2.shape
# 5264

(5264, 4)

In [32]:
df2[df2.duplicated(subset=['normalized_name', 'taxon_group', 'verbatim_name'])]

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id


In [33]:
sql = """
SELECT * FROM taxa_crosswalk 
JOIN taxa on taxa.id = taxa_crosswalk.taxon_id 

"""
rows = db.fetch_all(sql)
data = []
for row in rows:
    data.append({
        'normalized_name': row['name'], 
        'taxon_group': row['taxon_group'],
        'verbatim_name': row['original_name'],
        'eodp_id': row['eodp_id']
        
    })


len(data)

5263

In [34]:
db_df = pd.DataFrame(data)
db_df.head()

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id
0,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,0
1,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,1
2,Foraminifera indet.,benthic_forams,Others,2
3,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,3
4,Ostracoda indet.,benthic_forams,Ostracoda spp.,4


In [35]:
set(df2['eodp_id']) - set (db_df['eodp_id'])

{3680}

In [36]:
set(db_df['eodp_id']) - set (df2['eodp_id'])

set()

# check additional species are imported

In [22]:
path = CLEAN_DATA_DIR/'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv'
df = pd.read_csv(path, dtype=str)
df = df.dropna(axis=1, how="all")

log_df(df)

(31, 77)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name,Preservation,Group Abundance,Antarctissa cylindrica,Cycladophora pliocenica,...,Gondwanaria dogieli,Plagiacanthidae indet.,Litharachnium tentorium,Streblacantha circumtexta,Eucyrtidium teuscheri,Verticillata hexacantha,Cypassis irregularis,Prunopyle antarctica,Lithomelissa sp. A,Larcopyle buetschlii
0,374-U1525A-1H-1-IW_MUDLINE,0,0,0.0,0.0,,G,A,,,...,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,,
1,374-U1525A-1H-CC-PAL-RADS,0,5,8.57,8.62,,P,B,,,...,,,,,,,,,,
2,374-U1525A-2H-CC-PAL-RADS,0,5,18.72,18.77,,P,Tr,,,...,,,,,,,,,,
3,374-U1525A-3H-CC-PAL-RADS,0,5,28.16,28.21,,P,Tr,,,...,,,,,,,,,,
4,374-U1525A-4H-CC-PAL-RADS,0,5,28.86,28.91,> 0.65 (LAD A. cylindrica),M,Tr,X,,...,,,,,,,,,,


In [23]:
df.columns

Index(['Sample', 'Top [cm]', 'Bottom [cm]', 'Top Depth [m]',
       'Bottom Depth [m]', 'Zone name', 'Preservation', 'Group Abundance',
       'Antarctissa cylindrica', 'Cycladophora pliocenica',
       'Triceraspyris antarctica', 'Eucyrtidium calvertense',
       'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
       'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
       'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
       'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
       'Antarctissa denticulata', 'Antarctissa strelkovi',
       'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
       'Cycladophora bicornis', 'Druppatractus hastatus',
       'Enneaphormis rotula', 'Eucyrtidium inflatum',
       'Hexacontium pachydermum', 'Larcopyle pylomaticus',
       'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
       'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    

In [24]:
cols = [
    'Antarctissa cylindrica', 'Cycladophora pliocenica',
    'Triceraspyris antarctica', 'Eucyrtidium calvertense',
    'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
    'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
    'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
    'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
    'Antarctissa denticulata', 'Antarctissa strelkovi',
    'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
    'Cycladophora bicornis', 'Druppatractus hastatus',
    'Enneaphormis rotula', 'Eucyrtidium inflatum',
    'Hexacontium pachydermum', 'Larcopyle pylomaticus',
    'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
    'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    'Phormacantha hystrix/Plectacantha oikiskos group',
    'Phormostichoartus corbula', 'Pseudodictyophimus gracilipes',
    'Saccospyris antarctica', 'Saccospyris conithorax',
    'Saccospyris praeantarctica', 'Spongopyle osculosa',
    'Spongotrochus glacialis', 'Spongotrochus sp. A Abelmann',
    'Sphaeropyle robusta', 'Prunopyle tetrapila', 'Stylatractus neptunus',
    'Stylochlamidium  venustum', 'Stylodictya spp.', 'Trisulcus nana',
    # 'ADDITIONAL SPECIES', 
]

add_cols = [
    'Lonchosphaera spicata', 'Poulpus spp.', 'Lithostrobus cuspidatus',
    'Gondwanaria dogieli', 'Plagiacanthidae indet.',
    'Litharachnium tentorium', 'Streblacantha circumtexta',
    'Eucyrtidium teuscheri', 'Verticillata hexacantha',
    'Cypassis irregularis', 'Prunopyle antarctica', 'Lithomelissa sp. A',
    'Larcopyle buetschlii'
]

In [25]:
data = []
for index, row in df.iterrows():
    for col in cols:
        if pd.notna(row[col]):
            data.append({'Sample': row['Sample'], 'code': row[col], 'taxon': col})
 
len(data)

114

In [7]:
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code != 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[114]

In [26]:
data = []
for index, row in df.iterrows():
    for col in add_cols:
        if pd.notna(row[col]):
            data.append({'Sample': row['Sample'], 'code': row[col], 'taxon': col})
 
len(data)

17

In [8]:
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code = 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[17]

# check non taxa and taxa headers

In [52]:
all_columns_file = OUTPUT_DIR/'tmp/all_LIMS_taxa_columns_2020-02-23.csv'
df = pd.read_csv(all_columns_file, dtype=str)

log_df(df)
# 60215

(60215, 3)


Unnamed: 0.1,Unnamed: 0,path,column
0,0,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Sample
1,1,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Top [cm]
2,2,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Bottom [cm]
3,3,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Top Depth [m]
4,4,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Bottom Depth [m]


In [53]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)
log_df(taxa_df)

(5264, 19)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments,additional species comments,eodp_id
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group,,0
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group,,1
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera,,2
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,,,3
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",,,4


In [54]:
noaa_taxa_df = pd.read_csv(input_noaa_file)
log_df(noaa_taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Corrections to pbdb_taxon_rank
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762.0,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774.0,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774.0,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788.0,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788.0,Alabamina,genus,


In [55]:
add_taxa_df = pd.read_csv(add_taxa_file)
log_df(add_taxa_df)

(30, 38)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,path,normalized_name
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lonchosphaera spicata,,,,,,...,,,,,,,,,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv,Lonchosphaera spicata
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv,Poulpus spp.
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lithostrobus cuspidatus,,,,,,...,,,,,,,,,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv,Lithostrobus cuspidatus
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Gondwanaria dogieli,,,,,,...,,,,,,,,,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv,Gondwanaria dogieli
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Tetraplecta pinigera/Euscenium corynephorum,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv,Plagiacanthidae indet.


In [56]:
cols = ['normalized_name', 'verbatim_name', 'non-taxa descriptor']
dex_sin_df = pd.read_csv(taxa_crosswalk_list_file, dtype=str, usecols=cols)

dex_sin_df = dex_sin_df[dex_sin_df['non-taxa descriptor'].isin(['sinistral', 'dextral'])]
log_df(dex_sin_df)

(80, 3)


Unnamed: 0,non-taxa descriptor,normalized_name,verbatim_name
3089,dextral,Neogloboquadrina acostaensis (dextral),Dextral:Sinistral _N. acostaensis_
3090,sinistral,Neogloboquadrina acostaensis (sinistral),Dextral:Sinistral _N. acostaensis_
3091,dextral,Pulleniatina finalis (dextral),Dextral:Sinistral _P. finalis_
3092,sinistral,Pulleniatina finalis (sinistral),Dextral:Sinistral _P. finalis_
3093,dextral,Pulleniatina obliquiloculata (dextral),Dextral:Sinistral _P. obliquiloculata_


In [57]:
dex_sin_taxa = set(dex_sin_df['normalized_name'])
dex_sin_taxa

{'Globoconella miotumida (dextral)',
 'Globoconella miotumida (sinistral)',
 'Globorotalia hirsuta (dextral)',
 'Globorotalia hirsuta (sinistral)',
 'Globorotalia menardii (dextral)',
 'Globorotalia menardii (sinistral)',
 'Globorotalia s.l. crassaformis (dextral)',
 'Globorotalia s.l. crassaformis (sinistral)',
 'Globorotalia scitula (dextral)',
 'Globorotalia scitula (sinistral)',
 'Globorotalia truncatulinoides (dextral)',
 'Globorotalia truncatulinoides (sinistral)',
 'Hirsutella hirsuta (dextral)',
 'Neogloboquadrina acostaensis (dextral)',
 'Neogloboquadrina acostaensis (sinistral)',
 'Neogloboquadrina atlantica (dextral)',
 'Neogloboquadrina atlantica (sinistral)',
 'Neogloboquadrina dutertrei (dextral)',
 'Neogloboquadrina dutertrei (sinistral)',
 'Neogloboquadrina incompta (dextral)',
 'Neogloboquadrina incompta (sinistral)',
 'Neogloboquadrina pachyderma (dextral)',
 'Neogloboquadrina pachyderma (sinistral)',
 'Paragloborotalia mayeri (dextral)',
 'Paragloborotalia mayeri (si

In [58]:
tmp = set(df['column']) - set(taxa_df['verbatim_name']) 
# tmp

In [59]:
nontaxa = {
'342-U1408A-2H-2-W 100/102-FORAM',
'A/W',
'ADDITIONAL SPECIES',
'Abundance',
'Abundance (%)',
'Abundances',
'Additional zone name',
'Additional zone name (short)',
'Age',
'Age:',
'Aspect comment (etching)',
'BF Group abundance',
'BF Preservation',
'BF comment',
'BF preservation',
'Bathymetry',
'Biozone name',
'Biozone name (short)',
'Bottom Depth [m]',
'Bottom Offset (cm) on Parent Sample',
'Bottom [cm]',
'COMMENTS',
'Comment',
'Comment (general)',
'Comments',
'Core',
'Core Type - Section',
'Core,    section',
'Core, Section',
'Core, Section, Interval',
'Core, Type, Section',
'Core, section',
'Core, section, interval',
'Core, section, interval (cm)',
'Core-Sect',
'Datum age average (Ma)',
'Datum age average [Ma]',
'Datum age maximum [Ma]',
'Datum age minimum [Ma]',
'Datum author year',
'Datum comment',
'Datum group',
'Datum group code',
'Datum name',
'Datum name generic',
'Datum region',
'Datum type',
'Depth (cm)',
'Depth Method',
'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
'Diatom Zone (Yanagisawa and Akiba, 1998)',
'Exotic',
'Exp',
'Expedition, site, hole, core, section, interval (cm):',
'Extra Sample ID Data',
'File Data',
'Fragmentation',
'Fragmentation rank [auto-pop]',
'General comment',
'Genus/species (upper zone)',
'Genus/species lower zone)',
'Group Abundance',
'Group abundance',
'Group abundance (%)',
'Group preservation',
'Half',
'Hole',
'Hole, Core, Section',
'IRD',
'Labl ID',
'Lower boundary age av. [Ma]',
'Martini (1971) Zone',
'Miscellaneous',
'Mixing',
'No. specimens/tray',
'Oberservations',
'Observations',
'Original Bottom Depth (m)',
'Original Top Depth (m)',
'Other observations',
'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
'PF Group Abundance',
'PF Preservation',
'PF Zone',
'PF group abundance',
'PF preservation',
'Piece',
'Preservation',
'Presevation',
'Pyrite',
'REMARKS',
'Remarks',
'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
'Sample',
'Sample comment',
'Sample preparation comment',
'Section',
'Section Half',
'Secton Half',
'Ship File Links',
'Shore File Links',
'Silicoflagellate Zone in Ling (1992)',
'Site',
'Temperature Range',
'Top Depth [m]',
'Top Offset (cm) on Parent Sample',
'Top [cm]',
'Type',
'Type (lower zone)',
'Type (upper zone)',
'Unnamed: 13',
'Unnamed: 14',
'Unnamed: 179',
'Unnamed: 21',
'Unnamed: 81',
'Upper boundary age av. [Ma]',
'Upper boundary age min [Ma]',
'XBroken',
'XCorroded',
'XCrumpled',
'Zone',
'Zone author (year)',
'Zone comment',
'Zone group',
'Zone in Ling (1992)',
'Zone name',
'Zone name (short)',
'Zone name [short]',
'Zone/Subzone',
'comments',
'constituent',
'core, section',
'count',
'count_type',
'dupes and comments',
'eodp_id',
'fossil',
'fossil_group',
'interval (cm)',
'mean depth (mbsf)',
'pc_abundance_name_mode',
'pc_fossil_group',
'pc_fossil_name',
'physical_constituent_name',
'preservation'
}

misc = {
'% Planktic Foraminifera within whole sample',
'Benthic abundance',
'Chrysophyte cyst group abundance',
'Diatom abundance',
'Diatom preservation - pyritization2',
'Diatom preservation dissolution',
'Diatom preservation fragmentation',
'Diatoms and siliceous plankton comment',
'Diatoms group abundance',
'Ebridian group abundance', 
'Foram abundance',
'Foraminferal preservation',
'Foraminiferal abundance',
'Foraminiferal preservation',
'Large Benthic Forams [%]',
'Marine',
'Nannofossil Zone',
'Nannofossil abundance',
'Nannofossil comment',
'Organic matter',
'Other fossil material',
'Other taxa',
'Percentage of benthic forams in total foram assemblage [%]',
'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
'Percentage of planktic forams in total foram assemblage [%]',
'Planktic foraminiferal %',
'Planktic foraminiferal (%)',
'Planktonic Benthic ratio (P:B)',
'Radiolarian zone',
'Radiolarian zone/subzone',
'Silicoflagellates group abundance',
'Sillicoflagellate abundance',
'Terrestrial organic matter',
'Total in situ dinocysts',
'Total pollen',
'Total radiolarians',   
}

In [60]:
tmp - nontaxa - misc - dex_sin_taxa

{'Actinocyclus ingens nodus',
 'Actinocyclus senarius',
 'Actinocyclus vulgaris',
 'Actinoptychus bipunctatus',
 'Argilloecia sp.',
 'Asteromphalus brookei',
 'Asteromphalus hyalinus',
 'Aulacoseira sp.',
 'Bitectatodinium tepikiense',
 'Bolivina albatrossi',
 'Braarudosphaera sp.',
 'Bradleya sp.',
 'Brigantedinium simplex',
 'Bulimina exilis',
 'Chiasmolithus sp.',
 'Cibicidoides parki',
 'Coccolithus streckeri',
 'Cocconeis placentula',
 'Cocconeis vitrea',
 'Crenalithus doronicoides',
 'Cricolithus jonesii',
 'Cyclotella sp.',
 'Cytheropteron sp.',
 'Detonula confervacea',
 'Dictyocha subarctios',
 'Diploneis interrupta',
 'Discoaster spp. (six-rayed)',
 'Distephanus boliviensis',
 'Distephanus boliviensis boliviensis',
 'Distephanus jimlingii',
 'Distephanus octangulatus',
 'Distephanus octonarius',
 'Distephanus quinquangellus',
 'Dorcadospyris scambos',
 'Ebriopsis antiqua antiqua',
 'Ebriopsis antiqua cornuta',
 'Eunotia praerupta',
 'Filisphaera filifera',
 'Gen. et sp. indet'

# check for taxa in multiple taxon groups

In [260]:
taxa_df = pd.read_csv(taxa_list_file)
log_df(taxa_df)

(4676, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476.0,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038.0,Animalia


normalized_name

In [265]:
# https://stackoverflow.com/questions/38127209/how-to-use-groupby-to-concatenate-strings-in-python-pandas

data = taxa_df.groupby(['normalized_name'])['taxon_group'].apply('; '.join)
grouped_df = pd.DataFrame(data)
log_df(grouped_df)

(4652, 1)


Unnamed: 0_level_0,taxon_group
normalized_name,Unnamed: 1_level_1
"""Amorphous organic matter""",dinoflagellates; palynology
"""Black phytoclasts""",dinoflagellates; palynology
"""Black woody phytoclasts""",palynology
"""Brown phytoclasts""",dinoflagellates; palynology
"""Brown woody phytoclasts""",palynology


In [266]:
grouped_df = grouped_df[grouped_df['taxon_group'].str.contains('; ')]
grouped_df.reset_index(inplace=True)

grouped_df

Unnamed: 0,normalized_name,taxon_group
0,"""Amorphous organic matter""",dinoflagellates; palynology
1,"""Black phytoclasts""",dinoflagellates; palynology
2,"""Brown phytoclasts""",dinoflagellates; palynology
3,"""Fish teeth""",other; planktic_forams
4,"""Otoliths""",other; planktic_forams
5,Brigantedinium spp.,dinoflagellates; palynology
6,Bryozoa indet.,other; planktic_forams
7,Dinoflagellata indet.,dinoflagellates; nannofossils; palynology
8,Echinoidea indet.,other; planktic_forams
9,Foraminifera indet.,benthic_forams; dinoflagellates; palynology; p...


verbatim_name

In [279]:
data = taxa_df.groupby(['verbatim_name'])['taxon_group'].apply('; '.join)
grouped_df = pd.DataFrame(data)

grouped_df = grouped_df[grouped_df['taxon_group'].str.contains('; ')]
grouped_df.reset_index(inplace=True)

grouped_df

Unnamed: 0,verbatim_name,taxon_group
0,Dextral:Sinistral _N. acostaensis_,planktic_forams; planktic_forams
1,Dextral:Sinistral _P. finalis_,planktic_forams; planktic_forams
2,Dextral:Sinistral _P. obliquiloculata_,planktic_forams; planktic_forams
3,Dextral:Sinistral _P. praecursor_,planktic_forams; planktic_forams
4,Dextral:Sinistral _P. praespectabilis_,planktic_forams; planktic_forams
5,Dextral:Sinistral _P. primalis_,planktic_forams; planktic_forams
6,Dextral:Sinistral _P. spectabilis_,planktic_forams; planktic_forams
7,Discoaster pentaradiatus,nannofossils; nannofossils


# check if taxon group matches file name

In [11]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [12]:
taxa_df = pd.read_csv(taxa_crosswalk_list_file)

data = taxa_df.groupby(['verbatim_name'])['taxon_group'].apply('; '.join)
grouped_df = pd.DataFrame(data)
grouped_df.reset_index(inplace=True)

log_df(grouped_df)

(5256, 2)


Unnamed: 0,verbatim_name,taxon_group
0,"""Globigerina"" angulisuturalis _T",planktic_forams
1,"""Globigerina"" angulisuturalis _T_",planktic_forams
2,"""Globigerina"" ciperoensis _T",planktic_forams
3,"""Globigerina"" ciperoensis _T_",planktic_forams
4,"""Skeletonema"" utriculosa",diatoms


In [13]:
taxa_dict = {}
for index, row in grouped_df.iterrows():
    if row['verbatim_name'] not in taxa_dict:
        taxa_dict[row['verbatim_name']] = row['taxon_group']

In [14]:
taxa_dict['Discoaster pentaradiatus']

'nannofossils; nannofossils'

In [19]:
data = []
all_taxa = taxa_df['verbatim_name']

for index, row in metadata.iterrows():
    
    if index > 10:
        continue

        
    df = pd.read_csv(CLEAN_DATA_DIR/row['path'])
    
    taxon_group = row['taxon_groups']
    taxa = set(df.columns).intersection(all_taxa)
    
    for taxon in taxa:
        if taxon_group != taxa_dict[taxon]:
            data.append({
                "verbatim_name": taxon, 
                "file_taxon_group": taxon_group,
                "taxon_group": taxa_dict[taxon],
                "path": row['path']
            })



In [20]:
report_df = pd.DataFrame(data)
report_df

Unnamed: 0,verbatim_name,file_taxon_group,taxon_group,path
0,Discoaster pentaradiatus,nannofossils,nannofossils; nannofossils,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv
1,Discoaster pentaradiatus,nannofossils,nannofossils; nannofossils,LIMS/Micropal_CSV_1/320_U1335A_Nannofossils_1.csv
2,Discoaster pentaradiatus,nannofossils,nannofossils; nannofossils,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_1.csv
3,Corbisema triacantha,silicoflagellates,diatoms,LIMS/Micropal_CSV_1/318_U1355A_Silicoflagellat...
