# Normalize LIMS taxa

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import hashlib
from pathlib import Path

import pandas as pd
import numpy as np

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    get_non_taxa_fields,
    remove_whitespace
)
import scripts.normalize_taxa as nt


In [2]:
# Change log for the LIMS micropal files; the steps below store per-file flags in it.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv'
# metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv'

# Field-name table; read below for its 'normalized' and 'taxa_317-present' columns.
normalized_fields_path = OUTPUT_DIR/'normalized_data'/'eODP_unified_data_structure_2022_02_21.csv'

# Data-file paths listed in the metadata are resolved against this directory.
clean_data_path = CLEAN_DATA_DIR

# Input additional-species file, and the derived copy this notebook writes.
# NOTE(review): 'addtional' looks like a typo of 'additional'; left unchanged
# because it is a runtime path.
PI_additional_taxa_path = RAW_DATA_DIR/'PI_processed_files'/'LIMS_Micropal_CSV_4_taxa_ADDTL_TAXA.csv'
additional_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/'addtional_species.csv'

# Date stamp used in the LIMS crosswalk / taxa-list / PI file names.
date = '2022-08-08'

crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
PI_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'

# NOTE: `date` is rebound here, so after this cell it holds the NOAA date stamp.
date = '2021-11-29'
noaa_crosswalk_file = OUTPUT_DIR/'taxa'/'NOAA'/f"taxa_crosswalk_{date}.csv"
noaa_taxa_list_file = OUTPUT_DIR/'taxa'/'NOAA'/f"taxa_list_{date}.csv"


In [3]:
def log_df(df, row_count=5):
    """Print the shape of `df`, then return its first `row_count` rows."""
    dimensions = df.shape
    print(dimensions)
    preview = df.head(row_count)
    return preview


## Clean up taxa values in data files
Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [5]:
taxa_df = pd.read_csv(crosswalk_file, dtype=str)
taxa_df.head()


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",


In [6]:
taxa_list = set(taxa_df['verbatim_name'].unique())
len(taxa_list)

4587

In [7]:
def clean_up_taxa_values(file):
    """Strip bracketed "[extra text]" from the taxa columns of one data file.

    Taxa columns are the file's headers that also appear in ``taxa_list``.

    Parameters
    ----------
    file : str
        Path of the data file, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if any taxa cell changed (the file is then passed through
        ``csv_cleanup`` and rewritten in place), False otherwise (the file
        is left untouched).
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    taxa_cols = list(set(content.columns).intersection(taxa_list))
    if not taxa_cols:
        # Nothing to clean, so nothing can have changed.
        return False

    filter_df = content[taxa_cols]
    # Raw string: "\[" is an invalid escape sequence in a normal string literal.
    # The pattern is greedy: it removes everything from the first "[" to the
    # last "]" in a cell, together with the surrounding spaces.
    content[taxa_cols] = filter_df.replace(to_replace=r' *\[.*\] *', value='', regex=True)

    # get rid of NAs in order to compare two dataframes
    changed = not filter_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [clean_up_taxa_values(file) for file in metadata['path']] 



### Update metadata

In [8]:
# Record the per-file results of clean_up_taxa_values under the matching flag.
# Named explicitly so the builtin `dict` is not shadowed.
taxa_value_changes = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, taxa_value_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [9]:
new_metadata.to_csv(metadata_file, index=False)

## Clean up non-taxa metadata values in data files
Look for non-taxa metadata columns that have "code [extra text]", and remove "[extra text]".

In [10]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [11]:
cols = ['normalized', 'taxa_317-present']
non_taxa_df = pd.read_csv(normalized_fields_path, dtype=str, header=5, usecols=cols)
non_taxa_df.head()


Unnamed: 0,normalized,taxa_317-present
0,Sample,Label ID; Sample
1,Expedition,Exp
2,Site,Site
3,Hole,Hole
4,Core,Core


In [12]:
non_taxa_dict = get_non_taxa_fields(non_taxa_df, 'taxa_317-present')
non_taxa_list = set(non_taxa_dict.keys())

In [13]:
non_taxa_list

{'% Planktic Foraminifera within whole sample',
 'A/W',
 'Abundance',
 'Additional zone name',
 'Additional zone name (short)',
 'Age',
 'Aspect comment (etching)',
 'BF Group abundance',
 'BF Preservation',
 'BF comment',
 'BF preservation',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom depth [m]',
 'Bottom offset [cm]',
 'Bottom[cm] [cm]',
 'COMMENTS',
 'Comment',
 'Comment (general)',
 'Comments',
 'Core',
 'Datum age average [Ma]',
 'Datum age maximum (Ma)',
 'Datum age minimum (Ma)',
 'Datum author year',
 'Datum name',
 'Datum name generic',
 'Datum region',
 'Datum type',
 'Diatom abundance',
 'Diatom preservation dissolution',
 'Diatom preservation fragmentation',
 'Diatoms group abundance',
 'Exp',
 'Extra Sample ID Data',
 'Foram abundance',
 'General comment',
 'Genus/species (upper zone)',
 'Genus/species lower zone)',
 'Group Abundance',
 'Group abundance',
 'Group abundance (%)',
 'Group preservation',
 'Hole',
 'Label ID',
 'Large Benthic Forams [

In [14]:
def clean_up_taxa_meta_values(file):
    """Strip bracketed "[extra text]" from the non-taxa columns of one data file.

    Non-taxa columns are the file's headers that also appear in
    ``non_taxa_list``.

    Parameters
    ----------
    file : str
        Path of the data file, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if any non-taxa cell changed (the file is then passed through
        ``csv_cleanup`` and rewritten in place), False otherwise (the file
        is left untouched).
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    nontaxa_cols = list(set(content.columns).intersection(non_taxa_list))
    if not nontaxa_cols:
        # Nothing to clean, so nothing can have changed.
        return False

    filter_df = content[nontaxa_cols]
    # Raw string: "\[" is an invalid escape sequence in a normal string literal.
    # The pattern is greedy: it removes everything from the first "[" to the
    # last "]" in a cell, together with the surrounding spaces.
    content[nontaxa_cols] = filter_df.replace(to_replace=r' *\[.*\] *', value='', regex=True)

    # get rid of NAs in order to compare two dataframes
    changed = not filter_df.fillna('').equals(content[nontaxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [clean_up_taxa_meta_values(file) for file in metadata['path']] 

### Update metadata

In [15]:
# Record the per-file results of clean_up_taxa_meta_values under the matching flag.
# Named explicitly so the builtin `dict` is not shadowed.
taxa_metadata_value_changes = {"clean_up_taxa_metadata_values": change_columns}
new_metadata = update_metadata(metadata, taxa_metadata_value_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [16]:
new_metadata.to_csv(metadata_file, index=False)

## Add eodp_id to data files
Add an `eodp_id` (an MD5 hash of the file path and the row index) to each row in the data files.

In [17]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [18]:
def add_eodp_id(file):
    """Add an `eodp_id` column to one data file and rewrite it in place.

    Rows that are entirely empty are dropped first. Each remaining row gets
    the MD5 hex digest of `file` concatenated with that row's index label
    (labels are not renumbered after the drop).
    """
    csv_path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(csv_path, dtype=str)
    frame = frame.dropna(how='all', axis='rows')

    def hash_row_label(row_label):
        digest_input = f'{file}{row_label}'.encode()
        return hashlib.md5(digest_input).hexdigest()

    # Seed the column with the index labels, then hash each one.
    frame['eodp_id'] = frame.index
    frame['eodp_id'] = frame['eodp_id'].apply(hash_row_label)

    frame = csv_cleanup(frame, csv_path)
    frame.to_csv(csv_path, index=False)

res = [add_eodp_id(file) for file in metadata['path']]

## Split Dextral:Sinistral taxa in data files

Split each combined Dextral:Sinistral column into two separate columns.

In [19]:
dex_sin_files = [
'363-U1482A-planktic_forams.csv',
 '363-U1482B-planktic_forams.csv',
 '363-U1482C-planktic_forams.csv',
 '363-U1483A-planktic_forams.csv',
 '363-U1483C-planktic_forams.csv',
 '363-U1484A-planktic_forams.csv',
 '363-U1485A-planktic_forams.csv',
 '363-U1486B-planktic_forams.csv',
 '363-U1487A-planktic_forams.csv',
 '363-U1488A-planktic_forams.csv',
 '363-U1489B-planktic_forams.csv',
 '363-U1489C-planktic_forams.csv',
 '363-U1489D-planktic_forams.csv',
 '363-U1490A-planktic_forams.csv'
]
len(dex_sin_files)

14

In [20]:
cols = ['normalized_name', 'verbatim_name', 'name comment field']
taxa_df = pd.read_csv(crosswalk_file, dtype=str, usecols=cols)
taxa_df = taxa_df[taxa_df['name comment field'] == 'this will be two output fields']

taxa_df

Unnamed: 0,normalized_name,verbatim_name,name comment field
3092,Neogloboquadrina acostaensis (dextral),Dextral N. acostaensis,this will be two output fields
3093,Neogloboquadrina acostaensis (sinistral),Sinistral N. acostaensis,this will be two output fields
3094,Pulleniatina finalis (dextral),Dextral P. finalis,this will be two output fields
3095,Pulleniatina finalis (sinistral),Sinistral P. finalis,this will be two output fields
3096,Pulleniatina obliquiloculata (dextral),Dextral P. obliquiloculata,this will be two output fields
3097,Pulleniatina obliquiloculata (sinistral),Sinistral P. obliquiloculata,this will be two output fields
3098,Pulleniatina praecursor (dextral),Dextral P. praecursor,this will be two output fields
3099,Pulleniatina praecursor (sinistral),Sinistral P. praecursor,this will be two output fields
3100,Pulleniatina praespectabilis (dextral),Dextral P. praespectabilis,this will be two output fields
3101,Pulleniatina praespectabilis (sinistral),Sinistral P. praespectabilis,this will be two output fields


In [21]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [22]:
def split_dextral_sinistral(file):
    """Split combined "dextral:sinistral" taxon columns of one data file.

    Only files whose name is listed in ``dex_sin_files`` are considered. For
    every column that is a key of ``nt.dex_sin_conversion_2``, the cell text
    is split on ":" and written to the columns named by that mapping; the
    original column is kept. The file is passed through ``csv_cleanup`` and
    rewritten in place when at least one such column is present.

    Parameters
    ----------
    file : str
        Path of the data file, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if the file contained a target column and was rewritten,
        False otherwise (including files not listed in ``dex_sin_files``).
    """
    path = Path(clean_data_path/file)
    if path.name not in dex_sin_files:
        # Explicit False rather than None: the caller collects these results
        # into the "split_dextral_sinistral" metadata flag.
        return False

    # Read only the header first so files without target columns are cheap.
    header = pd.read_csv(path, dtype=str, nrows=0)
    file_target_taxa = set(header.columns).intersection(nt.dex_sin_conversion_2.keys())
    if not file_target_taxa:
        return False

    df = pd.read_csv(path, dtype=str)
    for taxon in file_target_taxa:
        new_columns = nt.dex_sin_conversion_2[taxon]
        split_columns = df[taxon].str.split(':', expand=True)
        if split_columns.shape[1] == 1:
            # No cell contained ":", so there is nothing to split; the new
            # columns are created empty.
            for col in new_columns:
                df[col] = np.nan
        else:
            df[new_columns] = split_columns

    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

    return True
            

change_columns = [split_dextral_sinistral(file) for file in metadata['path']]

In [23]:
# Record the per-file results of split_dextral_sinistral under the matching flag.
# Named explicitly so the builtin `dict` is not shadowed.
dextral_sinistral_changes = {"split_dextral_sinistral": change_columns}
new_metadata = update_metadata(metadata, dextral_sinistral_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [24]:
new_metadata.to_csv(metadata_file, index=False)

## Clean up additional species file
Add `path` and `normalized_name` columns.

In [25]:
metadata = pd.read_csv(metadata_file, dtype=str)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [26]:
df = pd.read_csv(PI_additional_taxa_path, dtype=str)
log_df(df)

(30, 36)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lonchosphaera spicata,,,,,,...,,,,,,,,,,
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,,
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Lithostrobus cuspidatus,,,,,,...,,,,,,,,,,
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Gondwanaria dogieli,,,,,,...,,,,,,,,,,
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,Tetraplecta pinigera/Euscenium corynephorum,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,,


In [27]:
set(df['taxon_group'])

{'planktic_forams', 'radiolarians'}

In [28]:
nt.add_normalized_name_column(df)

df['verbatim_name'] = 'additional species: ' + df['verbatim_name']

In [29]:
# Fill df['path'] with each row's data-file path, looked up by file name in the
# metadata table. Each distinct file is resolved once rather than once per row.
for file_name in df['file'].unique():
    # .values[0] raises IndexError when the file is not listed in the metadata.
    path = metadata[metadata['file'] == file_name]['path'].values[0]
    df.loc[df['file'] == file_name, 'path'] = path
        
log_df(df)

(30, 38)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,path
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lonchosphaera spicata,,,,,,...,,,,,,,,,Lonchosphaera spicata,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,Poulpus spp.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lithostrobus cuspidatus,,,,,,...,,,,,,,,,Lithostrobus cuspidatus,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Gondwanaria dogieli,,,,,,...,,,,,,,,,Gondwanaria dogieli,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Tetraplecta pinigera/Eusce...,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,Plagiacanthidae indet.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv


In [30]:
df.to_csv(additional_taxa_path, index=False)

## Add additional species taxa to data files

In [20]:
df = pd.read_csv(additional_taxa_path, dtype=str)
log_df(df)

(30, 38)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,path
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lonchosphaera spicata,,,,,,...,,,,,,,,,Lonchosphaera spicata,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,Poulpus spp.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lithostrobus cuspidatus,,,,,,...,,,,,,,,,Lithostrobus cuspidatus,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Gondwanaria dogieli,,,,,,...,,,,,,,,,Gondwanaria dogieli,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Tetraplecta pinigera/Eusce...,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,Plagiacanthidae indet.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv


In [21]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [22]:
# Write each additional-species record into its data file: the column named by
# the record's verbatim_name is set on the rows whose Sample matches.
for _, row in df.iterrows():
    matching_files = metadata[metadata['file'] == row['file']]
    # Records whose file is not listed in the metadata are skipped.
    if len(matching_files) == 0:
        continue

    path = matching_files['path'].values[0]
    temp_df = pd.read_csv(clean_data_path/path, dtype=str)

    # A missing code is stored as a fixed placeholder; otherwise the code itself.
    if pd.isna(row['code']):
        cell_value = 'Indeterminate due to data source'
    else:
        cell_value = row['code']

    in_sample = temp_df['Sample'] == row['sample']
    temp_df.loc[in_sample, row['verbatim_name']] = cell_value

    # NOTE(review): csv_cleanup receives the metadata-relative path here, while
    # the file itself is read and written under clean_data_path - confirm intended.
    temp_df = csv_cleanup(temp_df, path)
    temp_df.to_csv(clean_data_path/path, index=False)


## Add additional species taxa to crosswalk taxa list

In [4]:
existing_crosswalk = pd.read_csv(crosswalk_file, dtype=str)
log_df(existing_crosswalk, 3)
# 5276

(5276, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera


In [5]:
df = pd.read_csv(additional_taxa_path, dtype=str)
log_df(df)

(30, 38)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,path
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lonchosphaera spicata,,,,,,...,,,,,,,,,Lonchosphaera spicata,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,Poulpus spp.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lithostrobus cuspidatus,,,,,,...,,,,,,,,,Lithostrobus cuspidatus,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Gondwanaria dogieli,,,,,,...,,,,,,,,,Gondwanaria dogieli,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Tetraplecta pinigera/Eusce...,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,Plagiacanthidae indet.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv


In [6]:
filtered_taxa = nt.create_taxa_crosswalk_df(df)
# 30
# 30
# 20

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'verbatim_name', 'name comment field', 'Comment', 'Notes (change to Internal only notes?)', 'comments']
initial df:  (30, 17)
remove nontaxa df:  (30, 17)
drop duplicates df:  (20, 17)


In [7]:
existing_crosswalk.columns == filtered_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [9]:
combine_df = pd.concat([existing_crosswalk, filtered_taxa])
remove_whitespace(combine_df)

combine_df.shape
# 5296

(5296, 17)

In [10]:
combine_df.drop_duplicates(subset=['normalized_name', 'taxon_group', 'verbatim_name'], inplace=True)
log_df(combine_df)
# 5295

(5295, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",


In [11]:
combine_df.to_csv(crosswalk_file, index=False)

## Add additional species taxa to taxa list

In [12]:
existing_taxa = pd.read_csv(taxa_list_file, dtype=str)
log_df(existing_taxa, 3)
# 4646

(4646, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria


In [13]:
df = pd.read_csv(additional_taxa_path, dtype=str)
log_df(df)

(30, 38)


Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,path
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lonchosphaera spicata,,,,,,...,,,,,,,,,Lonchosphaera spicata,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,Poulpus spp.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
2,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lithostrobus cuspidatus,,,,,,...,,,,,,,,,Lithostrobus cuspidatus,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
3,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Gondwanaria dogieli,,,,,,...,,,,,,,,,Gondwanaria dogieli,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
4,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Tetraplecta pinigera/Eusce...,,Plagiacanthidae indet.,,,Plagiacanthidae indet.,...,,,,,,,,,Plagiacanthidae indet.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv


In [14]:
# Reduce the additional-species rows to taxa-list form. The helper prints the
# frame shape at three stages: initial, after removing non-taxa, and after
# dropping duplicates.
filtered_taxa = nt.create_taxa_list_df(df)
# Expected row counts for those three stages:
# 30
# 30
# 20

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name', 'order_taxon_id', 'order_taxon_name', 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name']
initial df:  (30, 25)
remove nontaxa df:  (30, 25)
drop duplicates df:  (20, 25)


In [15]:
# Guard the concat in the next cell: both frames must expose the same column
# names in the same order. The assertion stops a Run All on a mismatch instead
# of relying on someone eyeballing the displayed boolean array.
columns_match = existing_taxa.columns == filtered_taxa.columns
assert columns_match.all(), 'existing_taxa and filtered_taxa columns do not match'
columns_match

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [16]:
# Append the additional-species taxa to the existing taxa list.
combine_df = pd.concat([existing_taxa, filtered_taxa])
# The return value is unused, so remove_whitespace presumably cleans the frame
# in place — TODO confirm against scripts.normalize_data.
remove_whitespace(combine_df)
combine_df.shape

# Expected row count (4646 existing + 20 additional):
# 4666

(4666, 25)

In [17]:
# A taxon is identified by its (normalized_name, taxon_group) pair; keep only
# the first row seen for each pair.
combine_df = combine_df.drop_duplicates(subset=['normalized_name', 'taxon_group'])
log_df(combine_df)
# 4660

(4660, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [18]:
# Overwrite taxa_list_file with the de-duplicated taxa list (no index column).
combine_df.to_csv(taxa_list_file, index=False)

## add matching NOAA taxa to LIMS taxa 

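Some column headers in the LIMS data files are missing from the LIMS crosswalk but are present in the NOAA crosswalk. Add those NOAA taxa to the LIMS crosswalk and the LIMS taxa list.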

In [19]:
# Reload the per-file metadata; its 'path' column is used below to locate each data file.
metadata = pd.read_csv(metadata_file)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [20]:
# Build the set of distinct, whitespace-trimmed column headers found across
# every data file listed in the metadata. Columns that are entirely empty in a
# file are ignored for that file.
all_columns = set()
for path in metadata['path']:
    df = csv_cleanup(pd.read_csv(clean_data_path/path, dtype=str), clean_data_path/path)
    df = df.dropna(how='all', axis='columns')
    all_columns |= {col.strip() for col in df.columns}

In [21]:
# Number of distinct column headers collected from the data files.
len(all_columns)
# Expected:
# 5552

5552

In [22]:
# Re-read the taxa list (default dtype inference here, not dtype=str) and show its columns.
# NOTE(review): existing_taxa_df is not referenced again in the cells below — confirm it is still needed.
existing_taxa_df = pd.read_csv(taxa_list_file)
existing_taxa_df.columns

Index(['Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'normalized_name', 'taxon_group',
       'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name'],
      dtype='object')

In [23]:
# Collect every verbatim taxon name already present in the LIMS crosswalk.
# The set is built directly from the column instead of looping over
# iterrows(), which constructs a Series per row just to read one field.
taxa_df = pd.read_csv(crosswalk_file)
existing_LIMS_taxa = set(taxa_df['verbatim_name'])

len(existing_LIMS_taxa)
# 5283

5283

In [24]:
# Collect every verbatim taxon name present in the NOAA crosswalk.
# The set is built directly from the column instead of looping over
# iterrows(), which constructs a Series per row just to read one field.
noaa_taxa_df = pd.read_csv(noaa_crosswalk_file)
existing_NOAA_taxa = set(noaa_taxa_df['verbatim_name'])

len(existing_NOAA_taxa)
# 7758

7758

In [25]:
# Column headers that appear in neither crosswalk are treated as non-taxa fields.
nontaxa = all_columns.difference(existing_LIMS_taxa, existing_NOAA_taxa)
len(nontaxa)
# 188

188

In [27]:
# Display the non-taxa headers for manual review.
nontaxa

{'',
 '% Planktic Foraminifera within whole sample',
 '342-U1408A-2H-2-W 100/102-FORAM',
 'A/W',
 'ADDITIONAL SPECIES',
 'Abundance',
 'Abundance (%)',
 'Abundances',
 'Additional zone name',
 'Additional zone name (short)',
 'Age',
 'Age:',
 'Aspect comment (etching)',
 'BF Group abundance',
 'BF Preservation',
 'BF comment',
 'BF preservation',
 'Bathymetry',
 'Benthic abundance',
 'Biozone name',
 'Biozone name (short)',
 'Bottom Depth [m]',
 'Bottom Offset (cm) on Parent Sample',
 'Bottom [cm]',
 'COMMENTS',
 'Chrysophyte cyst group abundance',
 'Comment',
 'Comment (general)',
 'Comments',
 'Core',
 'Core Type - Section',
 'Core,    section',
 'Core, Section',
 'Core, Section, Interval',
 'Core, Type, Section',
 'Core, section',
 'Core, section, interval',
 'Core, section, interval (cm)',
 'Core-Sect',
 'Datum age average (Ma)',
 'Datum age average [Ma]',
 'Datum age maximum [Ma]',
 'Datum age minimum [Ma]',
 'Datum author year',
 'Datum comment',
 'Datum group',
 'Datum group cod

In [28]:
# Headers that are missing from the LIMS crosswalk but known to the NOAA crosswalk.
noaa_lims_taxa = (all_columns - existing_LIMS_taxa) & existing_NOAA_taxa
len(noaa_lims_taxa)

85

In [29]:
# Load the NOAA crosswalk as strings and append two all-NaN columns
# ('eodp_id', 'name comment field') that the NOAA file does not supply.
noaa_crosswalk = pd.read_csv(noaa_crosswalk_file, dtype=str).assign(
    **{'eodp_id': np.nan, 'name comment field': np.nan}
)

log_df(noaa_crosswalk, 3)
# 7763

(7763, 37)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,eodp_id,name comment field
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,Abyssamina incisa,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma glomeratum,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma sp.,,


In [30]:
# Keep only the NOAA crosswalk rows whose verbatim name is one of the LIMS
# column headers identified in noaa_lims_taxa. A boolean mask replaces the
# iterrows() loop: it preserves the original row index and column dtypes, and
# still yields a frame with all columns when nothing matches, so the column
# selections in later cells cannot fail on an empty result.
# .copy() detaches the result from noaa_crosswalk.
noaa_lims_df = noaa_crosswalk[noaa_crosswalk['verbatim_name'].isin(noaa_lims_taxa)].copy()
log_df(noaa_lims_df, 3)
# 85

(85, 37)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,eodp_id,name comment field
143,benthic_forams,Bolivina albatrossi,,,,,,Bolivina,,,...,,288974,Foraminifera,212476,Rhizaria,,,Bolivina albatrossi,,
257,benthic_forams,Bulimina exilis,,,,,,Bulimina,,,...,,288974,Foraminifera,212476,Rhizaria,,,Bulimina exilis,,
415,benthic_forams,Cibicidoides parki,,,,,,Cibicidoides,,,...,,288974,Foraminifera,212476,Rhizaria,,,Cibicidoides parki,,


### update LIMS crosswalk

In [32]:
# Load the current LIMS crosswalk with every column read as a string.
existing_crosswalk = pd.read_csv(crosswalk_file, dtype=str)
log_df(existing_crosswalk, 3)
# Expected row count:
# 5295

(5295, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera


In [33]:
# Project the matched NOAA rows onto the LIMS crosswalk's columns, in the
# crosswalk's column order; NOAA-only columns are dropped.
filter_noaa_df = noaa_lims_df[existing_crosswalk.columns]

# Display an element-wise check that the two column lists agree.
filter_noaa_df.columns == existing_crosswalk.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [35]:
# Append the matched NOAA rows to the LIMS crosswalk.
combine_df = pd.concat([existing_crosswalk, filter_noaa_df])
# The return value is unused, so remove_whitespace presumably cleans the frame
# in place — TODO confirm against scripts.normalize_data.
remove_whitespace(combine_df)

combine_df.shape
# Expected row count (5295 existing + 85 NOAA):
# 5380

(5380, 17)

In [36]:
# Overwrite crosswalk_file with the NOAA-augmented crosswalk (no index column).
combine_df.to_csv(crosswalk_file, index=False)

### update LIMS taxa list

In [37]:
# Reload the LIMS taxa list (written earlier in this notebook) with every column as a string.
existing_taxa = pd.read_csv(taxa_list_file, dtype=str)
log_df(existing_taxa, 3)
# Expected row count:
# 4660

(4660, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria


In [38]:
# Project the matched NOAA rows onto the taxa list's columns, in the taxa
# list's column order; crosswalk-only columns such as verbatim_name are dropped.
filter_noaa_df = noaa_lims_df[existing_taxa.columns]

# Display an element-wise check that the two column lists agree.
filter_noaa_df.columns == existing_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [40]:
# Append the matched NOAA taxa to the LIMS taxa list.
# NOTE(review): unlike the additional-species merge earlier in this notebook,
# no drop_duplicates on ['normalized_name', 'taxon_group'] is applied here, so
# NOAA rows that share a normalized name with each other or with an existing
# taxon would be kept — confirm this is intended.
combine_df = pd.concat([existing_taxa, filter_noaa_df])
# The return value is unused, so remove_whitespace presumably cleans the frame
# in place — TODO confirm against scripts.normalize_data.
remove_whitespace(combine_df)

combine_df.shape
# Expected row count (4660 existing + 85 NOAA):
# 4745

(4745, 25)

In [41]:
# Overwrite taxa_list_file with the NOAA-augmented taxa list (no index column).
combine_df.to_csv(taxa_list_file, index=False)

## add eodp_id to taxa crosswalk

In [43]:
# Reload the NOAA-augmented crosswalk; read_csv assigns a fresh 0..n-1 row index.
df = pd.read_csv(crosswalk_file, dtype=str)
log_df(df, 3)
# Expected row count:
# 5380

(5380, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera


In [44]:
# Use each row's position in the freshly loaded crosswalk as its eodp_id.
# NOTE(review): ids are positional, so they would change if the crosswalk's
# row order or contents change between runs — confirm that is acceptable.
df['eodp_id'] = df.index
log_df(df, 3)


(5380, 18)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments,eodp_id
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group,0
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group,1
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera,2


In [45]:
# Overwrite crosswalk_file with the crosswalk that now carries the eodp_id column.
df.to_csv(crosswalk_file, index=False)