# create_temp_reports.

In [3]:
import sys
sys.path.append('../../')
import glob
import shutil
from pathlib import Path
import os

import numpy as np
import pandas as pd
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db
from scripts.normalize_taxa import add_normalized_name_column

from scripts.normalize_data import (
    check_duplicate_columns
)

from scripts.shared_utils import (
    log_df
)

In [4]:
# Root folder of the cleaned data; the relative "path" values from the
# metadata listings are joined onto it when files are read.
clean_data_path = CLEAN_DATA_DIR

# NOAA file listings (DSDP, Janus and Janus IODP, going by the file names).
# NOTE(review): metadata_2_file is not read by any cell visible in this notebook.
noaa_metadata_dir = OUTPUT_DIR / 'metadata' / 'NOAA'
metadata_1_file = noaa_metadata_dir / 'noaa_dsdp_files.csv'
metadata_2_file = noaa_metadata_dir / 'noaa_janus_files.csv'
metadata_3_file = noaa_metadata_dir / 'noaa_janus_iodp_files.csv'



## create a file with taxa from NOAA taxa files


In [85]:
# Load the DSDP file listing; its "type" and "path" columns drive the scan of
# the 'fossil' column further down.
metadata = pd.read_csv(metadata_1_file)
# log_df is a project helper; judging by the cell output it reports the shape
# and the first rows.
log_df(metadata)


(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_forams,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_forams,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


Handle files where the taxa are listed in the 'fossil' column.

In [88]:
# Collect one (path, taxon) record for every non-empty 'fossil' value found in
# the files that the metadata marks as type 'taxa'.
data = []
taxa_paths = metadata.loc[metadata['type'] == 'taxa', 'path']
for rel_path in taxa_paths:
    df = pd.read_csv(clean_data_path/rel_path, dtype=str)

    # A file without a 'fossil' column has no taxa to report; looking the
    # column up unconditionally would raise KeyError and abort the whole scan.
    if 'fossil' not in df.columns:
        continue

    # Everything was read as str, so after dropping the missing values
    # .str.strip() applies to every remaining entry.
    fossils = df['fossil'].dropna().str.strip()
    data.extend({"path": rel_path, "column": name} for name in fossils)

# Naming the columns keeps the frame well-formed even when nothing was found.
new_df = pd.DataFrame(data, columns=["path", "column"]).drop_duplicates()
log_df(new_df)

(73961, 2)


Unnamed: 0,path,column
0,NOAA/DSDP_core_data/61/462/radiolar.csv,Stichocorys peregrina
1,NOAA/DSDP_core_data/61/462/radiolar.csv,Artophormis gracilis
2,NOAA/DSDP_core_data/61/462/radiolar.csv,Artostrobium doliolum
3,NOAA/DSDP_core_data/61/462/radiolar.csv,Lithopera renzae
4,NOAA/DSDP_core_data/61/462/radiolar.csv,Solenospheara omnitubus


In [71]:
# Replace `metadata` with the Janus IODP file listing; the header scan further
# down iterates over this frame, not the DSDP one loaded earlier.
metadata = pd.read_csv(metadata_3_file)
log_df(metadata)


(2481, 5)


Unnamed: 0,path,type,expedition,site,taxon_group
0,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
1,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
2,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
3,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
4,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_forams


Handle files where the taxa are in the column headers.

In [84]:
# Record every column header of every 'taxa' file as a (path, column) pair.
# Columns that are completely empty are discarded before the headers are read.
data = []
taxa_paths = metadata.loc[metadata['type'] == 'taxa', 'path']
for rel_path in taxa_paths:
    df = (
        pd.read_csv(clean_data_path/rel_path, dtype=str)
        .dropna(axis="columns", how='all')
        .dropna(axis="rows", how='all')
    )
    data.extend({"path": rel_path, "column": header.strip()} for header in df.columns)

new2_df = pd.DataFrame(data).drop_duplicates()
log_df(new2_df)

(114727, 2)


Unnamed: 0,path,column
0,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,Data
1,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,Age From (oldest)
2,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,Age To (youngest)
3,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,Zone From (bottom)
4,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,Zone To (top)


In [89]:
# Stack the 'fossil'-column records and the column-header records into one
# (path, column) table.
df = pd.concat([new_df, new2_df])
log_df(df)

(188688, 2)


Unnamed: 0,path,column
0,NOAA/DSDP_core_data/61/462/radiolar.csv,Stichocorys peregrina
1,NOAA/DSDP_core_data/61/462/radiolar.csv,Artophormis gracilis
2,NOAA/DSDP_core_data/61/462/radiolar.csv,Artostrobium doliolum
3,NOAA/DSDP_core_data/61/462/radiolar.csv,Lithopera renzae
4,NOAA/DSDP_core_data/61/462/radiolar.csv,Solenospheara omnitubus


In [90]:
# Persist the combined table; the search and clean-up cells re-read this file.
df.to_csv(OUTPUT_DIR/'tmp'/'NOAA_taxa_headers.csv', index=False)

## search for taxa

In [31]:
def copy_file(path, term=None):
    """Copy one clean data file into the search-results folder for a search term.

    The copy is written to ``OUTPUT_DIR/tmp/search_results/<term>/`` under a
    flattened name built from the directory components of ``path``.

    Parameters
    ----------
    path : str
        '/'-separated path of the file, relative to ``clean_data_path``.
    term : str, optional
        Name of the sub-folder that receives the copy. When omitted, the
        notebook-level ``search_term`` variable is used, so existing
        ``copy_file(path)`` calls keep working.

    Raises
    ------
    NameError
        If ``term`` is omitted and ``search_term`` has not been defined yet.
    """
    if term is None:
        term = search_term

    output_dir = OUTPUT_DIR / 'tmp' / 'search_results' / term
    # exist_ok removes the check-then-create race; makedirs also creates the
    # intermediate 'search_results' folder.
    os.makedirs(output_dir, exist_ok=True)

    parts = path.split('/')
    if len(parts) >= 8:
        # e.g. NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Pollen_Spores.csv
        # -> 104_642_HOLE_B__Pollen_Spores.csv
        prefix, basename = parts[4:7], parts[7]
    else:
        # Shallower paths such as NOAA/DSDP_core_data/61/462/radiolar.csv have no
        # eighth component; indexing parts[7] would raise IndexError.
        # -> 61_462__radiolar.csv
        prefix, basename = parts[2:-1], parts[-1]
    filename = '_'.join(prefix) + '__' + basename

    shutil.copy(clean_data_path/path, output_dir/filename)
    

In [32]:
# Re-read the saved (path, column) table as strings so the search cells can run
# without rebuilding it.
data_df = pd.read_csv(OUTPUT_DIR/'tmp'/'NOAA_taxa_headers.csv', dtype=str)
log_df(data_df)

(188688, 2)


Unnamed: 0,path,column
0,NOAA/DSDP_core_data/61/462/radiolar.csv,Stichocorys peregrina
1,NOAA/DSDP_core_data/61/462/radiolar.csv,Artophormis gracilis
2,NOAA/DSDP_core_data/61/462/radiolar.csv,Artostrobium doliolum
3,NOAA/DSDP_core_data/61/462/radiolar.csv,Lithopera renzae
4,NOAA/DSDP_core_data/61/462/radiolar.csv,Solenospheara omnitubus


In [39]:
# copy_file() reads this notebook-level name to pick the destination folder.
search_term = 'Abies taxa'

# Every file whose recorded column/taxon equals the search term exactly.
matching_paths = data_df.loc[data_df['column'] == search_term, 'path']

count = 0
for count, path in enumerate(matching_paths, start=1):
    print(path)
    copy_file(path)

print(count)

NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/794/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/794/HOLE_A/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/795/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/795/HOLE_A/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/796/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/797/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/127/797/HOLE_C/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/145/881/HOLE_C/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/145/881/HOLE_D/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/145/883/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_

# get list of messed up taxa files

The leg 104 pollen taxa files have messed-up taxa. We want to find the taxa that appear only in the leg 104 pollen taxa files and delete them from the NOAA taxa Google sheet.

In [9]:
# Leg 104 pollen/spore range tables whose taxa headers are treated as unreliable.
# Paths are relative to clean_data_path and use the same form as the 'path'
# column of the header table, so they can be compared directly.
bad_files = [
'NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Pollen_Spores.csv',
'NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_C/Pollen_Spores.csv',
'NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_D/Pollen_Spores.csv',
'NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Pollen_Spores.csv'
]

In [10]:
# Column headers that are not taxon names. They are subtracted from the raw,
# un-stripped headers of the bad files, so the whitespace inside these strings
# (trailing blanks, double spaces) is significant and must not be normalised.
non_taxa = {
'Age From (oldest)',
'Age To (youngest)',
'Comment',
'Cor',
'Data',
'Depth (mbsf)',
'Fossil Group                                 ',
'Group Abundance',
'Group Preservation',
'H',
'Leg',
'Sc',
'Scientist',
'Site',
'T',
'Top(cm)',
'Zone From (bottom)',
'Zone To  (top)'
}

In [11]:
# Reload the saved (path, column) table as strings; it is used to check
# which of the bad-file taxa also occur in other files.
data_df = pd.read_csv(OUTPUT_DIR/'tmp'/'NOAA_taxa_headers.csv', dtype=str)
log_df(data_df)

(188688, 2)


Unnamed: 0,path,column
0,NOAA/DSDP_core_data/61/462/radiolar.csv,Stichocorys peregrina
1,NOAA/DSDP_core_data/61/462/radiolar.csv,Artophormis gracilis
2,NOAA/DSDP_core_data/61/462/radiolar.csv,Artostrobium doliolum
3,NOAA/DSDP_core_data/61/462/radiolar.csv,Lithopera renzae
4,NOAA/DSDP_core_data/61/462/radiolar.csv,Solenospheara omnitubus


get taxa that are in the bad files 

In [12]:
# Gather every column header from the bad files that is not a known
# non-taxon header. Headers are kept exactly as read (no stripping).
bad_taxa = set()
for file in bad_files:
    print(file)

    df = pd.read_csv(clean_data_path/file)
    bad_taxa.update(header for header in df.columns if header not in non_taxa)

NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_C/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_D/Pollen_Spores.csv
NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Pollen_Spores.csv


In [13]:
# Number of distinct taxon headers collected from the bad files.
len(bad_taxa)
# 85

85

find taxa that appear in other files

In [14]:
# Every taxon from the bad files gets a row in `data`; the action is filled in
# when the summary frame is built.
data = [{'taxon name': taxon, 'action': pd.NA} for taxon in bad_taxa]

# Columns/taxa recorded for at least one file outside bad_files, gathered in a
# single pass over the header table instead of one full scan per taxon.
in_other_files = ~data_df['path'].isin(bad_files)
columns_elsewhere = set(data_df.loc[in_other_files, 'column'])

# The 'column' values were stripped when the header table was built, whereas
# bad_taxa holds raw headers. Compare on the stripped name so that a padded
# header is not mistaken for a taxon that occurs only in the bad files.
good_taxa = {taxon.strip() for taxon in bad_taxa} & columns_elsewhere
            
        
        

In [15]:
# Taxa from the bad files that also occur in at least one other file.
good_taxa 

{'Abies taxa',
 'Carya taxa',
 'Castanea sp.',
 'Chenopodiaceae sp.',
 'Fagus sp.',
 'Gramineae taxa',
 'Juglans sp.',
 'Larix taxa',
 'Podocarpus sp.',
 'Pterocarya taxa',
 'Quercus taxa',
 'Salix sp.',
 'Selaginella  taxa',
 'Sphagnum taxa',
 'Tricolpites sp. 1',
 'Ulmus taxa'}

In [16]:
# Sizes of: all taxa from the bad files, those also found in other files, and
# the remainder that occurs only in the bad files.
print(len(bad_taxa), len(good_taxa), len(bad_taxa - good_taxa))

# 85 16 69

85 16 69


In [17]:
# Taxa that were found only in the bad files.
bad_taxa - good_taxa

{'?Proteacidites sp.',
 'AInus cf. A. viridus',
 'Alnipollenites sp.',
 'Amorphogen taxa',
 'Azolla cf. A',
 'Baculate triletes cf. A',
 'Betula cf.B. alba',
 'Betula sect. Nanae',
 'Betula sp',
 'Bigfrilled monosaccates cf A',
 'Botrychium sp.',
 'Carpirus sp.',
 'Cedrus cf. C perialata',
 'Clavate tricolporates cf. A & B',
 'Coaly  taxa',
 'Cyathea sp. cf. C. sp.',
 'DR sporomorphs taxa',
 'Ericales taxa',
 'Fagus grandis',
 'Frilled monosaccates cf. A & B',
 'Gleichenia sp.',
 'Granular inaperturates cf. A',
 'Large bisaccates cf. A &  B',
 'Large tricolpates cf. A &  B &  C',
 'Large tricolporates cf. A &  B',
 'Lycopodium reticulatum',
 'Lycopodium sp.',
 'Osmunda regalis',
 'Osmunda sp.',
 'Palaeogene bisaccates cf A & B & C',
 'Papillate inaperturates cf. A &  B',
 'Picea cf. P. mariana',
 'Pinus cf.P. banksiana',
 'Pinus cf.P. sylvestris',
 'Pointed monocolpates (unspecified)',
 'Pointed monocolpates cf. A',
 'Polypodium cf. P. reniforme',
 'Polytrichum spores taxa',
 'Pteridiu

create csv that lists which taxa were deleted or kept

In [122]:
# One row per taxon from the bad files: 'keep' when the taxon is in good_taxa
# (it also occurs in other files), otherwise 'delete'.
df = pd.DataFrame(data, columns=['taxon name', 'action'])
df['action'] = 'delete'

# good_taxa holds stripped names while 'taxon name' is the raw header, so match
# on the stripped value; an exact comparison would leave padded names 'delete'.
is_good = df['taxon name'].str.strip().isin(good_taxa)
df.loc[is_good, 'action'] = 'keep'

df = df.sort_values(['taxon name'])
log_df(df)

(85, 2)


Unnamed: 0,taxon name,action
23,?Proteacidites sp.,delete
45,AInus cf. A. viridus,delete
83,Abies taxa,keep
0,Alnipollenites sp.,delete
62,Amorphogen taxa,delete


In [123]:
# Save the keep/delete summary for the leg 104 pollen taxa.
df.to_csv(OUTPUT_DIR/'tmp'/'104_bad_pollen_taxa_summary.csv', index=False)

### check taxa deletes

In [21]:
# Two dated snapshots of the taxa list: df is the 2023-01-17 file, df2 the
# 2023-01-18 file. Both are compared on 'verbatim_name' in the next cell.
df = pd.read_csv(RAW_DATA_DIR/'PI_processed_files'/'NOAA_2_taxa_lists_taxa_list_2023-01-17.csv')
df2 = pd.read_csv(RAW_DATA_DIR/'PI_processed_files'/'NOAA_2_taxa_lists_taxa_list_2023-01-18.csv')


In [27]:
# Names in the 2023-01-17 list that are missing from the 2023-01-18 list and
# are not explained by the (stripped) bad taxa or the good taxa.
set(df['verbatim_name']).difference(
    df2['verbatim_name'],
    (t.strip() for t in bad_taxa),
    good_taxa,
)

{'Coaly  m taxa'}