# create status report

In [1]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os

import pandas as pd
import numpy as np

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)
from config import OUTPUT_DIR, CLEAN_DATA_DIR

from scripts.shared_utils import (
    log_df
)

In [2]:
# Alias for the configured CLEAN_DATA_DIR.
base_dir = CLEAN_DATA_DIR

# All three NOAA metadata listings live under the same output sub-directory.
noaa_metadata_dir = OUTPUT_DIR/'metadata'/'NOAA'
metadata_path_1_96 = noaa_metadata_dir/'noaa_dsdp_files.csv'
metadata_path_101_210 = noaa_metadata_dir/'noaa_janus_iodp_files.csv'
metadata_path_101_210_lith = noaa_metadata_dir/'noaa_janus_files.csv'


## 1 - 96

In [90]:
# Load the metadata file listing for expeditions 1-96.
raw_df = pd.read_csv(filepath_or_buffer=metadata_path_1_96)
# log_df is a project helper; judging by the recorded output it reports the
# frame's shape and first rows -- confirm against scripts.shared_utils.
log_df(raw_df)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_forams,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_forams,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


In [91]:
# Show which file types occur in the 1-96 listing.
raw_df.loc[:, 'type'].unique()

array(['taxa', 'age', 'hard_rock', 'lithology'], dtype=object)

In [92]:
# Only taxa and lithology files are reported; the other types are dropped.
reported_types = ['taxa', 'lithology']
is_reported = raw_df['type'].isin(reported_types)
# Select the matching rows and the three needed columns in one step.
df = raw_df.loc[is_reported, ['type', 'expedition', 'path']]
log_df(df)

(3104, 3)


Unnamed: 0,type,expedition,path
0,taxa,61,NOAA/DSDP_core_data/61/462/radiolar.csv
2,taxa,61,NOAA/DSDP_core_data/61/462/b_forams.csv
3,taxa,61,NOAA/DSDP_core_data/61/462/p_forams.csv
5,taxa,61,NOAA/DSDP_core_data/61/462/nannos.csv
6,lithology,61,NOAA/DSDP_core_data/61/462/vistxt.csv


In [93]:
# Count metadata files per expedition (rows) and file type (columns).
# `path` is the column being counted, so it is what `values` names. The
# previous `values=['type']` pointed at the grouping column and only produced
# path counts because `df` has exactly these three columns; naming `path`
# explicitly keeps the result stable if `df` ever gains extra columns.
pivot = pd.pivot_table(
    df,
    index=['expedition'],
    columns=['type'],
    values=['path'],
    aggfunc='count',
)
log_df(pivot)

(99, 2)


Unnamed: 0_level_0,path,path
type,lithology,taxa
expedition,Unnamed: 1_level_2,Unnamed: 2_level_2
1,8.0,20.0
10,14.0,18.0
11,13.0,21.0
12,11.0,5.0
13,27.0,46.0


In [95]:
# Save the per-expedition file counts for expeditions 1-96.
status_csv_1_96 = OUTPUT_DIR/'eodp_status_1_96.csv'
pivot.to_csv(status_csv_1_96)

## 101 - 210

In [74]:
# Load the metadata file listing for expeditions 101-210; dtype=str keeps
# every column as text.
raw_df = pd.read_csv(filepath_or_buffer=metadata_path_101_210, dtype=str)
# log_df is a project helper; judging by the recorded output it reports the
# frame's shape and first rows -- confirm against scripts.shared_utils.
log_df(raw_df)

(2481, 5)


Unnamed: 0,path,type,expedition,site,taxon_group
0,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
1,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
2,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
3,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
4,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_forams


In [75]:
# Show which file types occur in the 101-210 listing.
raw_df.loc[:, 'type'].unique()

array(['taxa', 'age'], dtype=object)

In [76]:
# Keep only the taxa rows, reduced to the three columns used downstream.
is_taxa = raw_df['type'] == 'taxa'
taxa_df = raw_df.loc[is_taxa, ['type', 'expedition', 'path']]
log_df(taxa_df)

(2045, 3)


Unnamed: 0,type,expedition,path
0,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
1,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
2,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
3,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
4,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...


In [78]:

# Load the second 101-210 listing (the lithology files); dtype=str keeps
# every column as text.
raw_df2 = pd.read_csv(filepath_or_buffer=metadata_path_101_210_lith, dtype=str)
log_df(raw_df2)

(325, 4)


Unnamed: 0,path,type,expedition,site
0,NOAA/Janus_core_data/101/626a/sed_lith_101_626...,lithology,101,626a
1,NOAA/Janus_core_data/101/626b/sed_lith_101_626...,lithology,101,626b
2,NOAA/Janus_core_data/101/626c/sed_lith_101_626...,lithology,101,626c
3,NOAA/Janus_core_data/101/626d/sed_lith_101_626...,lithology,101,626d
4,NOAA/Janus_core_data/101/627a/sed_lith_101_627...,lithology,101,627a


In [79]:
# Show which file types occur in the lithology listing.
raw_df2.loc[:, 'type'].unique()

array(['lithology'], dtype=object)

In [80]:
# Keep only the lithology rows, reduced to the three columns used downstream.
is_lithology = raw_df2['type'] == 'lithology'
lith_df = raw_df2.loc[is_lithology, ['type', 'expedition', 'path']]
log_df(lith_df)

(325, 3)


Unnamed: 0,type,expedition,path
0,lithology,101,NOAA/Janus_core_data/101/626a/sed_lith_101_626...
1,lithology,101,NOAA/Janus_core_data/101/626b/sed_lith_101_626...
2,lithology,101,NOAA/Janus_core_data/101/626c/sed_lith_101_626...
3,lithology,101,NOAA/Janus_core_data/101/626d/sed_lith_101_626...
4,lithology,101,NOAA/Janus_core_data/101/627a/sed_lith_101_627...


In [81]:
# Stack the taxa and lithology listings into a single frame for pivoting.
# Row labels from both inputs are kept as they are (no re-indexing).
frames = [taxa_df, lith_df]
df = pd.concat(frames)
log_df(df)

(2370, 3)


Unnamed: 0,type,expedition,path
0,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
1,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
2,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
3,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...
4,taxa,101,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...


In [82]:
# Count metadata files per expedition (rows) and file type (columns).
# `path` is the column being counted, so it is what `values` names. The
# previous `values=['type']` pointed at the grouping column and only produced
# path counts because `df` has exactly these three columns.
pivot = pd.pivot_table(
    df,
    index=['expedition'],
    columns=['type'],
    values=['path'],
    aggfunc='count',
)
# Sorting returns a new frame, so the result must be assigned; the earlier
# bare `pivot.sort_values(['expedition'])` call discarded it and had no effect.
# `expedition` is the index here and was read as text, so this is a string sort.
pivot = pivot.sort_index()
log_df(pivot)

(89, 2)


Unnamed: 0_level_0,path,path
type,lithology,taxa
expedition,Unnamed: 1_level_2,Unnamed: 2_level_2
101,19.0,25.0
103,12.0,
104,8.0,47.0
105,11.0,
107,9.0,


In [83]:
# Save the per-expedition file counts for expeditions 101-210.
status_csv_101_210 = OUTPUT_DIR/'eodp_status_101_210.csv'
pivot.to_csv(status_csv_101_210)