# Create Micropal metadata

Create a metadata file to track the changes made to each data file.

In [1]:
import sys
sys.path.append('../../../')
import glob
import re

import pandas as pd

from scripts.shared_utils import extract_taxon_group_from_filename
from config import CLEAN_DATA_DIR, OUTPUT_DIR

Create a list of all the files.

In [2]:
base_dir = CLEAN_DATA_DIR

micropal_1 = base_dir/'LIMS'/'Micropal_CSV_1'
micropal_2 = base_dir/'LIMS'/'Micropal_CSV_2'
micropal_3 = base_dir/'LIMS'/'Micropal_CSV_3'
micropal_4 = base_dir/'LIMS'/'Micropal_CSV_4'
micropal_revised = base_dir/'LIMS'/'Micropal_CSV_revised'

metadata_path = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv'

In [3]:
clean_csvs = []
clean_csvs = clean_csvs + list(micropal_1.glob("*.csv"))
clean_csvs = clean_csvs + list(micropal_2.glob("*.csv"))
clean_csvs = clean_csvs + list(micropal_3.glob("*.csv"))
clean_csvs = clean_csvs + list(micropal_4.glob("*.csv"))
clean_csvs = clean_csvs + list(micropal_revised.glob("*.csv"))

len(clean_csvs)

1253

## create metadata

Get all the taxon groups from the file names.

In [4]:
raw_taxon_groups = set()

for path in clean_csvs:
    filename = path.name
    group = extract_taxon_group_from_filename(filename)
    raw_taxon_groups.add(group)
    
raw_taxon_groups

{'benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossil',
 'nannofossils',
 'nannofossils_revised',
 'ostracods',
 'other',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'radiolarians_events',
 'rads',
 'silicoflagellates'}

Create metadata file with file paths and taxon groups.

In [5]:
taxon_groups = []
filenames = []
relative_paths = []

groups = {
    'benthic_forams': 'benthic_forams',
    'bolboformids': 'bolboformids',
    'chrysophyte_cysts': 'chrysophyte_cysts',
    'diatoms': 'diatoms',
    'dinoflagellates': 'dinoflagellates',
    'ebridians': 'ebridians',
    'nannofossil': 'nannofossils',
    'nannofossils': 'nannofossils',
    'nannofossils_revised': 'nannofossils',
    'ostracods': 'ostracods',
    'other': 'other',
    'palynology': 'palynology',
    'planktic_forams': 'planktic_forams',
    'radiolarians': 'radiolarians',
    'radiolarians_events': 'radiolarians',
    'rads': 'radiolarians',
    'silicoflagellates': 'silicoflagellates'
}

for path in clean_csvs:
    df = pd.read_csv(path, dtype=str, nrows=1)
    df = df.dropna(how="all", axis="index") 
    if len(df) == 0:
        continue
        
    relative_path = path.relative_to(base_dir)
    filename = path.name
    raw_taxon_group = extract_taxon_group_from_filename(filename)
    taxon_group = groups[raw_taxon_group]
        
    filenames.append(filename)
    relative_paths.append(relative_path)
    taxon_groups.append(taxon_group)

In [6]:
dict = {"file": filenames,
        "path": relative_paths,
       "taxon_groups": taxon_groups}

metadata = pd.DataFrame(dict)
metadata.shape

(1253, 3)

In [7]:
metadata.head()

Unnamed: 0,file,path,taxon_groups
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology


In [8]:
metadata.to_csv(metadata_path, index=False)