In [20]:
import sys
sys.path.append('../scripts/')
import glob
import shutil
import re
import os
import numpy as np
import pandas as pd

from normalize_data import (
    check_duplicate_columns,
    extract_taxon_group_from_filename,
    csv_cleanup,
    create_sample_name_for_row,
    get_common_columns
    
)

sys.path.append('../')
import db 

In [21]:
LIMS_lith_paths = [
    os.path.join('cleaned_data', 'Lithology_CSV')
]

LIMS_paleo_paths = [
    os.path.join('cleaned_data', 'Micropal_CSV_1'),
    os.path.join('cleaned_data', 'Micropal_CSV_2'),
    os.path.join('cleaned_data', 'Micropal_CSV_3'),
    os.path.join('cleaned_data', 'Micropal_CSV_revised'),
]

LIMS_paths = LIMS_lith_paths + LIMS_paleo_paths


taxa_meta = os.path.join('cleaned_data', 'metadata', 'LIMS', 'Micropal_changes.csv')
lith_meta = os.path.join('cleaned_data', 'metadata', 'LIMS', 'Lithology_changes.csv')


## LIMS lithology

In [111]:
metadata = pd.read_csv(lith_meta)

In [112]:
paths = list(metadata['path'])

In [113]:
nonstandard_columns = {'2ND crystal roundness',
 '2ND lithic roundness',
 '2ND vitric roundness',
 '2ND volcanic clast roundness',
 '2nd sedimentary structure',
 '3RD lithic roundness',
 '3RD vitric roundness',
 '3RD volcanic clast roundness',
 '3rd sedimentary structure',
 'Accessory mineral (comments)',
 'Accessory mineral 1 (<1 Vol. %)',
 'Accessory mineral 2 (<1 Vol. %)',
 'Altered glass (palagonite) abundance [%]',
 'Amount of ash in lithology',
 'Amount of ash in lithology rank [read only]',
 'Apparent dip angle 1 [deg]',
 'Apparent dip angle 2 [deg]',
 'Apparent dip azimuth 1 [deg]',
 'Apparent dip azimuth 2 [deg]',
 'Bedding thickness',
 'Bedding thickness of MAJOR lithology',
 'Biogenic carbonate',
 'Biogenic material',
 'Biotite abundance',
 'Bioturbation intensity',
 'Bioturbation intensity rank',
 'Bioturbation intensity rank [read only]',
 'Bioturbation type',
 'Bottom contact attitude',
 'Bottom contact definition',
 'Bottom contact geometry',
 'Bottom contact or boundary attitude',
 'Bottom contact or boundary definition',
 'Bottom contact or boundary geometry',
 'Bottom contact or boundary geometry+definition [read only]',
 'Bottom contact or boundary type',
 'COLOR',
 'COMMENT',
 'COMMENTS',
 'COMPLETE NAME',
 'Calcareous nannofossils abundance',
 'Calcite, allogenic  abundance',
 'Chalcedony abundance',
 'Classification',
 'Clast -  grain size',
 'Clast 1 igneous -  grain size',
 'Clast 1 igneous -  roundness',
 'Clast 1 igneous - pumiceous [%]',
 'Clast 1 igneous - pumiceous grain size',
 'Clast 2 igneous - scoriaceous [%]',
 'Clast 2 igneous - scoriaceous grain size',
 'Clast 2 sediment - grain size',
 'Clast 2 sediment - roundness',
 'Clast 2 sediment grain size',
 'Clast 3 -other lithology grain size',
 'Clast 3 igneous - massive [%]',
 'Clast 3 igneous - massive grain size',
 'Clast 3 metamorphic - grain size',
 'Clast 3 metamorphic - roundness',
 'Clast 4 igneous - vesicular [%]',
 'Clast 4 igneous - vesicular grain size',
 'Clast 5 volcaniclastic [%]',
 'Clast 5 volcaniclastic grain size',
 'Clast 6 sediment [%]',
 'Clast 6 sediment grain size',
 'Clast 7 -other lithology [%]',
 'Clast 7 -other lithology grain size',
 'Clast abundance',
 'Clast color',
 'Clast comment',
 'Clast comments',
 'Clast lith. abund. sum [%]',
 'Clast lithology',
 'Clast lithology comment',
 'Clast morphology (shape)',
 'Clast percentage [%]',
 'Clast roundness',
 'Clast roundness rank',
 'Clast size',
 'Clast sorting',
 'Clast/matrix texture',
 'Clastic Grain roundness',
 'Clastic grain roundness',
 'Clastic grain: roundness',
 'Clastic grainRoundness',
 'Clasts',
 'Clasts (>2 mm) 2ND ORDER',
 'Clasts (>2 mm) 3ND ORDER',
 'Clasts (>2 mm) DOMINANT',
 'Clasts (>2 mm) Subdominant',
 'Clasts [%]',
 'Clasts abundance',
 'Clay minerals abundance',
 'Color',
 'Color (name)',
 'Color code',
 'Color(name)',
 'Comment',
 'Complete lithology name',
 'Complete name',
 'Contact geometry',
 'Contact or boundary type',
 'Crystals (>2 mm) 2ND ORDER',
 'Crystals (>2 mm) 3RD ORDER',
 'Crystals (>2 mm) DOMINANT',
 'DOMINANT Clasts % [%]',
 'DOMINANT Clasts ave. grain size class',
 'DOMINANT Clasts ave. grain size rank [read only]',
 'DOMINANT Clasts max. grain size class',
 'DOMINANT Clasts max. grain size rank [read only]',
 'DOMINANT clast roundness',
 'DOMINANT crystal roundness',
 'DOMINANT lithic roundness',
 'DOMINANT vitric roundness',
 'DOMINANT volcanic clast roundness',
 'Deformational structure comment',
 'Deformational structure dip [deg]',
 'Deformational structures',
 'Degree of alteration',
 'Detrital carbonate',
 'Diagenetic constituent',
 'Diagenetic constituent composition',
 'Diagenetic structure',
 'Diagenetic structure comment',
 'Diatoms abundance',
 'Dip angle calculated in CRF [deg]',
 'Dip angle measured in CRF [deg]',
 'Dip azimuth calculated in CRF [deg]',
 'Dip azimuth measured in CRF [deg]',
 'Domain relative abundance [%]',
 'Dominant grain size (Wentworth, 1922)',
 'Dominant grain size rank',
 'Dominant grain size(Wentworth, 1922)',
 'Dominantgrain size(Wentworth, 1922)',
 'Drilling disturbance intensity',
 'Drilling disturbance intensity rank',
 'Drilling disturbance intensity rank (read only)',
 'Drilling disturbance type',
 'FACIES (neritic, hemipelagic, pelagic)',
 'Feldspar abundance',
 'Foraminifera abundance',
 'Fossil',
 'Fresh glass',
 'GRAVEL SIZE CLAST',
 'GRAVEL SIZE CLASTLITHOLOGY',
 'GRAVEL SIZE CLASTROUNDNESS',
 'GRAVEL SIZE CLASTSHAPE',
 'GROUNDMASS COLOUR',
 'GROUNDMASS GRAIN SIZE (mm)',
 'General comment',
 'General interval comment',
 'Glass abundance',
 'Glass preservation comment',
 'Glauconite abundance',
 'Grading',
 'Grain size distribution',
 'Grain size max. [mm]',
 'Grain size rank [read only]',
 'Groundmass comments',
 'Groundmass crystal size [mm]',
 'Groundmass modal grain size [mm]',
 'Groundmass modal percent [%]',
 'Groundmass modal percentage [%]',
 'Halite abundance',
 'Hornblende abundance',
 'INTERPRETATION - Turbiditic or Hemipelagic',
 'Igneous texture 1',
 'Igneous texture 2',
 'Interbed lithology',
 'LITH 1 Ash abundance name',
 'LITH 1 Ash abundance rank (read only)',
 'LITH 1 Bioturbation intensity',
 'LITH 1 Bioturbation intensity rank [read only]',
 'LITH 1 Clast 1 igneous -  grain size',
 'LITH 1 Clast 1 igneous -  lithology',
 'LITH 1 Clast 1 igneous -  roundness',
 'LITH 1 Clast 1 igneous [%]',
 'LITH 1 Clast 2 sediment - grain size',
 'LITH 1 Clast 2 sediment - roundness',
 'LITH 1 Clast 2 sediment [%]',
 'LITH 1 Clast 3 -other LITH [%]',
 'LITH 1 Clast 3 metamorphic - grain size',
 'LITH 1 Clast 3 metamorphic - roundness',
 'LITH 1 Clast abundance',
 'LITH 1 Deformational structure',
 'LITH 1 Deformational structure attitude',
 'LITH 1 Diagenetic constituent',
 'LITH 1 Diagenetic constituent composition',
 'LITH 1 Diagenetic structure',
 'LITH 1 Layer or bedding thickness',
 'LITH 1 Macrofossil',
 'LITH 1 Macrofossil abundance name',
 'LITH 1 Matrix 6 sediment [%]',
 'LITH 1 Matrix alteration intensity',
 'LITH 1 Matrix alteration rank',
 'LITH 1 Matrix comments',
 'LITH 1 Matrix sorting',
 'LITH 1 Number of clasts (>2mm)',
 'LITH 1 Sedimentary structure',
 'LITH 1 abundance [%]',
 'LITH 1 ave. grain size class',
 'LITH 1 ave. grain size rank [read only]',
 'LITH 1 color',
 'LITH 1 comment',
 'LITH 1 grain sorting',
 'LITH 1 lithification',
 'LITH 1 max. grain size class',
 'LITH 1 max. grain size rank [read only]',
 'LITH 1 prefix+name [read only]',
 'LITH 1 prefix+name+suffix [read only]',
 'LITH 2 Ash abundance name',
 'LITH 2 Ash abundance rank (read only)',
 'LITH 2 Macrofossil',
 'LITH 2 abundance [%]',
 'LITH 2 ave. grain size class',
 'LITH 2 ave. grain size rank [read only]',
 'LITH 2 color',
 'LITH 2 grain sorting',
 'LITH 2 lithification',
 'LITH 2 max. grain size class',
 'LITH 2 max. grain size rank [read only]',
 'LITH 2 prefix+name [read only]',
 'LITH 2 prefix+name+suffix [read only]',
 'LITHIFICATION',
 'LITHOLOGY COMMENTS',
 'LITHOLOGY NAME',
 'LITHOLOGY [%]',
 'Lamination',
 'Layer or bedding thickness',
 'Layer, boundary, lamination, grading',
 'Layer/Beddding',
 'Lith. abundance sum [%]',
 'Lith. comment',
 'Lith. prefix+name+suffix',
 'Lithic clasts (>2 mm) 2ND ORDER',
 'Lithic clasts (>2 mm) 3RD ORDER',
 'Lithic clasts (>2 mm) DOMINANT',
 'Lithification',
 'Lithification [name]',
 'Lithification rank [read only]',
 'Lithologic accessories',
 'Lithologic unit (preliminary)',
 'Lithological Accessories 1',
 'Lithological Accessories 2',
 'Lithological accessories comment',
 'Lithological asseccories',
 'Lithological unit number',
 'Lithology  abundance [%]',
 'Lithology (%)',
 'Lithology (5)',
 'Lithology [%]',
 'Lithology ave. grain size class',
 'Lithology ave. grain size rank [read only]',
 'Lithology avg. grain size',
 'Lithology avg. grain size rank',
 'Lithology classification',
 'Lithology color',
 'Lithology color (Munsell)',
 'Lithology color (simple)',
 'Lithology comment',
 'Lithology comments',
 'Lithology lithification',
 'Lithology max. grain size class',
 'Lithology max. grain size rank [read only]',
 'Lithology name',
 'Lithology name+suffix',
 'Lithology name+suffix [read only]',
 'Lithology prefix + name',
 'Lithology prefix+name [read only]',
 'Lithology prefix+name+suffix [read only]',
 'Lowerr boundary ave. grain size class',
 'MAJ Lith. Prefix abundance Plot [%]',
 'MAJ Lith. Principal name abundance Plot [%]',
 'MAJ Lith. abundance [%]',
 'MAJ Lith. ave. grain size',
 'MAJ Lith. ave. grain size rank',
 'MAJ Lith. ave. grain size rank [read only]',
 'MAJ Lith. color',
 'MAJ Lith. max. grain size',
 'MAJ Lith. max. grain size rank',
 'MAJ Lith. max. grain size rank [read only]',
 'MAJ Lith. name+suffix',
 'MAJ Lith. prefix + name',
 'MAJ Lith. prefix abundance plot [%]',
 'MAJ Lith. prefix+name [read only]',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. prefix+name+suffix [read only]',
 'MAJ Lith. principal name abundance plot [%]',
 'MAJ Lithology ABC',
 'MAJ Lithology color',
 'MAJ lith color (simple)',
 'MAJ lithology comment',
 'MIN Lith. abundance [%]',
 'MIN Lith. ave. grain size',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. color',
 'MIN Lith. max. grain size',
 'MIN Lith. max. grain size rank',
 'MIN Lith. name+suffix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'MIN lith color (simple)',
 'MIN lithology comment',
 'Macrofossil',
 'Macrofossil abundance name',
 'Matrix 1 igneous - pumiceous [%]',
 'Matrix 2 igneous - scoriaceous [%]',
 'Matrix 3 igneous - massive [%]',
 'Matrix 4 igneous - vesicular [%]',
 'Matrix 5 volcaniclastic [%]',
 'Matrix 6 sediment [%]',
 'Matrix 7 -other lithology [%]',
 'Matrix abund. sum [%]',
 'Matrix abundance [%]',
 'Matrix alteration intensity',
 'Matrix alteration rank',
 'Matrix biogenic carbonate [%]',
 'Matrix carbonate [%]',
 'Matrix comment',
 'Matrix comments',
 'Matrix detritical carbonate [%]',
 'Matrix modifier',
 'Matrix sorting',
 'Matrix, fines (<2 mm) 2ND ORDER',
 'Matrix, fines (<2 mm) 3ND ORDER',
 'Matrix, fines (<2 mm) DOMINANT',
 'Microfossil abundance',
 'Minor components',
 'Minor trace fossils',
 'Non-skeletal components',
 'Non-skeletal components comment',
 'Number of clasts (>2mm)',
 'Opaques abundance',
 'Other mineral',
 'Other sedimentary feature',
 'PHENOCRYST COMPOSITION',
 'PHENOCRYST GRAIN SIZE (mm)',
 'Phenocryst 1 (most abundant)',
 'Phenocryst 1 abundance [%]',
 'Phenocryst 1 comments',
 'Phenocryst 1 shape',
 'Phenocryst 1 size MAX [mm]',
 'Phenocryst 1 size MODE [mm]',
 'Phenocryst 2 (2nd most abundant)',
 'Phenocryst 2 abundance [%]',
 'Phenocryst 2 comments',
 'Phenocryst 2 shape',
 'Phenocryst 2 size MAX [mm]',
 'Phenocryst 2 size MODE [mm]',
 'Primary sedimentary structure',
 'Primary trace fossils',
 'Pyroxene abundance',
 'Quartz abundance',
 'REFERENCE',
 'Radiolarians abundance',
 'Rock fragment - plutonic lithic',
 'Rock fragment - sedimentary lithic',
 'Rock fragment - volcanic lithic',
 'Sample domain name (if >1 domain)',
 'Sample domain number (if >1 domain)',
 'Secondary trace fossils',
 'Sed. structures',
 'Sediment structures',
 'Sedimentary structure',
 'Sedimentary structure comment',
 'Siliciclastic',
 'Silicoflagellate, ebridian, actiniscidian abundance',
 'Skeletal components INTERMEDIATE',
 'Skeletal components INTERMEDIATE comment',
 'Skeletal components MAJOR',
 'Skeletal components MAJOR comment',
 'Skeletal components MINOR',
 'Skeletal components MINOR comment',
 'Sorting',
 'Sponge spicule fragments abundance',
 'Structure comment',
 'Subdominant Clasts % [%]',
 'Subdominant Clasts ave. grain size class',
 'Subdominant Clasts ave. grain size rank [read only]',
 'Subdominant Clasts max. grain size class',
 'Subdominant Clasts max. grain size rank [read only]',
 'Subdominant clast roundness',
 'TEPHRA type',
 'Tephra',
 'Texture Rank',
 'Texture Rank [read only]',
 'Texture comment',
 'Thickness of interbeds',
 'Total LITH abundance [%]',
 'Total clasts calculated LITH 1 [%]',
 'Total matrix calculated LITH 1 [%]',
 'Total phenocryst abundance [%]',
 'Upper boundary ave. grain size class',
 'VESICLE ABUNDANCE',
 'VESICLE SHAPE',
 'VOLCANICLASTIC GRAINS',
 'Vesicle abundance',
 'Vesicle comments',
 'Vesicle filling',
 'Vesicle roundness',
 'Vesicle size MAX [mm]',
 'Vesicle size MODE [mm]',
 'Vesicle sphericity',
 'Vitric clasts (>2 mm) 2ND ORDER',
 'Vitric clasts (>2 mm) 3RD ORDER',
 'Vitric clasts (>2 mm) DOMINANT',
 'Volcanic clasts (>2 mm) 2ND ORDER',
 'Volcanic clasts (>2 mm) 3RD ORDER',
 'Volcanic clasts (>2 mm) DOMINANT',
 'Volcaniclastic grain minerals',
 'Volcaniclastic grain size name',
 'dupes and comments',
 'lithology classification',
 'lithology_classification',
 'structure'}

In [114]:

def process_file(path):
    path = f"cleaned_data/{path}"
    df = pd.read_csv(path, dtype=str)

    target_cols = set(df.columns).intersection(nonstandard_columns)
    data = {}

    data = {}
    for col in target_cols:
        values = list(df[col].dropna().unique())[0:3]
        
        values = ', '.join(values)
        data[col] = values
    return data

In [115]:
all_data = []

for path in paths:
    data = process_file(path)
    all_data.append(data)

In [119]:
files = [path.split('/')[1] for path in paths]


In [121]:
final_df = pd.DataFrame(all_data, index=[files])

In [123]:
final_df.to_csv('cleaned_data/LIMS/lith_standardization.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_data/LIMS/lith_standardization.csv'

In [None]:
## LIMS taxa 