# LIMS dolomite
Find all lithology files that mention dolomite.

In [4]:
import sys
sys.path.append('../../')
import glob
import re

import pandas as pd
import numpy as np
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR


In [5]:
# Data files named in the metadata are resolved relative to this directory.
base_dir = CLEAN_DATA_DIR

# Metadata CSVs; each one lists the data files of one source in a 'path' column.
_metadata_dir = OUTPUT_DIR / 'metadata'
_noaa_metadata_dir = _metadata_dir / 'NOAA'

LIMS = _metadata_dir / 'LIMS' / 'Lithology_changes.csv'
NOAA_1_96 = _noaa_metadata_dir / 'noaa_dsdp_files.csv'
NOAA_101_124 = _noaa_metadata_dir / 'noaa_janus_files.csv'

files = [NOAA_1_96, NOAA_101_124, LIMS]

# Find all files with dolomite

In [7]:
# Term searched for (case-insensitively) in every cell of every data row.
word = 'dolomite'

# Identifier columns as named in LIMS files and in NOAA files, followed by the
# two bookkeeping columns ('text', 'path') that the scanning loop adds.
lims_columns = ['Exp', 'Site', 'Hole', 'Core', 'Section', 'text', 'path']
noaa_columns = ['leg', 'site', 'hole', 'core', 'section', 'text', 'path']

# Accumulates one record per matching row. NOAA rows are stored positionally
# under the LIMS column names.
df_word = pd.DataFrame(columns=lims_columns)


def has_word(row):
    """Record ``row`` in ``df_word`` if any of its cells contains ``word``.

    Every string cell whose lower-cased text contains the lower-cased search
    term contributes a ``"<column>: <value>"`` entry; the entries are joined
    with ``' | '`` and stored in ``row['text']``. Non-string cells (NaN,
    numbers, None) are skipped instead of raising on ``.lower()``.

    Args:
        row: One data row (pandas Series). It must provide the identifier
            columns of either ``lims_columns`` or ``noaa_columns`` as well
            as a ``'path'`` entry.

    Returns:
        None. Matches are appended to the module-level ``df_word``.
    """
    needle = word.lower()
    hits = [
        f"{col}: {value}"
        for col, value in row.items()
        if isinstance(value, str) and needle in value.lower()
    ]
    row['text'] = ' | '.join(hits)

    if not hits:
        return

    # LIMS rows are recognised by their 'Exp' column; anything else is
    # treated as a NOAA row.
    columns = lims_columns if 'Exp' in row else noaa_columns
    df_word.loc[len(df_word.index)] = [row[col] for col in columns]
            
            
# Scan the data files listed in each metadata CSV for rows that mention `word`.
for file in files:
    metadata = pd.read_csv(file, dtype=str)

    # When the metadata distinguishes file types, scan only the lithology
    # files (the same rule the lithology-collection loop further down uses).
    # Metadata without a 'type' column is scanned in full.
    if 'type' in metadata.columns:
        metadata = metadata[metadata['type'] == 'lithology']

    for path in metadata['path']:
        df = pd.read_csv(base_dir / path, dtype=str)
        # Bookkeeping columns that has_word fills in / copies into df_word.
        df['text'] = ''
        df['path'] = path
        df.apply(has_word, axis=1)

df_word.head()

Unnamed: 0,Exp,Site,Hole,Core,Section,text,path
0,50,415,A,8,2,lithology: DOLOMITE,NOAA/DSDP_core_data/50/415A/vistxt.csv
1,50,415,A,10,CC,lithology: MUDSTONE WITH DOLOMITE LAYERS,NOAA/DSDP_core_data/50/415A/vistxt.csv
2,32,307,,9,1,lithology: DOLOMITE BEARING NANNO CHALK,NOAA/DSDP_core_data/32/307/vistxt.csv
3,32,307,,9,1,structures: SHARP CONTACT OF AB OVE DOLOMITE ...,NOAA/DSDP_core_data/32/307/vistxt.csv
4,32,307,,9,1,structures: SHARP CONTACT OF AB OVE DOLOMITE ...,NOAA/DSDP_core_data/32/307/vistxt.csv


In [260]:
# Persist the collected matches, without the positional index.
dolomite_csv = OUTPUT_DIR / 'tmp' / 'dolomite.csv'
df_word.to_csv(dolomite_csv, index=False)

## Find all lithology descriptions

In [6]:
# One entry per data source: the metadata CSV listing its files, plus the
# label that process_file uses to pick the column layout.
sources = [
    {'metadata': metadata_file, 'source': label}
    for metadata_file, label in (
        (NOAA_1_96, 'dsdp'),
        (NOAA_101_124, 'janus'),
        (LIMS, 'lims'),
    )
]

# Output columns; the NOAA names are the lower-case counterparts that
# process_file renames to the LIMS names.
lims_columns = ['Exp', 'Site', 'Hole', 'Core', 'Section', 'All Lithology', 'path']
noaa_columns = ['leg', 'site', 'hole', 'core', 'section', 'All Lithology', 'path']

# Filled by process_file with one dict per data row.
rows = []

def process_file(path, source):
    """Append every row of one data file to the module-level ``rows`` list.

    The file is read as strings with missing values replaced by ``''``. Each
    row is reduced to the ``lims_columns`` layout, with the lithology text in
    ``'All Lithology'`` and the file's ``path`` in ``'path'``.

    Args:
        path: Location of the CSV file, relative to ``base_dir``.
        source: Dict whose ``'source'`` entry selects the layout. ``'lims'``
            files build the lithology from their prefix, principal-name and
            suffix columns. Any other value is handled as a NOAA file, whose
            lower-case identifier columns are renamed to the LIMS names and
            whose ``'lithology'`` column is used as is.

    Returns:
        None. One dict per data row is appended to ``rows``.

    Raises:
        KeyError: If the file lacks a column required by its layout.
    """
    df = pd.read_csv(base_dir / path, dtype=str).fillna('')

    if source['source'] == 'lims':
        name_parts = zip(
            df['Lithology Prefix'],
            df['Lithology Principal Name'],
            df['Lithology Suffix'],
        )
        # Join only the non-empty parts so that an empty prefix or suffix
        # does not leave leading, trailing or doubled spaces in the result.
        df['All Lithology'] = [
            ' '.join(part.strip() for part in parts if part.strip())
            for parts in name_parts
        ]
    else:
        df = df.rename(columns={"leg": "Exp", "site": "Site", "hole": "Hole", "core": "Core", "section": "Section"})
        df['All Lithology'] = df['lithology']

    df['path'] = path

    # Bulk conversion instead of a per-row iterrows() loop.
    rows.extend(df[lims_columns].to_dict('records'))


# Walk each source's metadata CSV and collect lithology from the files it lists.
for source in sources:
    metadata = pd.read_csv(source['metadata'], dtype=str)

    if 'type' in metadata.columns:
        # Typed metadata: only the entries marked as lithology are processed.
        listed_paths = metadata.loc[metadata['type'] == 'lithology', 'path']
    else:
        # Untyped metadata: every listed file is processed.
        listed_paths = metadata['path']

    for path in listed_paths:
        process_file(path, source)
       




In [7]:
# Number of row dicts that process_file appended to `rows`.
len(rows)

305083

In [8]:
# Build a table from the collected row dicts (one row per dict) and show
# its last rows.
df =  pd.DataFrame(rows)
df.tail()

Unnamed: 0,Exp,Site,Hole,Core,Section,All Lithology,path
305078,361,U1476,A,24,4,nannofossil ooze [2014] with silt,LIMS/Lithology_CSV/361_macroscopic_U1476A.csv
305079,361,U1476,A,24,5,nannofossil ooze [2014] with silt,LIMS/Lithology_CSV/361_macroscopic_U1476A.csv
305080,361,U1476,A,24,6,nannofossil ooze [2014] with silt,LIMS/Lithology_CSV/361_macroscopic_U1476A.csv
305081,361,U1476,A,24,7,nannofossil ooze [2014] with silt,LIMS/Lithology_CSV/361_macroscopic_U1476A.csv
305082,361,U1476,A,24,CC,nannofossil ooze [2014] with silt,LIMS/Lithology_CSV/361_macroscopic_U1476A.csv


In [None]:
# Written under OUTPUT_DIR / 'tmp', alongside dolomite.csv, rather than to a
# 'tmp/' directory relative to wherever the notebook happens to be run from.
df.to_csv(OUTPUT_DIR / 'tmp' / 'all_lithology.csv', index=False)