# Create Lithology metadata

Create a metadata file that tracks the changes we will make to the Lithology CSV files.

In [1]:
import glob
import os
import re
import sys

import pandas as pd

sys.path.append('../scripts/')
from normalize_data import create_directory, get_expedition_from_csv

In [2]:
# Directory that is searched for the Lithology CSV files.
clean_data_path = 'cleaned_data/Lithology_CSV'
# Destination of the metadata CSV written at the end of this notebook.
metadata_path = 'cleaned_data/metadata/Lithology_changes.csv'

In [3]:
# Collect every CSV directly inside the cleaned Lithology directory.
# Note: glob returns the matches in no particular order.
csv_pattern = clean_data_path + "/*.csv"
raw_csvs = glob.glob(csv_pattern)
# Preview the first six paths.
raw_csvs[:6]

['cleaned_data/Lithology_CSV/361_macroscopic_U1474D.csv',
 'cleaned_data/Lithology_CSV/323 Core Description Template_U1341A.csv',
 'cleaned_data/Lithology_CSV/361_macroscopic_U1479C.csv',
 'cleaned_data/Lithology_CSV/340_sediment_U1393A.csv',
 'cleaned_data/Lithology_CSV/339_sediment_U1386A.csv',
 'cleaned_data/Lithology_CSV/320 Core Description_U1332A.csv']

In [4]:
# Number of CSV files matched by the glob above.
len(raw_csvs)

518

Get expedition for each CSV. Copy all the CSVs.

In [5]:
# Filenames that start with a three-digit expedition number (optionally
# prefixed with "X") followed by "_", " " or "-". Written as a raw string so
# the regex escape "\-" is not treated as an (invalid) string escape.
EXPEDITION_PREFIX_RE = re.compile(r'^X?([0-9]{3})[_ \-]')

# Parallel lists: one entry per CSV, later turned into metadata columns.
expeditions = []
filenames = []
filename_has_exps = []

for path in raw_csvs:
    # Take the last path component rather than a fixed split index, so this
    # keeps working if clean_data_path changes depth or the OS uses a
    # different path separator.
    filename = os.path.basename(path)
    starts_with_expedition = EXPEDITION_PREFIX_RE.search(filename)

    if starts_with_expedition is not None:
        # The expedition number is encoded in the filename itself.
        expedition = starts_with_expedition.group(1)
        filename_has_exp = True
    else:
        # No expedition prefix in the filename: load the CSV and let the
        # project helper get_expedition_from_csv determine it.
        df = pd.read_csv(path)
        expedition = get_expedition_from_csv(df)
        filename_has_exp = False

    expeditions.append(expedition)
    filenames.append(filename)
    filename_has_exps.append(filename_has_exp)

## Create metadata CSV

In [6]:
# Build the metadata table: one row per CSV, with its filename, its
# expedition, and whether the expedition was read from the filename.
# The mapping is deliberately not named `dict`, which would rebind the
# builtin for every later cell in the kernel session.
metadata_columns = {"file": filenames,
                    "expedition": expeditions,
                    "filename_has_exp": filename_has_exps
                    }
metadata = pd.DataFrame(metadata_columns)
metadata.shape

(518, 3)

In [7]:
# Preview the first rows of the metadata table.
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp
0,361_macroscopic_U1474D.csv,361,True
1,323 Core Description Template_U1341A.csv,323,True
2,361_macroscopic_U1479C.csv,361,True
3,340_sediment_U1393A.csv,340,True
4,339_sediment_U1386A.csv,339,True


In [8]:
# Make sure the output directory exists before writing; to_csv does not
# create missing parent directories, so a fresh checkout would fail here.
metadata_dir = os.path.dirname(metadata_path)
if metadata_dir:
    os.makedirs(metadata_dir, exist_ok=True)

# Write the metadata table without the DataFrame index column.
metadata.to_csv(metadata_path, index=False)