# Copy lithology CSVs and create metadata

Make a copy of all the lithology CSV files. Create a metadata file that tracks the changes we will make to the files.

In [3]:
import glob
import os
import re
import shutil
import sys

import numpy as np
import pandas as pd

sys.path.append('../scripts/')
from normalize_data import create_directory


In [4]:
# Destination directory for the copied lithology CSVs (used by the copy loop below).
# NOTE(review): create_directory presumably creates the folder if missing — confirm in normalize_data.
clean_data_path = 'cleaned_data/Lithology_CSV'
create_directory(clean_data_path)

In [5]:
# Directory that receives the metadata CSV written at the end of the notebook.
metadata_path = 'cleaned_data/metadata'
create_directory(metadata_path)

In [6]:
# Gather every raw lithology CSV path, then preview the first six matches.
raw_csvs = glob.glob("./raw_data/DESC-Lithology-CSV/*.csv")
raw_csvs[:6]

['./raw_data/DESC-Lithology-CSV/361_macroscopic_U1474D.csv',
 './raw_data/DESC-Lithology-CSV/323 Core Description Template_U1341A.csv',
 './raw_data/DESC-Lithology-CSV/361_macroscopic_U1479C.csv',
 './raw_data/DESC-Lithology-CSV/340_sediment_U1393A.csv',
 './raw_data/DESC-Lithology-CSV/339_sediment_U1386A.csv',
 './raw_data/DESC-Lithology-CSV/320 Core Description_U1332A.csv']

In [7]:
# Number of raw CSV paths matched by the glob pattern.
print(len(raw_csvs))

518


In [25]:
def get_expedition_from_csv(path):
    """Return the expedition identifier recorded inside a lithology CSV.

    The expedition is the text before the first '-' in the first data row of
    the 'Label ID' column, or of the 'Sample' column when 'Label ID' is
    missing or blank.

    Parameters
    ----------
    path : str
        Path of the CSV file to inspect.

    Returns
    -------
    str or None
        The expedition prefix, or None when neither column is present or the
        file has no usable value in its first data row.
    """
    # Only the first data row is needed, so avoid parsing the whole file.
    # dtype=str keeps the label as text even when it looks numeric.
    df = pd.read_csv(path, nrows=1, dtype=str)

    # A header-only file has no first row to read from.
    if df.empty:
        return None

    for column in ('Label ID', 'Sample'):
        if column not in df.columns:
            continue
        label = df[column].iloc[0]
        # A blank cell is parsed as NaN, which has no .split().
        if pd.isna(label):
            continue
        return label.split('-')[0]

    return None

Get the expedition for each CSV, then copy every CSV into the cleaned-data directory.

In [34]:
# An expedition number is three digits, optionally prefixed with "X", at the
# start of the file name and followed by "_", " " or "-". A raw string avoids
# the invalid "\-" escape warning; compiling once avoids repeating the work
# on every iteration.
expedition_prefix = re.compile(r'^X?([0-9]{3})[_ \-]')

expeditions = []
filenames = []
filename_has_exps = []

for path in raw_csvs:
    # Take the final path component regardless of directory depth or separator.
    filename = os.path.basename(path)
    starts_with_expedition = expedition_prefix.search(filename)

    filename_has_exp = starts_with_expedition is not None
    if filename_has_exp:
        expedition = starts_with_expedition.group(1)
    else:
        # The file name carries no expedition; fall back to the CSV contents.
        expedition = get_expedition_from_csv(path)

    expeditions.append(expedition)
    filenames.append(filename)
    filename_has_exps.append(filename_has_exp)

    # Copy the untouched raw file into the cleaned-data directory.
    shutil.copyfile(path, f"{clean_data_path}/{filename}")


## Create metadata CSV

In [38]:
# One row per copied CSV. The mapping is deliberately not named `dict`, which
# would shadow the built-in type for the rest of the kernel session.
metadata_columns = {"file": filenames,
                    "expedition": expeditions,
                    "filename_has_exp": filename_has_exps
                    }
metadata = pd.DataFrame(metadata_columns)
metadata.shape

(518, 3)

In [41]:
# Preview the first rows of the metadata table.
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp
0,361_macroscopic_U1474D.csv,361,True
1,323 Core Description Template_U1341A.csv,323,True
2,361_macroscopic_U1479C.csv,361,True
3,340_sediment_U1393A.csv,340,True
4,339_sediment_U1386A.csv,339,True


In [40]:
# Persist the metadata table; index=False leaves the row index out of the file.
metadata.to_csv(f'{metadata_path}/Lithology_changes.csv', index=False)