# QA LIMS files

Do some basic QA on the LIMS files.

In [3]:
import sys
sys.path.append('../scripts/')
import glob
import shutil
import re
import os
import numpy as np
import pandas as pd

from normalize_data import (
    check_duplicate_columns,
    extract_taxon_group_from_filename,
    csv_cleanup,
    create_sample_name_for_row
    
)

In [2]:
LIMS_lith_paths = [
    'cleaned_data/Lithology_CSV',
]

LIMS_paleo_paths = [
    'cleaned_data/Micropal_CSV_1', 
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/Micropal_CSV_3',
    'cleaned_data/Micropal_CSV_revised',
]

LIMS_paths = LIMS_lith_paths + LIMS_paleo_paths

taxa_list = 'cleaned_data/taxa/'

## duplicate column names

check if csv has duplicate column names

In [11]:
def duplicate_columns(directories, file_extension='csv'):
    for directory in directories:
        raw_csvs = glob.glob(f"{directory}/**/*.{file_extension}", recursive=True)

        for path in raw_csvs:
            content = pd.read_csv(path)
            content.dropna(inplace=True, axis='columns', how='all')

            check_duplicate_columns(content, path)

### one file

In [22]:
path='cleaned_data/Micropal_CSV_3/341_benthic_forams_U1417B.csv'
content = pd.read_csv(path)
content.dropna(inplace=True, axis='columns', how='all')

check_duplicate_columns(content, path)

### LIMS: Leg 317 - present

In [23]:
duplicate_columns(LIMS_paths)

cleaned_data/Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns have different values


### Janus: Leg 130 - 312

### NOAA: Leg 1 - 129

## Look for files that contain taxon name


In [63]:
taxon_name =  'Preservation palynofacies'

for clean_data_path in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

    for path in raw_csvs:
        df = pd.read_csv(path, nrows=1)
        df.dropna(how="all", axis="columns")
        if taxon_name in list(df.columns):
            url = 'https://github.com/eODP/data-processing/blob/master/notebooks/'
            print(f'{url}{path}')
        

https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1335A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1336A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1335B_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334B_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1336B_Nannofossils.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334C_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_3/321_U1338B_Planktic_forams.csv
https://github.com/eODP/data-processing/b

## Look for duplicate samples in all mircopal files

In [6]:
data =[]
for clean_data_path in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")
    
    for path in raw_csvs:
        content = pd.read_csv(path, usecols=['Sample'])
        content.dropna(inplace=True, axis='index', how='all')
        new_df = content[content.duplicated()]
        for index, row in new_df.iterrows():
            data.append({'sample': row['Sample'], 'path': path})
        

In [7]:
len(data)

610

## files with duplicate rows

Files that have identical rows

In [8]:
dup_rows =[]
files_dup_rows = set()

for clean_data_path in LIMS_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")
    
    for path in raw_csvs:
        content = pd.read_csv(path)
        content.dropna(inplace=True, axis='index', how='all')
        new_df = content[content.duplicated()]
        
        for index, row in new_df.iterrows():
            dup_rows.append({'sample': row['Sample'], 'path': path})
            files_dup_rows.add(path)
        

In [9]:
len(dup_rows)

104

In [10]:
len(files_dup_rows)

33

In [11]:
new_df = pd.DataFrame(dup_rows)
new_df.to_csv('tmp/csvs_with_duplicate_rows.csv', index=False)

## Missing samples names

Look for files that have rows with no sample names

In [68]:
missing_samples = set()

for clean_data_path in LIMS_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

    for path in raw_csvs:
        content = pd.read_csv(path)
        content.dropna(inplace=True, axis='index', how='all')
        
        if sum(content.isna()['Sample']) > 0:  
            missing_samples.add(path)

In [69]:
len(missing_samples)

5

In [71]:
df = pd.DataFrame({"path": list(missing_samples)})
df.to_csv('tmp/csvs_with_missing_samples.csv', index=False)

## Compare sample name with exp...extra columns

In [13]:
def create_temp_sample_name(df):
    """Uses Exp...A/W columns to create a name for a sample"""
    names = {"Exp", "Site", "Hole", "Core", "Type", "Section", "A/W"}
    if names.issubset(df.columns):
        df["temp_sample"] = df.apply(
            lambda row: create_sample_name_for_row(row, df.columns), axis=1
        )

    else:
        raise ValueError("File does not have the expected columns.")

In [14]:
path = 'cleaned_data/Micropal_CSV_1/375_U1518F_planktic_forams.csv'
content = pd.read_csv(path, dtype=str)

create_temp_sample_name(content)

content['valid_sample_name'] = content['Sample'] == content['temp_sample']

if sum(content['valid_sample_name']) > 0:
    print(content.head())

                       Sample Top [cm] Bottom [cm] Top Depth [m]  \
0  375-U1518F-2R-CC-PAL-FORAM        0          10         199.8   
1  375-U1518F-3R-CC-PAL-FORAM        0          10        212.49   
2  375-U1518F-4R-CC-PAL-FORAM        0          10        219.21   
3  375-U1518F-5R-CC-PAL-FORAM        0          10        229.91   
4  375-U1518F-6R-CC-PAL-FORAM        0          13        239.08   

  Bottom Depth [m] Preservation Group abundance General comment  \
0            199.9           VG               D             NaN   
1           212.59           VG               D             NaN   
2           219.31       VG (M)               A             NaN   
3           230.01           VG               A             NaN   
4           239.21           VG               D             NaN   

  Beella digitata Beella praedigitata  ... Hole Core Type Section  A/W  \
0             NaN                 NaN  ...    F    2    R      CC  PAL   
1             NaN                 NaN  .

In [6]:
for directory in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{directory}/*.csv")
    for path in raw_csvs:
#         print(path)
        pass

path = 'cleaned_data/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv'
content = pd.read_csv(path)
content.dropna(inplace=True, axis='index', how='all')
my_create_temp_sample_name(content)

content['invalid_sample_name'] = content['Sample'] != content['temp_sample']

if sum(content['invalid_sample_name']) > 0:
    print(path)

#             print(content[['Sample', 'temp_sample']])

----
363-U1482A-1H-CC-nan
----
363-U1482A-2H-CC-nan
----
363-U1482A-3H-CC-nan
----
363-U1482A-4H-CC-nan
----
363-U1482A-5H-CC-nan
----
363-U1482A-6H-CC-nan
----
363-U1482A-7H-CC-nan
----
363-U1482A-8H-CC-nan
----
363-U1482A-9H-CC-nan
----
363-U1482A-10H-CC-nan
----
363-U1482A-11H-CC-nan
----
363-U1482A-12H-CC-nan
----
363-U1482A-13H-CC-nan
----
363-U1482A-14H-CC-nan
----
363-U1482A-15H-CC-nan
----
363-U1482A-16H-CC-nan
----
363-U1482A-17H-CC-nan
----
363-U1482A-18H-CC-nan
----
363-U1482A-19H-CC-nan
----
363-U1482A-20H-CC-nan
----
363-U1482A-21H-CC-nan
----
363-U1482A-22H-CC-nan
----
363-U1482A-23H-CC-nan
----
363-U1482A-24H-CC-nan
----
363-U1482A-25H-CC-nan
----
363-U1482A-26H-CC-nan
----
363-U1482A-27H-CC-nan
----
363-U1482A-28H-CC-nan
----
363-U1482A-29H-CC-nan
----
363-U1482A-30H-CC-nan
----
363-U1482A-31H-CC-nan
----
363-U1482A-32H-CC-nan
----
363-U1482A-33H-CC-nan
----
363-U1482A-34H-CC-nan
----
363-U1482A-35H-CC-nan
----
363-U1482A-36H-CC-nan
----
363-U1482A-37H-CC-nan
----
363-U

## duplicate sample

In [78]:
data = []
files = set()

for directory in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{directory}/*.csv")
    for path in raw_csvs:
        cols = ['Sample', 'Top [cm]', 'Top Depth [m]', 'Bottom [cm]', 'Bottom Depth [m]',
               'Zone name', 'Zone name (short)', 'Extra Sample ID Data']
        df = pd.read_csv(path, usecols = cols)
        new_df = df[df.duplicated()]
        for index, row in new_df.iterrows():
            data.append({'sample': row['Sample'], 'path': path})
            files.add(path)

In [79]:
len(data)

251

In [80]:
len(files)

78

In [81]:
new_df = pd.DataFrame(data)
new_df.to_csv('tmp/csvs_with_duplicate_sample_names.csv', index=False)

## gather problematic files for PI 

In [12]:
def add_paths(metadata_path, output_base_path):
    df = pd.read_csv(metadata_path)
    df['raw_data_path'] = ''
    raw_data_index = df.columns.get_loc('raw_data_path')
    df['output_path'] = ''
    output_index = df.columns.get_loc('output_path')
    df['relative_path'] = ''
    relative_index = df.columns.get_loc('relative_path')
    
    for index, row in df.iterrows():
        parts = row['path'].split('/')
        original_directory = parts[1]
        filename = parts[2]

        if original_directory == 'Micropal_CSV_1':
            directory = 'DESC Micropal CSV 1'
        elif original_directory == 'Micropal_CSV_2':
            directory = 'DESC Micropal CSV 2'
        elif original_directory == 'Micropal_CSV_3':
            directory = 'DESC Micropal CSV 3'
        elif original_directory == 'Micropal_CSV_revised':
            directory = 'DESC Micropal CSV revised'
        else:
            directory = 'DESC-Lithology-CSV'
            
        
        df.iloc[index, raw_data_index]  = f'raw_data/{directory}/{filename}'
        df.iloc[index, output_index]  = f'{output_base_path}/{directory}/{filename}'
        df.iloc[index, relative_index]  = f'{directory}/{filename}'


    df.to_csv(metadata_path, index=False)
    
def copy_files(metadata_path):
    df = pd.read_csv(metadata_path)
    
    directories = [re.sub('/[A-Za-z0-9\-_ ]+\.csv$', '', path)for path in list(df['output_path'])]
    unique_directories = set(directories)
    for directory in unique_directories:
        os.makedirs(directory)
        
    for index, row in df.iterrows():
        shutil.copy(row['raw_data_path'], row['output_path'])
        

def create_sample_name(df):
    """Uses Exp...A/W columns to create a name for a sample"""
    names = {"Exp", "Site", "Hole", "Core", "Type", "Section", "A/W"}
    if names.issubset(df.columns):
        df["Temp_Sample"] = df.apply(
            lambda row: create_sample_name_for_row(row, df.columns), axis=1
        )

    else:
        raise ValueError("File does not have the expected columns.")
        

def process_files(metadata_path):
    df = pd.read_csv(metadata_path)
    for index, row in df.iterrows():
        content = pd.read_csv(row['output_path'], dtype=str)
        
        if "Sample" in content.columns:
            pass
        elif "Label ID" in content.columns:
            pass
        else:
            content['Temp_Sample'] = ''
            create_sample_name(content)
            
        content = csv_cleanup(content, row['output_path'])
        content.to_csv(row['output_path'], index=False)


In [10]:
metadata_path = 'tmp/csvs_with_duplicate_sample_names.csv'
output_base_path = 'tmp/duplicate_samples'

add_paths(metadata_path, output_base_path)
copy_files(metadata_path)
process_files(metadata_path)


In [11]:
metadata_path = 'tmp/csvs_with_missing_samples.csv'
output_base_path = 'tmp/missing_samples'

add_paths(metadata_path, output_base_path)
copy_files(metadata_path)
process_files(metadata_path)

## find all taxon groups

In [5]:
taxon_groups = set()

for directory in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{directory}/*.csv")
    for path in raw_csvs:
        
        parts = path.split('/')
        filename = parts[2]
        partial_path = '/'.join(parts[1:3])
        taxon_group = extract_taxon_group_from_filename(filename)
        taxon_groups.add(taxon_group)


In [6]:
taxon_groups

{'benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates'}