# QA LIMS files

Do some basic QA on the LIMS files.

In [7]:
import sys
sys.path.append('../scripts/')
import glob

import pandas as pd

from normalize_data import (
    check_duplicate_columns,
    extract_taxon_group_from_filename
)

In [8]:
# Directory holding the cleaned lithology CSV files.
LIMS_lith_paths = ['cleaned_data/Lithology_CSV']

# Directories holding the cleaned micropaleontology CSV files.
LIMS_paleo_paths = [
    'cleaned_data/Micropal_CSV_1',
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/Micropal_CSV_3',
    'cleaned_data/Micropal_CSV_revised',
]

# Every LIMS directory: lithology first, then micropaleontology.
LIMS_paths = [*LIMS_lith_paths, *LIMS_paleo_paths]

# Directory holding the taxa lists.
taxa_list = 'cleaned_data/taxa/'

## duplicate column names

Check whether each CSV file has duplicate column names.

In [11]:
def duplicate_columns(directories, file_extension='csv'):
    """Run the duplicate-column check on every matching file in the directories.

    Each directory is searched recursively for files ending in
    ``file_extension``. Every match is loaded with pandas, columns that
    contain no values at all are dropped, and the remaining table is handed
    to ``check_duplicate_columns`` together with the file path.

    Args:
        directories: iterable of directory paths to search.
        file_extension: extension (without the dot) of the files to check.
    """
    for directory in directories:
        pattern = f"{directory}/**/*.{file_extension}"

        for csv_path in glob.glob(pattern, recursive=True):
            frame = pd.read_csv(csv_path).dropna(axis='columns', how='all')
            check_duplicate_columns(frame, csv_path)

### one file

In [22]:
# Spot-check a single file: load it, discard columns that hold no values,
# then run the duplicate-column check on what remains.
path = 'cleaned_data/Micropal_CSV_3/341_benthic_forams_U1417B.csv'
content = pd.read_csv(path).dropna(axis='columns', how='all')
check_duplicate_columns(content, path)

### LIMS: Leg 317 - present

In [23]:
# Check every CSV found under the lithology and micropal directories in LIMS_paths.
duplicate_columns(LIMS_paths)

cleaned_data/Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns have different values


### Janus: Leg 130 - 312

### NOAA: Leg 1 - 129

## Look for files that contain taxon name


In [63]:
# Print a GitHub link for every micropal CSV whose header contains `taxon_name`.
taxon_name = 'Preservation palynofacies'
url = 'https://github.com/eODP/data-processing/blob/master/notebooks/'

for clean_data_path in LIMS_paleo_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

    for path in raw_csvs:
        # Only the header is needed to test for a column name, so no data
        # rows are read.
        df = pd.read_csv(path, nrows=0)
        if taxon_name in df.columns:
            print(f'{url}{path}')
        

https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1335A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1336A_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1335B_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334B_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1336B_Nannofossils.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_1/320_U1334C_Nannofossils_1.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_3/321_U1338B_Planktic_forams.csv
https://github.com/eODP/data-processing/b

## Look for duplicate samples in all micropal files

In [6]:
# Build one record per repeated 'Sample' value in each micropal CSV.
# `duplicated()` flags every occurrence after the first, so a sample name
# that appears three times contributes two records.
data = []
for clean_data_path in LIMS_paleo_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        samples = pd.read_csv(path, usecols=['Sample']).dropna(axis='index', how='all')
        repeats = samples[samples.duplicated()]
        data.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeats.iterrows()
        )
        

In [7]:
# Number of duplicate-sample records gathered in `data`.
len(data)

610

## files with duplicate rows

Files that have identical rows

In [8]:
# dup_rows: one record (sample name + file path) per fully repeated row.
# files_dup_rows: the distinct files in which such rows were found.
dup_rows = []
files_dup_rows = set()

for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        table = pd.read_csv(path).dropna(axis='index', how='all')
        repeated = table[table.duplicated()]
        if repeated.empty:
            continue

        dup_rows.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeated.iterrows()
        )
        files_dup_rows.add(path)
        

In [9]:
# Number of repeated rows recorded across all files.
len(dup_rows)

104

In [10]:
# Number of distinct files that contain at least one repeated row.
len(files_dup_rows)

33

In [11]:
# Save the repeated-row records (sample name and file path) for later review.
new_df = pd.DataFrame(data=dup_rows)
new_df.to_csv(path_or_buf='tmp/csvs_with_duplicate_rows.csv', index=False)

## Missing sample names

Look for files that have rows with no sample names

In [12]:
# Collect a GitHub link for each file that has a non-empty row whose
# 'Sample' value is missing.
missing_samples = set()
url = "https://github.com/eODP/data-processing/blob/master/notebooks/"

for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Rows that are entirely empty are dropped first so they are not
        # counted as missing sample names.
        table = pd.read_csv(path).dropna(axis='index', how='all')

        if table['Sample'].isna().any():
            missing_samples.add(url + path)

In [13]:
# Number of files with at least one row lacking a sample name.
len(missing_samples)

6

In [14]:
# Save the links of files with missing sample names as a one-column CSV.
df = pd.DataFrame(data={"files with missing sample names": [*missing_samples]})
df.to_csv(path_or_buf='tmp/csvs_with_missing_samples.csv', index=False)

## duplicate sample

In [3]:
# Find rows that repeat an earlier row on all of the sample-identifying
# columns below, recording the sample name and file for each repeat.
data = []
files = set()

cols = ['Sample', 'Top [cm]', 'Top Depth [m]', 'Bottom [cm]', 'Bottom Depth [m]',
        'Zone name', 'Zone name (short)', 'Extra Sample ID Data']

for directory in LIMS_paleo_paths:
    for path in glob.glob(f"{directory}/*.csv"):
        subset = pd.read_csv(path, usecols=cols)
        repeated = subset[subset.duplicated()]
        if repeated.empty:
            continue

        data.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeated.iterrows()
        )
        files.add(path)

In [4]:
# Number of duplicate-sample records gathered in `data`.
len(data)

243

In [5]:
# Number of distinct files that contain at least one duplicate sample.
len(files)

78

In [6]:
# Save the duplicate-sample records (sample name and file path) for later review.
new_df = pd.DataFrame(data=data)
new_df.to_csv(path_or_buf='tmp/csvs_with_duplicate_sample_names.csv', index=False)

## find all taxon groups

In [5]:
import os

# Collect the distinct taxon groups encoded in the micropal CSV file names.
taxon_groups = set()

for directory in LIMS_paleo_paths:
    for path in glob.glob(f"{directory}/*.csv"):
        # Take the file name from the end of the path so the result does not
        # depend on how many directory levels precede it or on the path
        # separator glob returns.
        filename = os.path.basename(path)
        taxon_groups.add(extract_taxon_group_from_filename(filename))


In [6]:
# Display the distinct taxon groups found in the micropal file names.
taxon_groups

{'benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates'}