# QA LIMS files

Do some basic QA on the LIMS files.

In [1]:
import sys
sys.path.append('../scripts/')
import glob

import pandas as pd

from normalize_data import (
    check_duplicate_columns
)

In [2]:
# Directories (relative to this notebook) holding the cleaned LIMS CSV exports.

# Lithology / core-description files.
LIMS_lith_paths = ['cleaned_data/Lithology_CSV']

# Micropaleontology files, one directory per delivery batch.
LIMS_paleo_paths = [
    'cleaned_data/Micropal_CSV_1',
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/Micropal_CSV_3',
    'cleaned_data/Micropal_CSV_revised',
]

# Every LIMS directory: lithology first, then micropal.
LIMS_paths = [*LIMS_lith_paths, *LIMS_paleo_paths]

# Directory holding the taxa lists.
taxa_list = 'cleaned_data/taxa/'

## duplicate column names

Check whether each CSV file has duplicate column names.

In [3]:
def duplicate_columns(directories, file_extension='csv'):
    """Run ``check_duplicate_columns`` on every matching file under the given directories.

    Each directory is searched recursively for ``*.<file_extension>`` files.
    Only the header and the first data row of each file are read; columns
    that are empty in that first row are dropped before the frame is handed
    to ``check_duplicate_columns`` together with the file path. Any reporting
    is done by that helper.

    Parameters
    ----------
    directories : iterable of str
        Root directories to search. An empty iterable is a no-op.
    file_extension : str, default 'csv'
        Extension (without the dot) of the files to inspect.

    Returns
    -------
    None
    """
    for directory in directories:
        pattern = f"{directory}/**/*.{file_extension}"

        # glob yields matches in filesystem-dependent order; sorting keeps the
        # report identical from one run (and one machine) to the next.
        for path in sorted(glob.glob(pattern, recursive=True)):
            first_row = pd.read_csv(path, nrows=1)
            # Build a new frame rather than mutating in place.
            populated = first_row.dropna(axis='columns', how='all')

            check_duplicate_columns(populated, path)

### LIMS: Leg 317 - present

In [4]:
# Check every LIMS directory (lithology + micropal) for duplicate column names.
duplicate_columns(LIMS_paths)

cleaned_data/Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns have different values


### Janus: Leg 130 - 312

### NOAA: Leg 1 - 129

## Look for files that contain taxon name


In [5]:
# Column header to look for in the micropal CSVs.
taxon_name = 'Preservation palynofacies'
# Prefix that turns a notebook-relative path into a link to the file on GitHub.
url = 'https://github.com/eODP/data-processing/blob/master/notebooks/'

for clean_data_path in LIMS_paleo_paths:
    # glob order is filesystem-dependent; sort so the printed list is stable.
    raw_csvs = sorted(glob.glob(f"{clean_data_path}/*.csv"))

    for path in raw_csvs:
        # Only the header is needed, so read a single data row.
        df = pd.read_csv(path, nrows=1)
        # The match is on the header alone: a file is listed whenever the
        # column exists, even if it holds no values. (An earlier
        # `df.dropna(...)` call here discarded its result and never affected
        # the check, so it has been removed.)
        if taxon_name in df.columns:
            print(f'{url}{path}')
        

https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1522A_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1523E_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1525A_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1523B_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1524C_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1524A_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1523A_palynology.csv
https://github.com/eODP/data-processing/blob/master/notebooks/cleaned_data/Micropal_CSV_2/374_U1521A_palynology.csv


## Look for duplicate samples in all micropal files

In [6]:
# One entry per repeated 'Sample' value (every occurrence after the first),
# recorded together with the file it was found in.
data = []

for clean_data_path in LIMS_paleo_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Only the Sample column is needed; blank rows are ignored.
        samples = pd.read_csv(path, usecols=['Sample']).dropna(axis='index', how='all')
        repeated = samples[samples.duplicated()]
        data.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeated.iterrows()
        )
        

In [7]:
# Total number of repeated 'Sample' entries found across the micropal CSVs.
len(data)

610

## files with duplicate rows

Files that have identical rows

In [8]:
# dup_rows: one entry per fully identical row (every occurrence after the first).
# files_dup_rows: the distinct files that contain at least one such row.
dup_rows = []
files_dup_rows = set()

for clean_data_path in LIMS_paths:
    # dup_rows is later written to CSV; sorting the arbitrary glob order keeps
    # that report's row order reproducible between runs.
    raw_csvs = sorted(glob.glob(f"{clean_data_path}/*.csv"))

    for path in raw_csvs:
        # Ignore completely blank rows so they are not reported as duplicates.
        content = pd.read_csv(path).dropna(axis='index', how='all')
        repeated = content[content.duplicated()]

        if not repeated.empty:
            files_dup_rows.add(path)

        dup_rows.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeated.iterrows()
        )
        

In [9]:
# Number of duplicated rows (occurrences beyond the first) across all LIMS CSVs.
len(dup_rows)

104

In [10]:
# Number of distinct files that contain at least one duplicated row.
len(files_dup_rows)

33

In [11]:
# Save the duplicate-row findings (sample name + source file path) for review.
# NOTE(review): presumably the 'tmp/' directory already exists; confirm before a fresh run.
new_df = pd.DataFrame(dup_rows)
new_df.to_csv('tmp/csvs_with_duplicate_rows.csv', index=False)

## Missing sample names

Look for files that have rows with no sample names

In [12]:
# GitHub links to every file that has at least one non-blank row without a
# Sample value.
missing_samples = set()
url = "https://github.com/eODP/data-processing/blob/master/notebooks/"

for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Completely blank rows are dropped first so they are not counted.
        table = pd.read_csv(path).dropna(axis='index', how='all')

        if table['Sample'].isna().any():
            missing_samples.add(url + path)

In [13]:
# Number of files with at least one row that lacks a Sample value.
len(missing_samples)

6

In [14]:
# Save the list of affected files for review.
# Set iteration order for strings changes between interpreter runs (hash
# randomisation), so sort to keep the CSV's row order reproducible.
df = pd.DataFrame({"files with missing sample names": sorted(missing_samples)})
df.to_csv('tmp/csvs_with_missing_samples.csv', index=False)