# QA LIMS files

Do some basic QA on the LIMS files.

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import shutil
import re
import os
import numpy as np
import pandas as pd

from normalize_data import (
    check_duplicate_columns,
    extract_taxon_group_from_filename,
    csv_cleanup,
    create_sample_name_for_row
    
)

sys.path.append('../')
import db 

In [10]:
# Directory holding the cleaned lithology CSVs.
LIMS_lith_paths = [os.path.join('cleaned_data', 'Lithology_CSV')]

# Directories holding the cleaned micropal CSVs.
LIMS_paleo_paths = [
    os.path.join('cleaned_data', directory_name)
    for directory_name in (
        'Micropal_CSV_1',
        'Micropal_CSV_2',
        'Micropal_CSV_3',
        'Micropal_CSV_revised',
    )
]

# Every LIMS directory, lithology first.
LIMS_paths = [*LIMS_lith_paths, *LIMS_paleo_paths]

# Metadata CSVs for the micropal and lithology files.
taxa_meta, lith_meta = (
    os.path.join('cleaned_data', 'metadata', 'LIMS', metadata_name)
    for metadata_name in ('Micropal_changes.csv', 'Lithology_changes.csv')
)


## duplicate column names

Check whether any CSV file has duplicate column names.

In [6]:
def duplicate_columns(directories, file_extension='csv'):
    """Run ``check_duplicate_columns`` on every matching file under the directories.

    Args:
        directories: iterable of directory paths; each one is searched
            recursively.
        file_extension: extension (without the dot) of the files to read.

    Each file is loaded with pandas and columns that contain no values at all
    are dropped before the frame is handed to ``check_duplicate_columns``.
    """
    for directory in directories:
        pattern = f"{directory}/**/*.{file_extension}"

        for path in glob.glob(pattern, recursive=True):
            content = pd.read_csv(path).dropna(axis='columns', how='all')
            check_duplicate_columns(content, path)

### one file

In [7]:
# Spot-check one file: load it, drop the columns that hold no values,
# then run the duplicate-column check on what is left.
path = 'cleaned_data/Micropal_CSV_3/341_benthic_forams_U1417B.csv'
content = pd.read_csv(path).dropna(axis='columns', how='all')

df = check_duplicate_columns(content, path)

cleaned_data/Micropal_CSV_3/341_benthic_forams_U1417B.csv, Type: duplicate columns have different values


### LIMS: Leg 317 - present

In [8]:
duplicate_columns(LIMS_paths)

cleaned_data/Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns have different values
cleaned_data/Micropal_CSV_3/341_benthic_forams_U1417B.csv, Type: duplicate columns have different values


### Janus: Leg 130 - 312

### NOAA: Leg 1 - 129

## Look for files that contain certain headers


In [12]:
# Header names that refer to color.
colors = {
    'COLOR',
    'Clast color',
    'Color',
    'Color (name)',
    'Color code',
    'Color(name)',
    'LITH 1 color',
    'LITH 2 color',
    'Lithology color',
    'Lithology color (Munsell)',
    'Lithology color (simple)',
    'MAJ Lith. color',
    'MAJ Lithology color',
    'MAJ lith color (simple)',
    'MIN Lith. color',
    'MIN lith color (simple)'
}

# Header names that refer to zones.
zones = {
    'Zone name',
    'Zone name (short)'
}

# The header set that the searches in the following cells intersect with each
# file's columns; assign `colors` here to search for the color headers instead.
keywords = zones

Process all files in the directories on disk.

In [15]:
# Collect every micropal CSV whose non-empty columns include at least one of
# the headers in `keywords`. Previously the matching branch was a bare `pass`,
# so every file was read and the result was thrown away.
files_with_keywords = []

for clean_data_path in LIMS_paleo_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Columns without any value are dropped first so that an empty
        # keyword column does not count as a match.
        content = pd.read_csv(path, dtype=str).dropna(how="all", axis="columns")

        if keywords.intersection(content.columns):
            files_with_keywords.append(path)

len(files_with_keywords)
        

Process all files listed in a CSV.

In [24]:
# NOTE: this metadata file is written by the "duplicate samples" section
# further down, so that section has to be run before this cell.
metadata_path = 'tmp/csvs_with_duplicate_sample_names.csv'
df = pd.read_csv(metadata_path)
paths = list(df['path'].unique())

for path in paths:
    # Use a separate name for the per-file frame so `df` keeps pointing at the
    # metadata table; the next cell reads df['path'] and would fail otherwise.
    content = pd.read_csv(path, dtype=str).dropna(how="all", axis="columns")

    if keywords.intersection(content.columns):
        print(path)

cleaned_data/Micropal_CSV_1/363-U1487A-nannofossils.csv
cleaned_data/Micropal_CSV_1/363-U1483A-nannofossils.csv
cleaned_data/Micropal_CSV_1/363-U1489C-nannofossils.csv
cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv
cleaned_data/Micropal_CSV_1/363-U1484A-nannofossils.csv
cleaned_data/Micropal_CSV_1/363-U1482A-planktic_forams.csv
cleaned_data/Micropal_CSV_1/363-U1482C-nannofossils.csv
cleaned_data/Micropal_CSV_2/351_U1438E_radiolarians.csv
cleaned_data/Micropal_CSV_2/353_U1448A_diatoms.csv
cleaned_data/Micropal_CSV_2/353_U1446A_nannofossils.csv
cleaned_data/Micropal_CSV_2/371_U1509A_nannofossils.csv
cleaned_data/Micropal_CSV_2/351_U1438A_radiolarians.csv
cleaned_data/Micropal_CSV_2/354_U1451B_planktic_forams.csv
cleaned_data/Micropal_CSV_2/356-U1462C_planktic_forams.csv
cleaned_data/Micropal_CSV_2/354_U1451A_planktic_forams.csv
cleaned_data/Micropal_CSV_2/371_U1510A_nannofossils.csv
cleaned_data/Micropal_CSV_2/353_U1443B_diatoms.csv
cleaned_data/Micropal_CSV_2/362_U1480E_diato

In [23]:
list(df['path'].unique())

['cleaned_data/Micropal_CSV_1/363-U1487A-nannofossils.csv',
 'cleaned_data/Micropal_CSV_1/363-U1483A-nannofossils.csv',
 'cleaned_data/Micropal_CSV_1/363-U1489C-nannofossils.csv',
 'cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv',
 'cleaned_data/Micropal_CSV_1/363-U1484A-nannofossils.csv',
 'cleaned_data/Micropal_CSV_1/363-U1482A-planktic_forams.csv',
 'cleaned_data/Micropal_CSV_1/363-U1482C-nannofossils.csv',
 'cleaned_data/Micropal_CSV_1/318_U1359A_Planktic_Forams.csv',
 'cleaned_data/Micropal_CSV_1/318_U1356A_Diatoms_1.csv',
 'cleaned_data/Micropal_CSV_1/320_U1331B_Nannofossils.csv',
 'cleaned_data/Micropal_CSV_2/346_U1430A_nannofossils.csv',
 'cleaned_data/Micropal_CSV_2/374_U1525A_diatoms.csv',
 'cleaned_data/Micropal_CSV_2/351_U1438E_radiolarians.csv',
 'cleaned_data/Micropal_CSV_2/353_U1448A_diatoms.csv',
 'cleaned_data/Micropal_CSV_2/353_U1446A_nannofossils.csv',
 'cleaned_data/Micropal_CSV_2/371_U1508B_radiolarians.csv',
 'cleaned_data/Micropal_CSV_2/371_U1509A_nann

In [87]:
path = 'cleaned_data/Lithology_CSV/368_macroscopic_U1502A.csv'
# Read the whole file: with a single row (nrows=1) nothing can ever be a duplicate.
df = pd.read_csv(path)

# Columns that together identify a sample row.
cols = [
    'Sample',
    'Top [cm]',
    'Top Depth [m]',
    'Bottom [cm]',
    'Bottom Depth [m]',
    'Extra Sample ID Data'
]
# True as soon as one row repeats an earlier row on the identifying columns
# (the previous `> 1` test missed files with exactly one duplicated row).
has_dups = bool(df.duplicated(subset=cols).any())

# Dropped after the check so that an empty identifying column cannot make the
# `subset` lookup above fail.
df = df.dropna(how="all", axis="columns")

has_dups

False

## files with duplicate rows

Files that have identical rows

In [3]:
# dup_rows: one record per fully identical (repeated) row.
# files_dup_rows: the files that contain at least one such row.
dup_rows = []
files_dup_rows = set()

for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Rows without any value are ignored before looking for repeats.
        content = pd.read_csv(path).dropna(axis='index', how='all')
        repeated_rows = content[content.duplicated()]

        dup_rows.extend(
            {'sample': repeated['Sample'], 'path': path}
            for _, repeated in repeated_rows.iterrows()
        )
        if len(repeated_rows) > 0:
            files_dup_rows.add(path)
        

In [4]:
len(dup_rows)

8

In [5]:
len(files_dup_rows)

2

In [11]:
new_df = pd.DataFrame(dup_rows)
new_df.to_csv('tmp/csvs_with_duplicate_rows.csv', index=False)

## multiple expedition

In [21]:
# Print every file whose 'Exp' column holds more than one distinct value.
# The loop used to be a no-op, and the disabled check relied on `duplicated()`,
# which flags any file with a repeated 'Exp' value (i.e. almost every file)
# instead of files that mix expeditions.
for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # A callable `usecols` simply yields no columns when 'Exp' is absent,
        # whereas a list of names would raise for such a file.
        exp_df = pd.read_csv(path, dtype=str, usecols=lambda column: column == 'Exp')

        # nunique() ignores missing values, so blank 'Exp' cells do not count
        # as an additional expedition.
        if 'Exp' in exp_df.columns and exp_df['Exp'].nunique() > 1:
            print(path)

# Spot-check a single file; the next cell inspects `df`.
path = 'cleaned_data/Lithology_CSV/361_macroscopic_U1474D.csv'
df = pd.read_csv(path, dtype=str, usecols=['Exp'])

In [24]:
len(df['Exp'].unique())

1

## Missing sample names

Look for files that have rows with no sample names

In [6]:
# Paths of the files that have at least one non-empty row without a 'Sample' value.
missing_samples = set()

for clean_data_path in LIMS_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Completely empty rows are discarded first; they are not "missing samples".
        content = pd.read_csv(path).dropna(axis='index', how='all')

        if content['Sample'].isna().any():
            missing_samples.add(path)

In [7]:
len(missing_samples)

5

In [8]:
df = pd.DataFrame({"path": list(missing_samples)})
df.to_csv('tmp/csvs_with_missing_samples.csv', index=False)

## compare lith row count

In [44]:
raw_csvs = glob.glob(f"{LIMS_lith_paths[0]}/*.csv")

# One record per lithology CSV: its base filename and its number of rows.
# os.path.basename replaces `path.split('/')[2]`, which only worked for paths
# that are exactly three '/'-separated components deep.
files = [
    {
        "file": os.path.basename(path),
        "original_count": len(pd.read_csv(path, dtype=str)),
    }
    for path in raw_csvs
]

    

In [45]:
files
df = pd.DataFrame(files)
df.head()

Unnamed: 0,file,original_count
0,361_macroscopic_U1474D.csv,151
1,323 Core Description Template_U1341A.csv,654
2,361_macroscopic_U1479C.csv,254
3,340_sediment_U1393A.csv,20
4,339_sediment_U1386A.csv,592


In [46]:
df.shape

(518, 2)

In [50]:
sql = """
select count(*) as db_count, data_source_notes as file
from  samples
where samples.data_source_type = 'lithology csv'
group by data_source_notes;
"""
db_df = pd.read_sql(sql, db.conn)
db_df.head()

Unnamed: 0,db_count,file
0,49,317_Lithostratigraphy_U1351A.csv
1,499,317_Lithostratigraphy_U1351B.csv
2,75,317_Lithostratigraphy_U1352A.csv
3,740,317_Lithostratigraphy_U1352B.csv
4,768,317_Lithostratigraphy_U1352C.csv


In [51]:
db_df.shape

(516, 2)

In [49]:
merged_db_df = pd.merge(df, db_df, on="file", how="outer")
merged_db_df.head()

Unnamed: 0,file,original_count,db_count
0,361_macroscopic_U1474D.csv,151,151.0
1,323 Core Description Template_U1341A.csv,654,611.0
2,361_macroscopic_U1479C.csv,254,254.0
3,340_sediment_U1393A.csv,20,20.0
4,339_sediment_U1386A.csv,592,592.0


In [52]:
merged_db_df.shape

(518, 3)

In [54]:
merged_db_df['compare'] = merged_db_df['original_count'] == merged_db_df['db_count']
merged_db_df.head()

Unnamed: 0,file,original_count,db_count,compare
0,361_macroscopic_U1474D.csv,151,151.0,True
1,323 Core Description Template_U1341A.csv,654,611.0,False
2,361_macroscopic_U1479C.csv,254,254.0,True
3,340_sediment_U1393A.csv,20,20.0,True
4,339_sediment_U1386A.csv,592,592.0,True


In [55]:
merged_db_df[merged_db_df['compare'] == False]

Unnamed: 0,file,original_count,db_count,compare
1,323 Core Description Template_U1341A.csv,654,611.0,False
5,320 Core Description_U1332A.csv,208,207.0,False
13,334_sediment_U1378B.csv,756,744.0,False
26,323 Core Description Template_U1340D.csv,67,65.0,False
29,323 Core Description Template_U1341B.csv,893,845.0,False
...,...,...,...,...
486,323 Core Description Template_U1343E.csv,783,779.0,False
488,320 Core Description_U1331A.csv,247,238.0,False
499,356-U1464B_macroscopic.csv,320,319.0,False
510,374_U1524A_macroscopic.csv,725,710.0,False


## Compare sample name with exp...extra columns

In [3]:
def create_temp_sample_name(df):
    """Add a ``temp_sample`` column built from the Exp...A/W columns.

    The frame is modified in place: each row is passed to
    ``create_sample_name_for_row`` together with the frame's columns, and the
    results are stored in ``df["temp_sample"]``.

    Raises:
        ValueError: if any of Exp, Site, Hole, Core, Type, Section or A/W is
            not a column of ``df``.
    """
    required_columns = {"Exp", "Site", "Hole", "Core", "Type", "Section", "A/W"}
    if not required_columns.issubset(df.columns):
        raise ValueError("File does not have the expected columns.")

    columns = df.columns
    df["temp_sample"] = df.apply(
        lambda row: create_sample_name_for_row(row, columns), axis=1
    )

In [17]:
# One record per file in which 'Sample' differs from the name rebuilt from the
# Exp...A/W columns, together with the number of differing rows.
files = []

for directory in LIMS_paths:
    for path in glob.glob(f"{directory}/*.csv"):
        content = pd.read_csv(path, dtype=str)

        create_temp_sample_name(content)
        # Element-wise `!=` treats missing values as unequal, so rows without
        # a 'Sample' value are counted as mismatches too.
        mismatches = content['Sample'] != content['temp_sample']

        invalid_count = int(mismatches.sum())
        if invalid_count > 0:
            # The stray `base` expression that followed this append raised a
            # NameError on the first file with a mismatch; it has been removed.
            files.append({'path': path, 'invalid_sample_name': invalid_count})


In [18]:
files

[{'path': 'cleaned_data/Lithology_CSV/339_sediment_U1390A.csv',
  'invalid_sample_name': 581},
 {'path': 'cleaned_data/Lithology_CSV/334_sediment_U1378B.csv',
  'invalid_sample_name': 756},
 {'path': 'cleaned_data/Lithology_CSV/349_macroscopic_U1431E.csv',
  'invalid_sample_name': 1355},
 {'path': 'cleaned_data/Lithology_CSV/320 Core Description_U1332C.csv',
  'invalid_sample_name': 226},
 {'path': 'cleaned_data/Lithology_CSV/339_sediment_U1386C.csv',
  'invalid_sample_name': 169},
 {'path': 'cleaned_data/Lithology_CSV/344_sediment_U1413B.csv',
  'invalid_sample_name': 2},
 {'path': 'cleaned_data/Lithology_CSV/340_sediment_U1399A.csv',
  'invalid_sample_name': 1},
 {'path': 'cleaned_data/Lithology_CSV/340_sediment_U1395B.csv',
  'invalid_sample_name': 1},
 {'path': 'cleaned_data/Lithology_CSV/340_sediment_U1399B.csv',
  'invalid_sample_name': 670},
 {'path': 'cleaned_data/Lithology_CSV/351_macroscopic_U1438D.csv',
  'invalid_sample_name': 11},
 {'path': 'cleaned_data/Lithology_CSV/329_

## duplicate samples

In [88]:
# data: one record per repeated sample row; files: paths containing any repeat.
data = []
files = set()

# Columns compared when looking for repeated samples. The zone columns
# ('Zone name', 'Zone name (short)') are left out of the comparison.
cols = ['Sample', 'Top [cm]', 'Top Depth [m]', 'Bottom [cm]', 'Bottom Depth [m]',
        'Extra Sample ID Data']

for directory in LIMS_paleo_paths:
    for path in glob.glob(f"{directory}/*.csv"):
        df = pd.read_csv(path, usecols=cols)
        repeated_samples = df[df.duplicated()]

        data.extend(
            {'sample': repeated['Sample'], 'path': path}
            for _, repeated in repeated_samples.iterrows()
        )
        if len(repeated_samples) > 0:
            files.add(path)

In [89]:
len(data)

469

In [90]:
len(files)

99

In [91]:
new_df = pd.DataFrame(data)
new_df.to_csv('tmp/csvs_with_duplicate_sample_names.csv', index=False)

## gather problematic files for PI 

In [12]:
def add_paths(metadata_path, output_base_path):
    """Add raw-data, output and relative path columns to a metadata CSV.

    The CSV at ``metadata_path`` must have a ``path`` column whose values end
    in ``<cleaned directory>/<filename>``. For every row the cleaned directory
    name is translated to its raw-data directory name and three columns are
    written:

    * ``raw_data_path``: ``raw_data/<directory>/<filename>``
    * ``output_path``: ``<output_base_path>/<directory>/<filename>``
    * ``relative_path``: ``<directory>/<filename>``

    The metadata file is overwritten in place.

    Args:
        metadata_path: path of the metadata CSV to read and rewrite.
        output_base_path: directory prefix used for ``output_path``.
    """
    # Cleaned directory name -> raw-data directory name. Anything that is not
    # listed here is treated as a lithology file.
    raw_directories = {
        'Micropal_CSV_1': 'DESC Micropal CSV 1',
        'Micropal_CSV_2': 'DESC Micropal CSV 2',
        'Micropal_CSV_3': 'DESC Micropal CSV 3',
        'Micropal_CSV_revised': 'DESC Micropal CSV revised',
    }

    df = pd.read_csv(metadata_path)

    raw_data_paths = []
    output_paths = []
    relative_paths = []
    for path in df['path']:
        # Take the filename and its parent directory from the end of the path
        # rather than from fixed positions, so the lookup does not depend on
        # how many directories precede them.
        filename = os.path.basename(path)
        original_directory = os.path.basename(os.path.dirname(path))
        directory = raw_directories.get(original_directory, 'DESC-Lithology-CSV')

        raw_data_paths.append(f'raw_data/{directory}/{filename}')
        output_paths.append(f'{output_base_path}/{directory}/{filename}')
        relative_paths.append(f'{directory}/{filename}')

    # Whole-column assignment avoids using index labels as iloc positions.
    df['raw_data_path'] = raw_data_paths
    df['output_path'] = output_paths
    df['relative_path'] = relative_paths

    df.to_csv(metadata_path, index=False)
    
def copy_files(metadata_path):
    """Copy every file listed in a metadata CSV to its output location.

    The CSV at ``metadata_path`` must have ``raw_data_path`` and
    ``output_path`` columns. The parent directory of each output path is
    created if needed, then each raw file is copied to its output path.

    Args:
        metadata_path: path of the metadata CSV to read.
    """
    df = pd.read_csv(metadata_path)

    # os.path.dirname works for any filename; the previous regex left the
    # filename in place when it contained characters outside its class, which
    # made a directory appear at the file's own path.
    output_directories = {os.path.dirname(path) for path in df['output_path']}
    for directory in output_directories:
        # An output path without a directory part needs nothing created.
        if directory:
            # exist_ok lets the cell be re-run without failing on existing directories.
            os.makedirs(directory, exist_ok=True)

    for raw_data_path, output_path in zip(df['raw_data_path'], df['output_path']):
        shutil.copy(raw_data_path, output_path)
        

def create_sample_name(df):
    """Add a ``Temp_Sample`` column built from the Exp...A/W columns.

    The frame is modified in place: ``create_sample_name_for_row`` is called
    for each row with the frame's columns, and the results are stored in
    ``df["Temp_Sample"]``.

    Raises:
        ValueError: if Exp, Site, Hole, Core, Type, Section and A/W are not
            all present among the columns of ``df``.
    """
    absent = {"Exp", "Site", "Hole", "Core", "Type", "Section", "A/W"} - set(df.columns)
    if absent:
        raise ValueError("File does not have the expected columns.")

    headers = df.columns

    def name_for(row):
        return create_sample_name_for_row(row, headers)

    df["Temp_Sample"] = df.apply(name_for, axis=1)
        

def process_files(metadata_path):
    """Clean up, in place, every copied file listed in a metadata CSV.

    For each ``output_path`` in the CSV at ``metadata_path`` the file is read
    as strings. When it has neither a ``Sample`` nor a ``Label ID`` column, a
    ``Temp_Sample`` column is added through ``create_sample_name``. The frame
    is then passed through ``csv_cleanup`` and written back to the same path.

    Args:
        metadata_path: path of the metadata CSV to read.
    """
    metadata = pd.read_csv(metadata_path)

    for record in metadata.to_dict('records'):
        output_path = record['output_path']
        content = pd.read_csv(output_path, dtype=str)

        has_identifier = "Sample" in content.columns or "Label ID" in content.columns
        if not has_identifier:
            # The column exists (empty) before the names are generated.
            content['Temp_Sample'] = ''
            create_sample_name(content)

        content = csv_cleanup(content, output_path)
        content.to_csv(output_path, index=False)


In [10]:
metadata_path = 'tmp/csvs_with_duplicate_sample_names.csv'
output_base_path = 'tmp/duplicate_samples'

add_paths(metadata_path, output_base_path)
copy_files(metadata_path)
process_files(metadata_path)


In [11]:
metadata_path = 'tmp/csvs_with_missing_samples.csv'
output_base_path = 'tmp/missing_samples'

add_paths(metadata_path, output_base_path)
copy_files(metadata_path)
process_files(metadata_path)

## find all taxon groups

In [5]:
# Every taxon group that `extract_taxon_group_from_filename` derives from the
# micropal file names.
taxon_groups = set()

for directory in LIMS_paleo_paths:
    for path in glob.glob(f"{directory}/*.csv"):
        # os.path.basename replaces `path.split('/')[2]`, which depended on the
        # path being exactly three components deep; the unused `partial_path`
        # local has been dropped.
        filename = os.path.basename(path)
        taxon_groups.add(extract_taxon_group_from_filename(filename))


In [6]:
taxon_groups

{'benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates'}

## count metadata

In [15]:
taxa_df = pd.read_csv(taxa_meta)
taxa_df.shape

(1116, 14)

In [16]:
taxa_df.columns

Index(['file', 'path', 'taxon_group', 'empty_rows_columns',
       'remove_identical_rows', 'remove_identical_columns',
       'standardize_headers', 'add_expedition_aw_cols', 'add_sample_column',
       'add_missing_cols', 'clean_up_taxa_values',
       'clean_up_taxa_metadata_values', 'update_zones', 'add_missing_zone'],
      dtype='object')

https://stackoverflow.com/questions/53550988/count-occurrences-of-false-or-true-in-a-column-in-pandas

In [13]:
taxa_df.add_expedition_aw_cols.sum()

614

In [14]:
taxa_df.add_sample_column.sum()

29