# QC samples

In [1]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

from scripts.shared_utils import (
    log_df
)

In [4]:
# Show the cleaned-data directory imported from config.
CLEAN_DATA_DIR

PosixPath('../../output/cleaned_data')

In [5]:
# Cleaned LIMS micropal CSV folders inspected by the checks in this notebook.
clean_data_paths = [
    CLEAN_DATA_DIR / folder
    for folder in (
        'LIMS/Micropal_CSV_1',
        'LIMS/Micropal_CSV_2',
        'LIMS/Micropal_CSV_3',
        'LIMS/Micropal_CSV_4',
        'LIMS/Micropal_CSV_revised',
    )
]

# Date stamp embedded in the taxa file names below.
date = '2022-04-28'

# Taxa list and crosswalk CSVs under OUTPUT_DIR/taxa/LIMS for that date.
taxa_list_file = OUTPUT_DIR / 'taxa' / 'LIMS' / f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = OUTPUT_DIR / 'taxa' / 'LIMS' / f"taxa_crosswalk_{date}.csv"


## check if all files are imported

In [24]:
# Collect every CSV under the cleaned-data folders, keeping only the part of
# each path that follows 'cleaned_data/'.
files = [
    csv_path.split('cleaned_data/')[1]
    for clean_data_path in clean_data_paths
    for csv_path in glob.glob(f"{clean_data_path}/*.csv")
]

files[:3]

['LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv',
 'LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv',
 'LIMS/Micropal_CSV_1/375_U1518F_planktic_forams.csv']

In [28]:
# Number of CSV paths collected from clean_data_paths.
len(files)

1253

In [53]:
# Distinct data_source_notes values recorded for 'micropal csv' samples.
sql = """
select distinct(data_source_notes) as file
from samples 
where data_source_type = 'micropal csv';
"""

db_files = [record['file'] for record in db.fetch_all(sql)]

db_files[:3]

['LIMS/Micropal_CSV_3/339_nannofossils_U1387C.csv',
 'LIMS/Micropal_CSV_4/317_U1353_planktic_forams.csv',
 'LIMS/Micropal_CSV_3/341_diatoms_U1417E.csv']

In [54]:
# Number of distinct data_source_notes values returned by the query above.
len(db_files)

1253

In [55]:
# Symmetric difference: names present in only one of the two lists, i.e. CSVs on
# disk with no rows in the database as well as database entries with no CSV on
# disk. An empty set means the two lists contain exactly the same files.
set(files) ^ set(db_files)

set()

## check if taxa columns for all files are imported

get files with no samples_taxa

In [60]:
# data_source_notes values that have 'micropal csv' samples but do not appear
# in samples_taxa.
sql = """
select distinct(data_source_notes) from samples where data_source_type = 'micropal csv'
except
select distinct(data_source_notes) from samples_taxa;
"""

db_files = [record['data_source_notes'] for record in db.fetch_all(sql)]

db_files[:3]

['LIMS/Micropal_CSV_2/371_U1511B_benthic_forams.csv',
 'LIMS/Micropal_CSV_2/371_U1511A_benthic_forams.csv',
 'LIMS/Micropal_CSV_2/346_U1423C_nannofossils.csv']

In [61]:
# Number of files that have samples but no samples_taxa rows.
len(db_files)

48

get all verbatim_name

In [62]:
# Every distinct verbatim_name stored in taxa_crosswalk.
sql = """
select distinct(verbatim_name)
from taxa_crosswalk;
"""
taxa = [record['verbatim_name'] for record in db.fetch_all(sql)]

taxa[:3]

['Valkyria pukapuka', 'Spirotextularia fistulosa', 'Monalysidium spp.']

print file name if there are taxa columns with abundance values

In [63]:
# For each file listed in db_files, drop columns that are entirely empty and
# report the file if any remaining column name matches a known taxon name.
known_taxa = set(taxa)
errors = False

for file in db_files:
    path = CLEAN_DATA_DIR/file
    df = pd.read_csv(path).dropna(axis=1, how='all')

    if known_taxa.intersection(df.columns):
        print(path)
        errors = True

if not errors:
    print('all files ok')

all files ok


## Look for duplicate sample names in all micropal files

In [10]:
# Record one entry per repeated 'Sample' value (every occurrence after the
# first within a file) together with the file it was found in.
data = []
# NOTE: target_columns is not referenced anywhere else in this cell.
target_columns = [
    'Sample',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path)
        repeated = content[content.duplicated(subset=['Sample'])]
        data.extend(
            {'sample': row['Sample'], 'path': path}
            for _, row in repeated.iterrows()
        )

In [11]:
# Turn the collected duplicate records into a DataFrame and show its
# (rows, columns) shape.
new_df = pd.DataFrame(data)
new_df.shape

(636, 2)

In [12]:
# Display the table of duplicated sample names and their source files.
new_df

Unnamed: 0,sample,path
0,363-U1487A-4H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
1,363-U1487A-5H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
2,363-U1483A-1H-1-A 0/0-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
3,363-U1483A-9H-6-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
4,363-U1483A-11H-2-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
...,...,...
631,323-U1345A-1H-1-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
632,323-U1345A-1H-2-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
633,323-U1345A-1H-2-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
634,323-U1345A-2H-1-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...


In [14]:
# new_df.to_csv('../../output/tmp/dup_sample_names.csv')

## import all samples into db

In [37]:
import math

In [39]:
def _sql_literal(value):
    """Render a Python value as a SQL literal for the INSERT statement below.

    Missing values (None / NaN) become NULL, strings are single-quoted with
    embedded single quotes doubled, and anything else is interpolated via str().
    """
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return 'NULL'
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    return str(value)


# Build one INSERT per sample row of every cleaned CSV. Rows whose 'Sample' is
# not a string or whose 'Top [cm]' is not numeric are printed for inspection.
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Identify the source file by its path relative to CLEAN_DATA_DIR
        # (e.g. 'LIMS/Micropal_CSV_1/x.csv'). Picking a fixed component of
        # '../../output/cleaned_data/...' would yield 'output' for every file.
        filename = Path(path).relative_to(CLEAN_DATA_DIR).as_posix()
        content = pd.read_csv(path)

        for index, row in content.iterrows():
            sample_name = row['Sample']
            top_cm = row['Top [cm]']

            if type(sample_name) is str and type(top_cm) in (int, float):
                # A missing top offset is stored as 0.
                top = 0 if math.isnan(top_cm) else top_cm
                sample = sample_name.strip()
                # Values go through _sql_literal so quotes in names cannot break
                # the statement and NaN measurements are written as NULL.
                # NOTE(review): prefer a parameterized query if db.execute
                # supports one -- confirm against the db module.
                sql = (
                    "INSERT INTO staging.samples "
                    "(name,top,bottom,top_depth,bottom_depth, created_at, data_source_notes) "
                    f"VALUES ({_sql_literal(sample)}, {_sql_literal(top)}, "
                    f"{_sql_literal(row['Bottom [cm]'])}, "
                    f"{_sql_literal(row['Top Depth [m]'])}, "
                    f"{_sql_literal(row['Bottom Depth [m]'])}, "
                    f"now(), {_sql_literal(filename)});"
                )
                # Execution is left disabled; uncomment to actually insert.
                # db.execute(sql);
            else:
                print(row['Sample'], row['Top [cm]'], row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path )


print('done')
