# QC samples

In [1]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [4]:
CLEAN_DATA_DIR

PosixPath('../../output/cleaned_data')

In [5]:
# Directories of cleaned LIMS micropal CSVs that the QC cells below iterate over.
clean_data_paths = [
    CLEAN_DATA_DIR / batch_dir
    for batch_dir in (
        'LIMS/Micropal_CSV_1',
        'LIMS/Micropal_CSV_2',
        'LIMS/Micropal_CSV_3',
        'LIMS/Micropal_CSV_4',
    )
]

# Date stamp embedded in the names of the taxa files checked below.
date = '2022-04-28'

lims_taxa_dir = OUTPUT_DIR / 'taxa' / 'LIMS'
taxa_list_file = lims_taxa_dir / f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = lims_taxa_dir / f"taxa_crosswalk_{date}.csv"


In [3]:
def log_df(df, row_count=5):
    """Print the shape of ``df`` and return its first ``row_count`` rows.

    The shape goes to stdout; the returned preview is what the notebook
    renders when the call is the last expression in a cell.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head(row_count)
    return preview


## Look for duplicate sample names in all micropal files

In [10]:
# Collect, per file, every row whose 'Sample' value repeats an earlier row
# of that same file. Each hit is stored as {'sample': ..., 'path': ...}.
data = []
target_columns = [
    'Sample',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path)

        # duplicated() keeps the first occurrence and flags only later repeats.
        is_repeat = content['Sample'].duplicated()
        data.extend(
            {'sample': sample_name, 'path': path}
            for sample_name in content.loc[is_repeat, 'Sample']
        )

In [11]:
new_df = pd.DataFrame(data)
new_df.shape

(636, 2)

In [12]:
new_df

Unnamed: 0,sample,path
0,363-U1487A-4H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
1,363-U1487A-5H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
2,363-U1483A-1H-1-A 0/0-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
3,363-U1483A-9H-6-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
4,363-U1483A-11H-2-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
...,...,...
631,323-U1345A-1H-1-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
632,323-U1345A-1H-2-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
633,323-U1345A-1H-2-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...
634,323-U1345A-2H-1-nan,../../output/cleaned_data/LIMS/Micropal_CSV_4/...


In [14]:
# new_df.to_csv('../../output/tmp/dup_sample_names.csv')

## import all samples into db

In [37]:
import math

In [39]:
# Insert every usable sample row from the cleaned micropal CSVs into
# staging.samples.
#
# A row is inserted only when 'Sample' is a string and 'Top [cm]' is an int or
# float; any other row is printed for manual review instead of being inserted.
#
# NOTE(review): the statement is still assembled as text because the signature
# of db.execute is not visible from this notebook. If it accepts bound
# parameters, switch to them and drop the manual quoting below.


def _sql_text(value):
    """Return ``value`` as a single-quoted SQL literal with embedded quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


def _sql_number(value, missing='NULL'):
    """Return a numeric cell unchanged, or ``missing`` when the cell is NaN/None.

    Formatting a NaN float into the statement would otherwise emit the bare
    token ``nan`` in place of a value.
    """
    return missing if pd.isna(value) else value


for clean_data_path in clean_data_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

    for path in raw_csvs:
        # Record the file relative to CLEAN_DATA_DIR, e.g.
        # 'LIMS/Micropal_CSV_1/<file>.csv'. The previous path.split('/')[2]
        # always produced 'output' for paths under '../../output/cleaned_data'.
        filename = Path(path).relative_to(CLEAN_DATA_DIR).as_posix()
        content = pd.read_csv(path)

        for index, row in content.iterrows():
            top_is_number = type(row['Top [cm]']) is int or type(row['Top [cm]']) is float

            if type(row['Sample']) is str and top_is_number:
                sample = _sql_text(row['Sample'].strip())
                # A missing top offset is stored as 0, as before.
                top = _sql_number(row['Top [cm]'], missing=0)
                bottom = _sql_number(row['Bottom [cm]'])
                top_depth = _sql_number(row['Top Depth [m]'])
                bottom_depth = _sql_number(row['Bottom Depth [m]'])

                sql = (
                    "INSERT INTO staging.samples "
                    "(name,top,bottom,top_depth,bottom_depth, created_at, data_source_notes)  "
                    f"VALUES ({sample}, {top} , {bottom} , {top_depth} ,{bottom_depth}, "
                    f"now(), {_sql_text(filename)});"
                )
                db.execute(sql)
            else:
                # Rows that fail the type checks are listed here rather than inserted.
                print(row['Sample'], row['Top [cm]'], row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path )


print('done')


# check taxa files have unique values

In [5]:
# Load only the two columns that the following cell checks for duplicate
# (normalized_name, taxon_group) pairs in the taxa list file.
df = pd.read_csv(taxa_list_file, usecols=['normalized_name', 'taxon_group'])
df.shape
# Row count seen when this notebook was last run: 4676

(4676, 2)

In [6]:
df[df.duplicated(subset=['normalized_name', 'taxon_group'])]

Unnamed: 0,normalized_name,taxon_group


In [8]:
# Load the crosswalk file; the following cell checks that each
# (normalized_name, taxon_group, verbatim_name) triple occurs only once.
# eodp_id is kept for the comparison against the database further down.
df2 = pd.read_csv(taxa_crosswalk_list_file, usecols=['normalized_name', 'taxon_group', 'verbatim_name','eodp_id'])
df2.shape
# Row count seen when this notebook was last run: 5264

(5264, 4)

In [9]:
df2[df2.duplicated(subset=['normalized_name', 'taxon_group', 'verbatim_name'])]

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id


In [4]:
# Fetch every crosswalk row joined to its taxon, then relabel the fields with
# the column names used by the crosswalk CSV so the two can be compared.
sql = """
SELECT * FROM taxa_crosswalk 
JOIN taxa on taxa.id = taxa_crosswalk.taxon_id 

"""
rows = db.fetch_all(sql)
data = [
    {
        'normalized_name': record['name'],
        'taxon_group': record['taxon_group'],
        'verbatim_name': record['original_name'],
        'eodp_id': record['eodp_id'],
    }
    for record in rows
]


len(data)

5263

In [32]:
db_df = pd.DataFrame(data)
db_df.head()

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id
0,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,0
1,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,1
2,Foraminifera indet.,benthic_forams,Others,2
3,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,3
4,Ostracoda indet.,benthic_forams,Ostracoda spp.,4


In [33]:
# eodp_ids present in the crosswalk CSV (df2) that the database query (db_df) did not return.
set(df2['eodp_id']) - set (db_df['eodp_id'])

{3680}

In [34]:
# eodp_ids returned by the database query (db_df) that are absent from the crosswalk CSV (df2).
set(db_df['eodp_id']) - set (df2['eodp_id'])

set()

# check additional species are imported

In [22]:
# Read one radiolarian file with every cell as text, then discard the columns
# that contain no values at all.
path = CLEAN_DATA_DIR / 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv'
df = pd.read_csv(path, dtype=str).dropna(axis="columns", how="all")

log_df(df)

(31, 77)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name,Preservation,Group Abundance,Antarctissa cylindrica,Cycladophora pliocenica,...,Gondwanaria dogieli,Plagiacanthidae indet.,Litharachnium tentorium,Streblacantha circumtexta,Eucyrtidium teuscheri,Verticillata hexacantha,Cypassis irregularis,Prunopyle antarctica,Lithomelissa sp. A,Larcopyle buetschlii
0,374-U1525A-1H-1-IW_MUDLINE,0,0,0.0,0.0,,G,A,,,...,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,Indeterminate due to data source,,
1,374-U1525A-1H-CC-PAL-RADS,0,5,8.57,8.62,,P,B,,,...,,,,,,,,,,
2,374-U1525A-2H-CC-PAL-RADS,0,5,18.72,18.77,,P,Tr,,,...,,,,,,,,,,
3,374-U1525A-3H-CC-PAL-RADS,0,5,28.16,28.21,,P,Tr,,,...,,,,,,,,,,
4,374-U1525A-4H-CC-PAL-RADS,0,5,28.86,28.91,> 0.65 (LAD A. cylindrica),M,Tr,X,,...,,,,,,,,,,


In [23]:
df.columns

Index(['Sample', 'Top [cm]', 'Bottom [cm]', 'Top Depth [m]',
       'Bottom Depth [m]', 'Zone name', 'Preservation', 'Group Abundance',
       'Antarctissa cylindrica', 'Cycladophora pliocenica',
       'Triceraspyris antarctica', 'Eucyrtidium calvertense',
       'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
       'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
       'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
       'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
       'Antarctissa denticulata', 'Antarctissa strelkovi',
       'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
       'Cycladophora bicornis', 'Druppatractus hastatus',
       'Enneaphormis rotula', 'Eucyrtidium inflatum',
       'Hexacontium pachydermum', 'Larcopyle pylomaticus',
       'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
       'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    

In [24]:
# Taxon columns of the radiolarian file, split into two hand-copied lists so
# the cells below can count non-empty entries for each group separately.
# NOTE(review): presumably `cols` holds the columns that preceded an
# 'ADDITIONAL SPECIES' marker column in the source file and `add_cols` the ones
# that followed it -- confirm against the raw CSV.
cols = [
    'Antarctissa cylindrica', 'Cycladophora pliocenica',
    'Triceraspyris antarctica', 'Eucyrtidium calvertense',
    'Helotholus vema', 'Desmospyris spongiosa', 'Cycladophora davisiana',
    'Ceratocyrtis mashae', 'Prunopyle hayesi', 'Actinomma popofskii',
    'Acrosphaera? mercurius', 'Actinomma boreale', 'Actinomma delicatulum',
    'Actinomma leptodermum', 'Actinomma leptodernum longispinum',
    'Antarctissa denticulata', 'Antarctissa strelkovi',
    'Cenosphaera cristata', 'Ceratocyrtis spp.', 'Cornutella profunda',
    'Cycladophora bicornis', 'Druppatractus hastatus',
    'Enneaphormis rotula', 'Eucyrtidium inflatum',
    'Hexacontium pachydermum', 'Larcopyle pylomaticus',
    'Larcopyle weddellium', 'Lithelius nautiloides', 'Lithelius sp. A',
    'Mitrocalpis araneafera', 'Peripyramis circumtexta',
    'Phormacantha hystrix/Plectacantha oikiskos group',
    'Phormostichoartus corbula', 'Pseudodictyophimus gracilipes',
    'Saccospyris antarctica', 'Saccospyris conithorax',
    'Saccospyris praeantarctica', 'Spongopyle osculosa',
    'Spongotrochus glacialis', 'Spongotrochus sp. A Abelmann',
    'Sphaeropyle robusta', 'Prunopyle tetrapila', 'Stylatractus neptunus',
    'Stylochlamidium  venustum', 'Stylodictya spp.', 'Trisulcus nana',
    # 'ADDITIONAL SPECIES' is deliberately left out of the list.
]

# Columns whose non-empty cells are compared below against database rows coded
# 'Indeterminate due to data source'.
add_cols = [
    'Lonchosphaera spicata', 'Poulpus spp.', 'Lithostrobus cuspidatus',
    'Gondwanaria dogieli', 'Plagiacanthidae indet.',
    'Litharachnium tentorium', 'Streblacantha circumtexta',
    'Eucyrtidium teuscheri', 'Verticillata hexacantha',
    'Cypassis irregularis', 'Prunopyle antarctica', 'Lithomelissa sp. A',
    'Larcopyle buetschlii'
]

In [25]:
# One record per non-empty cell in the regular taxon columns, in row-major order.
data = [
    {'Sample': row['Sample'], 'code': row[col], 'taxon': col}
    for _, row in df.iterrows()
    for col in cols
    if pd.notna(row[col])
]

len(data)

114

In [7]:
# Count the samples_taxa rows recorded for this radiolarian file whose code is
# anything other than 'Indeterminate due to data source'.
# NOTE(review): presumably this should equal the number of non-empty cells
# counted from the CSV for the `cols` columns -- confirm.
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code != 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[114]

In [26]:
# One record per non-empty cell in the additional-species columns, in row-major order.
data = [
    {'Sample': row['Sample'], 'code': row[col], 'taxon': col}
    for _, row in df.iterrows()
    for col in add_cols
    if pd.notna(row[col])
]

len(data)

17

In [8]:
# Count the samples_taxa rows recorded for this radiolarian file whose code is
# exactly 'Indeterminate due to data source'.
# NOTE(review): presumably this should equal the number of non-empty cells
# counted from the CSV for the `add_cols` columns -- confirm.
sql = """
SELECT count(*) 
FROM  samples_taxa
WHERE (data_source_notes = 'LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv')
AND (code = 'Indeterminate due to data source') 
"""

db.fetch_one(sql)

[17]