# QC samples

In [1]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [2]:
# Directories holding the cleaned LIMS Micropal CSV batches checked below.
clean_data_paths = [
    f'../../output/cleaned_data/LIMS/Micropal_CSV_{batch}'
    for batch in (1, 2, 3)
]

## Look for duplicate rows

In [6]:
# Spot-check one file: drop rows that repeat in every column except
# 'eodp_id', then display whatever duplicates remain afterwards.
path = '../../output/cleaned_data/LIMS/Micropal_CSV_3/341_radiolarians_U1421A.csv'
content = pd.read_csv(path)
cols = [name for name in content.columns if name != 'eodp_id']
print(len(cols))

if content.duplicated(subset=cols).any():
    content = content.drop_duplicates(subset=cols)
    # NOTE(review): this prints the column count a second time, not the row
    # count -- confirm whether len(content) was intended.
    print(len(cols))

content[content.duplicated(subset=cols)]
 

125


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name (short),Zone name,Datum name,Preservation,Group Abundance,...,Shore File Links,File Data,Exp,Site,Hole,Core,Type,Section,A/W,eodp_id


In [5]:
# Rewrite every cleaned CSV that contains rows identical in all columns
# except 'eodp_id'; files without such rows are left untouched.
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path, dtype=str)
        cols = [name for name in content.columns if name != 'eodp_id']

        if not content.duplicated(subset=cols).any():
            continue

        # Keep the first occurrence of each repeated row, pass the result
        # through csv_cleanup (from scripts.normalize_data), and save it over
        # the original file.
        content = content.drop_duplicates(subset=cols)
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

## Look for duplicate sample names in all micropal files

In [7]:
# Columns that together identify a sample interval; a row repeating all of
# them (after its first occurrence in the file) is reported as a duplicate.
target_columns = [
    'Sample',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]

# One {'sample', 'path'} record per duplicated row, across every cleaned CSV.
data = []
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path)
        dup_rows = content[content.duplicated(subset=target_columns)]
        data.extend(
            {'sample': sample_name, 'path': path}
            for sample_name in dup_rows['Sample']
        )

In [8]:
# Turn the collected duplicate records into a DataFrame (columns: sample, path)
# and show how many were found.
new_df = pd.DataFrame(data)
new_df.shape

(467, 2)

In [9]:
# Display the duplicate sample names together with the file each came from.
new_df

Unnamed: 0,sample,path
0,363-U1487A-4H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
1,363-U1487A-5H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
2,363-U1483A-1H-1-A 0/0-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
3,363-U1483A-9H-6-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
4,363-U1483A-11H-2-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
...,...,...
462,321-U1337A-32X-4-W,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
463,321-U1337A-32X-4-W,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
464,342-U1409A-1H-CC-PAL,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
465,342-U1409A-2H-CC-PAL,../../output/cleaned_data/LIMS/Micropal_CSV_3/...


In [10]:
# Save the duplicate-sample report; with the default to_csv settings the
# DataFrame index is written as the first column.
new_df.to_csv('../../output/tmp/dup_sample_names.csv')

## import all samples into db

In [37]:
import psycopg2
import math

In [38]:
def connect():
    """Open and return a new psycopg2 connection to the local ``eodp_dev`` database."""
    settings = {
        "host": "localhost",
        "database": "eodp_dev",
        "user": "wyk",
        "password": "",
    }
    return psycopg2.connect(**settings)

In [39]:
# Stage every sample row from the cleaned Micropal CSVs in staging.samples.
#
# All inserts run in one transaction that is rolled back unless COMMIT_INSERTS
# is set to True, so the cell is a dry run by default. Rows whose 'Sample' is
# not a string or whose 'Top [cm]' is not numeric are printed instead of
# inserted.
COMMIT_INSERTS = False

# Values are bound by psycopg2 instead of being formatted into the statement,
# so sample names containing quotes cannot break (or inject into) the SQL.
INSERT_SAMPLE_SQL = (
    "INSERT INTO staging.samples "
    "(name, top, bottom, top_depth, bottom_depth, created_at, data_source_notes) "
    "VALUES (%s, %s, %s, %s, %s, now(), %s);"
)


def _null_if_nan(value):
    """Return None for a float NaN (bound as SQL NULL); otherwise return value."""
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


conn = connect()
try:
    with conn.cursor() as cursor:
        for clean_data_path in clean_data_paths:
            raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

            for path in raw_csvs:
                # Record the CSV's own file name as the data source note;
                # path.split('/')[2] always produced 'output' for these
                # '../../output/...' paths.
                filename = Path(path).name
                content = pd.read_csv(path)

                for index, row in content.iterrows():
                    top = row['Top [cm]']
                    if isinstance(row['Sample'], str) and isinstance(top, (int, float)):
                        cursor.execute(
                            INSERT_SAMPLE_SQL,
                            (
                                row['Sample'].strip(),
                                # A missing top offset is stored as 0.
                                0 if math.isnan(top) else top,
                                # Formatting NaN into the SQL text produced a
                                # bare `nan` token; bind NULL instead.
                                _null_if_nan(row['Bottom [cm]']),
                                _null_if_nan(row['Top Depth [m]']),
                                _null_if_nan(row['Bottom Depth [m]']),
                                filename,
                            ),
                        )
                    else:
                        print(row['Sample'], row['Top [cm]'], row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path )

    if COMMIT_INSERTS:
        conn.commit()
        print('done')
    else:
        conn.rollback()
finally:
    # Always release the connection, including when an insert fails.
    conn.close()


# check taxa file

In [3]:
import sys
sys.path.append('../../')
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR


In [137]:
# Date stamp embedded in the names of the taxa files to check.
date = '2022-04-28'

# Both files live in the same LIMS taxa directory under OUTPUT_DIR.
lims_taxa_dir = OUTPUT_DIR / 'taxa' / 'LIMS'
taxa_list_file = lims_taxa_dir / f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = lims_taxa_dir / f"taxa_crosswalk_{date}.csv"


In [138]:
# Load the normalized_name and taxon_group columns of the taxa list so they
# can be checked for duplicate pairs.
df = pd.read_csv(taxa_list_file, usecols=['normalized_name', 'taxon_group'])
df.shape
# 4678 rows when this cell was last run

(4678, 2)

In [139]:
# Rows whose (normalized_name, taxon_group) pair repeats an earlier row.
df.loc[df.duplicated(subset=['normalized_name', 'taxon_group'])]

Unnamed: 0,normalized_name,taxon_group


In [146]:
# Load the crosswalk columns that are compared against the database further down.
df2 = pd.read_csv(taxa_crosswalk_list_file, usecols=['normalized_name', 'taxon_group', 'verbatim_name','eodp_id'])
df2.shape
# 5267 rows when this cell was last run

(5267, 4)

In [153]:
# Crosswalk rows whose (normalized_name, taxon_group, verbatim_name) triple
# repeats an earlier row.
df2.loc[df2.duplicated(subset=['normalized_name', 'taxon_group', 'verbatim_name'])]

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id


In [148]:
import psycopg2
import psycopg2.extras

# Fetch every taxa_crosswalk row joined to its taxon from the local dev
# database, so the result can be compared with the crosswalk CSV.
DB_NAME = "eodp_dev"
DB_USER = "wyk"
DB_PASS = ""
DB_HOST = "localhost"
DB_PORT = "5432"

# NOTE(review): SELECT * over a join returns the columns of both tables; if
# they share a column name, the name-based lookups below are ambiguous --
# confirm against the schema.
query = """
SELECT * FROM taxa_crosswalk 
JOIN taxa on taxa.id = taxa_crosswalk.taxon_id 

"""

conn = psycopg2.connect(database=DB_NAME,
                        user=DB_USER,
                        password=DB_PASS,
                        host=DB_HOST,
                        port=DB_PORT)
try:
    # DictCursor lets the fetched rows be indexed by column name.
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query)
        rows = cur.fetchall()
finally:
    # Close the connection even if the query fails.
    conn.close()

# Rename the database columns to the headers used in the crosswalk CSV.
data = [
    {
        'normalized_name': row['name'],
        'taxon_group': row['taxon_group'],
        'verbatim_name': row['original_name'],
        'eodp_id': row['eodp_id'],
    }
    for row in rows
]

In [149]:
# Number of crosswalk rows fetched from the database.
len(data)

5266

In [150]:
# Put the database rows in a DataFrame so they can be compared with df2.
db_df = pd.DataFrame(data)
db_df.head()

Unnamed: 0,normalized_name,taxon_group,verbatim_name,eodp_id
0,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,0
1,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,1
2,Foraminifera indet.,benthic_forams,Others,2
3,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,3
4,Ostracoda indet.,benthic_forams,Ostracoda spp.,4


In [151]:
# eodp_ids present in the crosswalk CSV but absent from the database rows.
set(df2['eodp_id']).difference(db_df['eodp_id'])

{3680}

In [152]:
# eodp_ids present in the database rows but absent from the crosswalk CSV.
set(db_df['eodp_id']).difference(df2['eodp_id'])

set()