# QC samples

In [14]:
import sys
sys.path.append('../../')
import pandas as pd
import glob
from pathlib import Path


from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [15]:
# Directories (as relative paths) that hold the cleaned LIMS micropal CSV files
# scanned by the QC cells below.
clean_data_paths = [
    '../../output/cleaned_data/LIMS/Micropal_CSV_1',
    '../../output/cleaned_data/LIMS/Micropal_CSV_2',
    '../../output/cleaned_data/LIMS/Micropal_CSV_3',
]

## Look for duplicate rows

In [6]:
path = '../../output/cleaned_data/LIMS/Micropal_CSV_3/341_radiolarians_U1421A.csv'
content = pd.read_csv(path)

# Rows are compared on every column except 'eodp_id', so rows that differ only
# in that column still count as duplicates.
cols = [col for col in content.columns if col != 'eodp_id']
print(len(cols))

# Capture the repeated rows *before* removing them: once drop_duplicates() has
# run, duplicated() on the result is always empty and there is nothing to show.
duplicate_rows = content[content.duplicated(subset=cols)]
if not duplicate_rows.empty:
    # Reassign instead of mutating in place so re-running the cell is safe.
    content = content.drop_duplicates(subset=cols)
    # Number of rows removed (first occurrences are kept).
    print(len(duplicate_rows))

# Cell output: the duplicate rows that were found (empty when the file is clean).
duplicate_rows
 

125


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name (short),Zone name,Datum name,Preservation,Group Abundance,...,Shore File Links,File Data,Exp,Site,Hole,Core,Type,Section,A/W,eodp_id


In [5]:
# Rewrite every cleaned CSV that contains repeated rows. 'eodp_id' is ignored
# when deciding whether two rows are the same.
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path, dtype=str)
        cols = [col for col in content.columns if col != 'eodp_id']

        # Files without repeated rows are left untouched on disk.
        if not content.duplicated(subset=cols).any():
            continue

        content = content.drop_duplicates(subset=cols)
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

## Look for duplicate sample names in all micropal files

In [7]:
# Columns that together identify a sample interval; rows repeating all five
# values are reported as duplicates.
target_columns = [
    'Sample',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]

# One record per repeated row: the sample name plus the file it was found in.
data = []
for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        content = pd.read_csv(path)

        # duplicated() flags the second and later occurrences only.
        repeated = content[content.duplicated(subset=target_columns)]
        data.extend(
            {'sample': sample_name, 'path': path}
            for sample_name in repeated['Sample']
        )

In [8]:
# Turn the collected duplicate records into a DataFrame and show its
# (rows, columns) shape.
new_df = pd.DataFrame(data)
new_df.shape

(467, 2)

In [9]:
# Display the duplicate-sample records (sample name and source file path).
new_df

Unnamed: 0,sample,path
0,363-U1487A-4H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
1,363-U1487A-5H-CC-PAL-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
2,363-U1483A-1H-1-A 0/0-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
3,363-U1483A-9H-6-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
4,363-U1483A-11H-2-W 50/50-NANNO,../../output/cleaned_data/LIMS/Micropal_CSV_1/...
...,...,...
462,321-U1337A-32X-4-W,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
463,321-U1337A-32X-4-W,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
464,342-U1409A-1H-CC-PAL,../../output/cleaned_data/LIMS/Micropal_CSV_3/...
465,342-U1409A-2H-CC-PAL,../../output/cleaned_data/LIMS/Micropal_CSV_3/...


In [10]:
# Save the duplicate-sample records as CSV; with to_csv's defaults the
# DataFrame index is written as an extra first column.
new_df.to_csv('../../output/tmp/dup_sample_names.csv')

## Import all samples into the database

In [142]:
import psycopg2
import math

In [143]:
# Directories scanned by the database import. Note that these relative paths
# replace the '../../output/cleaned_data/LIMS/...' list defined earlier in the
# notebook.
clean_data_paths = [
    'cleaned_data/Micropal_CSV_1',
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/Micropal_CSV_3',
]

In [144]:
def connect():
    """Open a new psycopg2 connection to the samples database.

    Connection settings are read from the standard PostgreSQL environment
    variables (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) so that credentials do
    not have to be written into the notebook. Any variable that is unset falls
    back to the previous hard-coded local development value.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    import os  # local import keeps this cell self-contained

    return psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "eodp_dev"),
        user=os.environ.get("PGUSER", "wyk"),
        password=os.environ.get("PGPASSWORD", ""),
    )

In [145]:
conn = connect()
cursor = conn.cursor()

# Placeholders let the driver quote every value, so a sample or file name that
# contains a quote can neither break nor inject into the statement.
insert_sql = (
    "INSERT INTO staging.samples "
    "(name, top, bottom, top_depth, bottom_depth, created_at, data_source_notes) "
    "VALUES (%s, %s, %s, %s, %s, now(), %s);"
)


def _null_if_nan(value):
    """Return None for a float NaN (pandas' missing-value marker), else the value unchanged.

    None is sent to the database as SQL NULL; interpolating NaN into the SQL
    text would have produced the bare word ``nan``.
    """
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


for clean_data_path in clean_data_paths:
    for path in glob.glob(f"{clean_data_path}/*.csv"):
        # Base name of the file, independent of how deep the directory is.
        filename = Path(path).name
        content = pd.read_csv(path)

        for index, row in content.iterrows():
            sample = row['Sample']
            top = row['Top [cm]']

            # Only rows with a text sample name and a plain int/float top
            # (NaN included) are imported; everything else is printed for review.
            if type(sample) is not str or type(top) not in (int, float):
                print(sample, top, row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path)
                continue

            cursor.execute(
                insert_sql,
                (
                    sample.strip(),
                    0 if math.isnan(top) else top,  # a missing top is stored as 0
                    _null_if_nan(row['Bottom [cm]']),
                    _null_if_nan(row['Top Depth [m]']),
                    _null_if_nan(row['Bottom Depth [m]']),
                    filename,
                ),
            )

# Nothing is persisted until the transaction is committed. Uncomment the lines
# below to save the inserts and release the connection.
# conn.commit()
# conn.close()
# print('done')
