# Find duplicate sample names

In [1]:
import glob
import os

import pandas as pd

## Look for duplicate samples in one file

In [2]:
# Spot-check one nannofossil file: list every 'Sample' value that repeats an
# earlier row of the same file.
path = 'cleaned_data/Micropal_CSV_1/363-U1482A-nannofossils.csv'
content = pd.read_csv(path)

# cols = ['Sample', 'Top [cm]', 'Bottom [cm]', 'Top Depth [m]','Bottom Depth [m]']
# Boolean mask: True for each row whose 'Sample' already appeared above it.
dups = content.duplicated(subset=['Sample'])
content.loc[dups, 'Sample']

53     363-U1482A-14H-CC-PAL-NANNO
58     363-U1482A-15H-CC-PAL-NANNO
63     363-U1482A-16H-CC-PAL-NANNO
68     363-U1482A-17H-CC-PAL-NANNO
70     363-U1482A-18H-CC-PAL-nanno
72     363-U1482A-19H-CC-PAL-NANNO
74     363-U1482A-20H-CC-PAL-NANNO
79     363-U1482A-21H-CC-PAL-NANNO
80     363-U1482A-21H-CC-PAL-NANNO
81     363-U1482A-21H-CC-PAL-NANNO
90     363-U1482A-23H-CC-PAL-NANNO
92     363-U1482A-24H-CC-PAL-NANNO
94     363-U1482A-25H-CC-PAL-NANNO
96     363-U1482A-26H-CC-PAL-NANNO
98     363-U1482A-27H-CC-PAL-NANNO
106    363-U1482A-32H-CC-PAL-NANNO
108    363-U1482A-33H-CC-PAL-NANNO
110    363-U1482A-34H-CC-PAL-NANNO
112    363-U1482A-35H-CC-PAL-NANNO
114    363-U1482A-36H-CC-PAL-NANNO
116    363-U1482A-37H-CC-PAL-NANNO
123    363-U1482A-43F-CC-PAL-NANNO
125    363-U1482A-44F-CC-PAL-NANNO
127    363-U1482A-45F-CC-PAL-NANNO
129    363-U1482A-46X-CC-PAL-NANNO
130    363-U1482A-46X-CC-PAL-NANNO
131    363-U1482A-46X-CC-PAL-NANNO
133    363-U1482A-47X-CC-PAL-NANNO
134    363-U1482A-47

## Look for duplicate samples in all micropal files

In [3]:
# The three folders of cleaned micropal CSV files to scan.
clean_data_paths = [
    f'cleaned_data/Micropal_CSV_{folder_number}' for folder_number in range(1, 4)
]

In [4]:
# Scan every CSV in the cleaned-data folders and record each sample name that
# repeats an earlier row of the same file. `data` ends up as a list of
# {'sample': <name>, 'path': <csv path>} records, one per repeated row.
data = []
for clean_data_path in clean_data_paths:
    # Sorted so the records come out in the same order on every run;
    # glob itself returns files in arbitrary order.
    raw_csvs = sorted(glob.glob(f"{clean_data_path}/*.csv"))

    for path in raw_csvs:
        content = pd.read_csv(path)

        # Rows without a sample name load as NaN, and duplicated() counts NaN
        # values as equal to each other, so they would be reported as
        # "duplicate samples". Drop them before comparing names.
        named = content.dropna(subset=['Sample'])
        dups = named.duplicated(subset=['Sample'])
        data.extend(
            {'sample': sample, 'path': path} for sample in named.loc[dups, 'Sample']
        )

In [5]:
# Turn the collected {'sample', 'path'} records into a table and save it
# (row index included) next to the cleaned data.
duplicates_report_path = 'cleaned_data/csvs_with_duplicate_samples.csv'
new_df = pd.DataFrame(data)
new_df.to_csv(duplicates_report_path, index=True)

## Import all samples into the database

In [None]:
import psycopg2
import math

In [115]:
# Folders of cleaned micropal CSV files to import. Same three values as the
# list in the duplicate-search section, declared again for this section.
clean_data_paths = [
    f'cleaned_data/Micropal_CSV_{folder_number}' for folder_number in range(1, 4)
]

In [116]:
def connect(host="localhost", database="eodp_dev", user="wyk", password=""):
    """Open a new psycopg2 database connection.

    Called with no arguments this connects with the same settings as before
    (local ``eodp_dev`` database, user ``wyk``, empty password). Pass keyword
    arguments to target another server, database or account without editing
    this cell, and to keep a real password out of the notebook source.

    Args:
        host: Database server host name.
        database: Name of the database to connect to.
        user: Database user name.
        password: Password for ``user``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for committing and closing it.
    """
    return psycopg2.connect(
        host=host,
        database=database,
        user=user,
        password=password,
    )

In [118]:
# Load the samples from every cleaned CSV into staging.samples.
#
# A row is inserted when 'Sample' is a string and 'Top [cm]' is an int/float;
# any other row is printed together with its file path for manual review.
# All inserts share one transaction: it is committed only if every file loads,
# and rolled back (and the connection closed) if anything fails.

# Values are passed as query parameters rather than formatted into the SQL
# text, so quotes in a sample name or file name cannot break or alter the
# statement.
INSERT_SAMPLE_SQL = (
    "INSERT INTO staging.samples "
    "(name, top, bottom, top_depth, bottom_depth, created_at, data_source_notes) "
    "VALUES (%s, %s, %s, %s, %s, now(), %s);"
)


def _nan_to_null(value):
    """Return None (sent as SQL NULL) for a float NaN; any other value unchanged."""
    return None if isinstance(value, float) and math.isnan(value) else value


conn = connect()
try:
    cursor = conn.cursor()

    for clean_data_path in clean_data_paths:
        # Sorted so files are imported in the same order on every run.
        raw_csvs = sorted(glob.glob(f"{clean_data_path}/*.csv"))

        for path in raw_csvs:
            # basename instead of split('/')[2]: independent of folder depth
            # and of the path separator glob returns.
            filename = os.path.basename(path)
            content = pd.read_csv(path)

            for _, row in content.iterrows():
                sample = row['Sample']
                top = row['Top [cm]']

                if type(sample) is str and type(top) in (int, float):
                    cursor.execute(
                        INSERT_SAMPLE_SQL,
                        (
                            sample.strip(),
                            0 if math.isnan(top) else top,  # a missing top is stored as 0
                            # Missing values in the remaining numeric columns
                            # become NULL; a bare `nan` token is not a valid
                            # SQL literal.
                            _nan_to_null(row['Bottom [cm]']),
                            _nan_to_null(row['Top Depth [m]']),
                            _nan_to_null(row['Bottom Depth [m]']),
                            filename,
                        ),
                    )
                else:
                    print(row['Sample'], row['Top [cm]'], row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path )

    conn.commit()
except Exception:
    # Leave the table untouched when any file or row fails part-way through.
    conn.rollback()
    raise
finally:
    conn.close()
print('done')


nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359D_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359D_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359C_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal