# Find duplicate sample names

In [1]:
import glob
import os
import sys

import pandas as pd

sys.path.append('../scripts/')
from normalize_data import (
    csv_cleanup
)

## Look for duplicate samples in one file

In [2]:
# Inspect one micropal file for sample names that occur more than once.
path = 'cleaned_data/Micropal_CSV_1/363-U1482A-nannofossils.csv'
content = pd.read_csv(path)

# Only the 'Sample' column is compared; a wider subset that was considered:
# ['Sample', 'Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]']
dups = content.duplicated(subset=['Sample'])

# Display the sample name of every row flagged as a repeat.
content.loc[dups, 'Sample']

53     363-U1482A-14H-CC-PAL-NANNO
58     363-U1482A-15H-CC-PAL-NANNO
63     363-U1482A-16H-CC-PAL-NANNO
68     363-U1482A-17H-CC-PAL-NANNO
70     363-U1482A-18H-CC-PAL-nanno
72     363-U1482A-19H-CC-PAL-NANNO
74     363-U1482A-20H-CC-PAL-NANNO
79     363-U1482A-21H-CC-PAL-NANNO
80     363-U1482A-21H-CC-PAL-NANNO
81     363-U1482A-21H-CC-PAL-NANNO
90     363-U1482A-23H-CC-PAL-NANNO
92     363-U1482A-24H-CC-PAL-NANNO
94     363-U1482A-25H-CC-PAL-NANNO
96     363-U1482A-26H-CC-PAL-NANNO
98     363-U1482A-27H-CC-PAL-NANNO
106    363-U1482A-32H-CC-PAL-NANNO
108    363-U1482A-33H-CC-PAL-NANNO
110    363-U1482A-34H-CC-PAL-NANNO
112    363-U1482A-35H-CC-PAL-NANNO
114    363-U1482A-36H-CC-PAL-NANNO
116    363-U1482A-37H-CC-PAL-NANNO
123    363-U1482A-43F-CC-PAL-NANNO
125    363-U1482A-44F-CC-PAL-NANNO
127    363-U1482A-45F-CC-PAL-NANNO
129    363-U1482A-46X-CC-PAL-NANNO
130    363-U1482A-46X-CC-PAL-NANNO
131    363-U1482A-46X-CC-PAL-NANNO
133    363-U1482A-47X-CC-PAL-NANNO
134    363-U1482A-47

## Look for duplicate samples in all micropal files

In [3]:
# Directories holding the cleaned micropal CSV exports (parts 1 to 3).
clean_data_paths = [f'cleaned_data/Micropal_CSV_{part}' for part in (1, 2, 3)]

In [4]:
# Collect one {'sample', 'path'} record for every repeated sample name found
# in any CSV under the cleaned micropal directories.
data = []
for clean_data_path in clean_data_paths:
    raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

    for path in raw_csvs:
        content = pd.read_csv(path)

        # duplicated() marks the second and later occurrences of each name.
        dups = content.duplicated(subset=['Sample'])
        new_df = content[dups][['Sample']]
        data.extend(
            {'sample': sample_name, 'path': path}
            for sample_name in new_df['Sample']
        )

In [5]:
# Persist the collected duplicate records (row index included) as a CSV report.
new_df = pd.DataFrame(data)
new_df.to_csv(
    'cleaned_data/csvs_with_duplicate_samples.csv',
    index=True,
)

## import all samples into db

In [None]:
import psycopg2
import math

In [115]:
# Cleaned micropal CSV directories whose samples get loaded into the database.
clean_data_paths = [
    'cleaned_data/Micropal_CSV_' + str(part) for part in range(1, 4)
]

In [116]:
def connect(host="localhost", database="eodp_dev", user="wyk", password=""):
    """Open a new psycopg2 connection to the PostgreSQL database.

    All arguments default to the values that used to be hard-coded here, so
    ``connect()`` with no arguments behaves exactly as before. Pass overrides
    to target a different server, database or account without editing this
    function.

    Args:
        host: Database server host name.
        database: Name of the database to connect to.
        user: User name to authenticate as.
        password: Password for ``user``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for committing and closing it.
    """
    return psycopg2.connect(
        host=host,
        database=database,
        user=user,
        password=password,
    )

In [118]:
# Load every usable sample row from the cleaned micropal CSVs into
# staging.samples.
#
# Values are handed to psycopg2 as query parameters rather than formatted
# into the SQL text, so a sample name containing a quote can neither break
# nor inject into the statement.
insert_sample_sql = (
    "INSERT INTO staging.samples "
    "(name, top, bottom, top_depth, bottom_depth, created_at, data_source_notes) "
    "VALUES (%s, %s, %s, %s, %s, now(), %s);"
)


def _null_if_nan(value):
    """Return None for a float NaN (an empty CSV cell), else the value unchanged."""
    return None if isinstance(value, float) and math.isnan(value) else value


conn = connect()
try:
    cursor = conn.cursor()

    for clean_data_path in clean_data_paths:
        raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

        for path in raw_csvs:
            # basename does not depend on how many directory levels precede
            # the file or on which path separator glob returned.
            filename = os.path.basename(path)
            content = pd.read_csv(path)

            for index, row in content.iterrows():
                top = row['Top [cm]']
                # A row is importable when it has a textual sample name and a
                # numeric (possibly NaN) top offset; anything else is reported.
                if type(row['Sample']) is str and type(top) in (int, float):
                    sample = row['Sample'].strip()
                    # A missing top offset is stored as 0.
                    top = 0 if math.isnan(top) else top
                    cursor.execute(
                        insert_sample_sql,
                        (
                            sample,
                            top,
                            # Missing values become SQL NULL; the bare token
                            # `nan` is not a valid literal in the statement.
                            _null_if_nan(row['Bottom [cm]']),
                            _null_if_nan(row['Top Depth [m]']),
                            _null_if_nan(row['Bottom Depth [m]']),
                            filename,
                        ),
                    )
                else:
                    print(row['Sample'], row['Top [cm]'], row['Bottom [cm]'], row['Top Depth [m]'], row['Bottom Depth [m]'], path)

    # Commit once, after all files were processed without error.
    conn.commit()
finally:
    # Always release the connection, even when an insert fails part-way
    # (closing without commit discards the uncommitted inserts).
    conn.close()
print('done')


nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359D_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359D_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1359C_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/320_U1331C_Radiolarians_3.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1360A_Diatoms_2.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal_CSV_1/318_U1358A_Diatoms_1.csv
nan nan nan nan nan cleaned_data/Micropal

## Reading CSV

Test various settings for reading CSV files:
https://stackoverflow.com/a/47368368
https://stackoverflow.com/a/36909497


In [2]:
# Candidate input files, each annotated with the quirk it exhibits.
# `path` is reassigned on every line, so only the LAST assignment in this
# cell is in effect afterwards; reorder the lines to test a different file.

# top and bottom are int
path = 'raw_data/DESC-Lithology-CSV/376_macroscopic_U1527C_2.csv'

# top, bottom, top depth are a mixture of ints and floats
path = 'raw_data/DESC-Lithology-CSV/330_sediment_U1373A.csv'

# file contains "No data this hole"
path = 'raw_data/DESC-Lithology-CSV/329_sediment_U1369D.csv'

# blank columns with no header or data
path = 'raw_data/DESC-Lithology-CSV/329_sediment_U1368D.csv'

# blank rows with no data
path = 'cleaned_data/Micropal_CSV_3/341_planktic_forams_U1417B.csv'

# pandas will add extra decimal places
path = 'raw_data/DESC-Lithology-CSV/320 Core Description_U1336B.csv'

# top and bottom are int or null, top depth and bottom depth are int or float
path = 'raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv'

In [3]:
# Output switch for the read/write experiments below.
# True: each experiment writes a numbered copy next to the input (`<path>1.csv`, ...).
# False: each experiment writes back to `path` itself, overwriting the input file.
new_files = False

original and 1 are different

In [4]:
# 1. Default read_csv settings.
#    Observation: pandas adds extra decimal places to floats and converts
#    integers to floats.
df = pd.read_csv(path)

if new_files:
    output = f'{path}1.csv'
else:
    output = path
df.to_csv(output, index=False)

1 and 2 are same

In [5]:
# 2. float_precision='round_trip'.
#    Observation: prevents the random extra decimal positions being added.
df = pd.read_csv(path, float_precision='round_trip')

if new_files:
    output = f'{path}2.csv'
else:
    output = path
df.to_csv(output, index=False)

2 and 3 are different

In [6]:
# 3. na_filter=False.
#    Observation: prevents integers being converted to floats when a column
#    contains NAs.
df = pd.read_csv(path, na_filter=False)

if new_files:
    output = f'{path}3.csv'
else:
    output = path
df.to_csv(output, index=False)

In [7]:
# 4. Default read followed by csv_cleanup.
df = csv_cleanup(pd.read_csv(path), path)

if new_files:
    output = f'{path}4.csv'
else:
    output = path
df.to_csv(output, index=False)

In [8]:
# 5. float_precision='round_trip' read followed by csv_cleanup.
df = csv_cleanup(pd.read_csv(path, float_precision='round_trip'), path)

if new_files:
    output = f'{path}5.csv'
else:
    output = path
df.to_csv(output, index=False)

In [9]:
# 6. round_trip precision plus na_filter=False, followed by csv_cleanup.
df = csv_cleanup(
    pd.read_csv(path, float_precision='round_trip', na_filter=False),
    path,
)

if new_files:
    output = f'{path}6.csv'
else:
    output = path
df.to_csv(output, index=False)

In [10]:
# 7. Read only the four position/depth columns as strings.
df = pd.read_csv(
    path,
    dtype=dict.fromkeys(
        ['Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]'],
        str,
    ),
)

if new_files:
    output = f'{path}7.csv'
else:
    output = path
df.to_csv(output, index=False)

In [11]:
# 8. Read every column of the dataframe as strings.
df = pd.read_csv(path, dtype=str)

if new_files:
    output = f'{path}8.csv'
else:
    output = path
df.to_csv(output, index=False)

In [12]:
# Print the shell commands that diff each experiment's output against the
# previous one; the first command compares against the original file.
for i in range(8):
    older = path if i == 0 else f'{path}{i}.csv'
    newer = f'{path}{i + 1}.csv'
    print(f'diff notebooks/{older} notebooks/{newer}')

diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv1.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv1.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv2.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv2.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv3.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv3.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv4.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv4.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv5.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv5.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv6.csv
diff notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv6.csv notebooks/raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv7.csv
diff notebooks/raw_data/DESC-Lit

## read and write every LIMS file

In [13]:
# Cleaned-data directories.
# NOTE: these four names and the `directories` list built from them are
# reassigned by the raw-data group further down in this same cell, so as
# written this first group has no effect once the cell finishes.
lithology = 'cleaned_data/Lithology_CSV'
micropal_1 = 'cleaned_data/Micropal_CSV_1'
micropal_2 = 'cleaned_data/Micropal_CSV_2'
micropal_3 = 'cleaned_data/Micropal_CSV_3'

directories = [lithology, micropal_1, micropal_2, micropal_3]

# Raw-data directories. Assigned last, so these are the values that
# `directories` holds after the cell has run.
lithology = 'raw_data/DESC-Lithology-CSV'
micropal_1 = 'raw_data/DESC Micropal CSV 1'
micropal_2 = 'raw_data/DESC Micropal CSV 2'
micropal_3 = 'raw_data/DESC Micropal CSV 3'

directories = [lithology, micropal_1, micropal_2, micropal_3]

In [14]:
# Read every CSV in the configured directories with all columns as strings,
# run csv_cleanup on it, and write the result back over the same file.
for directory in directories:
    paths = glob.glob(directory + "/*.csv")

    for path in paths:
        df = csv_cleanup(pd.read_csv(path, dtype=str), path)
        df.to_csv(path, index=False)

In [17]:
# Same string-typed read -> csv_cleanup -> overwrite round trip, for one file.
path = 'raw_data/DESC-Lithology-CSV/342_sediment_U1406C.csv'

df = csv_cleanup(pd.read_csv(path, dtype=str), path)
df.to_csv(path, index=False)