# Create CSVs from database data

In [1]:
import sys
sys.path.append('../../')
import glob
import re
import os.path
import hashlib
from pathlib import Path
import datetime

import pandas as pd
import numpy as np
import db as db

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    remove_whitespace,
    normalize_abundance_codes
)
import scripts.normalize_taxa as nt
from scripts.shared_utils import (
    get_taxa_and_taxon_groups,
    create_df_from_db_rows, 
    log_df
)


In [2]:
# Source CSV for this comparison. It is read relative to CLEAN_DATA_DIR further
# down, and the same string is matched against samples.data_source_notes in the
# database query.
file = 'LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv'

## Create new file from the database

In [3]:
# Build a long-format result: one row per (sample, "column name", value).
#   * The first SELECT returns taxon codes from samples_taxa.
#   * The second SELECT returns sample-level values from samples_fields, with
#     '0_' prepended to each field name.
# Both halves are restricted to samples whose data_source_notes equals `file`.
# NOTE(review): `file` is interpolated directly into the SQL text. That is only
# tolerable because it is a constant defined in this notebook; prefer a bound
# parameter if db.fetch_all_dict accepts one (TODO confirm).
sql = f"""
select expeditions.name as "Exp", sites.name as "Site", holes.name as "Hole",
cores.name as "Core", cores.type as "Type", sections.name as "Section", 
samples.name as "Sample", 
samples.top as "Top [cm]", 
samples.bottom as "Bottom [cm]",
samples.top_depth as "Top Depth [m]", 
samples.bottom_depth as "Bottom Depth [m]",
taxa.name as "column name", 
samples_taxa.code as value 
from  samples 
join sections on sections.id  = samples.section_id 
join cores on cores.id  = sections.core_id
join holes on holes.id  = cores.hole_id
join sites on sites.id = holes.site_id
join expeditions on expeditions.id = sites.expedition_id
join samples_taxa on samples_taxa.sample_id = samples.id
join taxa on taxa.id = samples_taxa.taxon_id
where samples.data_source_notes = '{file}'

union

select expeditions.name as "Exp", sites.name as "Site", holes.name as "Hole",
cores.name as "Core", cores.type as "Type", sections.name as "Section", 
samples.name as "Sample", 
samples.top as "Top [cm]", 
samples.bottom as "Bottom [cm]",
samples.top_depth as "Top Depth [m]", 
samples.bottom_depth as "Bottom Depth [m]",
'0_' || fields.name as "column name",
samples_fields.value
from  samples 
join sections on sections.id  = samples.section_id 
join cores on cores.id  = sections.core_id
join holes on holes.id  = cores.hole_id
join sites on sites.id = holes.site_id
join expeditions on expeditions.id = sites.expedition_id
join samples_fields on samples_fields.sample_id = samples.id
join fields on fields.id = samples_fields.field_id
where samples.data_source_notes = '{file}'
;
"""
# Run the query; each returned row is later iterated with .items().
rows = db.fetch_all_dict(sql)


def create_df_from_db_rows(rows):
    """Build an all-string DataFrame from database rows.

    NOTE: this local definition shadows the ``create_df_from_db_rows`` that is
    imported from ``scripts.shared_utils`` at the top of the notebook.

    Parameters
    ----------
    rows : iterable of mappings
        Each row must provide ``.items()`` yielding (column name, value) pairs.

    Returns
    -------
    pandas.DataFrame
        One record per input row. ``None`` and empty strings become ``pd.NA``;
        every other value is converted with ``str``. Falsy-but-real values
        such as ``0`` are kept (as ``"0"``) instead of being treated as missing.
    """

    def _is_missing(value):
        # Only genuine "no data" markers count as missing; a plain truthiness
        # test would also discard 0, 0.0 and False.
        return value is None or (isinstance(value, str) and value == "")

    records = [
        {
            field: pd.NA if _is_missing(value) else str(value)
            for field, value in row.items()
        }
        for row in rows
    ]

    return pd.DataFrame(records, dtype=str)


# Turn the fetched rows into an all-string, long-format DataFrame
# (one row per sample / "column name" pair) and hand it to log_df.
df = create_df_from_db_rows(rows)
log_df(df)

(1063, 13)


Unnamed: 0,Exp,Site,Hole,Core,Type,Section,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],column name,value
0,363,U1482,A,1,H,CC,363-U1482A-1H-CC,0,14,2.84,2.98,Brizalina earlandi,1
1,363,U1482,A,1,H,CC,363-U1482A-1H-CC,0,14,2.84,2.98,Bulimina aculeata,9
2,363,U1482,A,1,H,CC,363-U1482A-1H-CC,0,14,2.84,2.98,Ceratobulimina jonesiana,1
3,363,U1482,A,1,H,CC,363-U1482A-1H-CC,0,14,2.84,2.98,Cibicidoides bradyi,1
4,363,U1482,A,1,H,CC,363-U1482A-1H-CC,0,14,2.84,2.98,Cibicidoides pachyderma,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058,363,U1482,A,9,H,CC,363-U1482A-9H-CC,0,18,79.12,79.3,Planulina wuellerstorfi,2
1059,363,U1482,A,9,H,CC,363-U1482A-9H-CC,0,18,79.12,79.3,Pleurostomella spp.,2
1060,363,U1482,A,9,H,CC,363-U1482A-9H-CC,0,18,79.12,79.3,Pyrgo spp.,4
1061,363,U1482,A,9,H,CC,363-U1482A-9H-CC,0,18,79.12,79.3,Sigmoilopsis schlumbergeri,1


In [4]:
# Columns that identify a sample; they form the pivot index so that each
# sample ends up as a single row.
index_cols = [
    'Exp', 'Site', 'Hole', 'Core', 'Type', 'Section', 'Sample',
    "Top [cm]", "Bottom [cm]", "Top Depth [m]", "Bottom Depth [m]"
]

# Long -> wide: one output column per distinct "column name" value,
# then restore the index columns and order rows by sample name.
db_pivot = (
    df.pivot(index=index_cols, columns='column name', values='value')
    .reset_index()
    .sort_values(['Sample'])
)

# The query prepends '0_' to sample-level field names. Strip only that leading
# marker: a blanket str.replace would also delete '0_' occurring in the middle
# of a column name.
FIELD_PREFIX = '0_'
cols = [
    col[len(FIELD_PREFIX):] if col.startswith(FIELD_PREFIX) else col
    for col in db_pivot.columns
]
db_pivot.columns = cols

log_df(db_pivot)

(57, 69)


Unnamed: 0,Exp,Site,Hole,Core,Type,Section,Sample,Top [cm],Bottom [cm],Top Depth [m],...,Saracenaria volpicelli,Sigmoilopsis schlumbergeri,Sphaeroidina bulloides,Stilostomella spp.,Textularia agglutinans,Tristix sp.,Uvigerina spp.,Vaginulina spp.,Vaginulinopsis spp.,Vulvulina pennatula
1,363,U1482,A,10,H,CC,363-U1482A-10H-CC,0,33,88.63,...,,2.0,,,,,24,,,
2,363,U1482,A,11,H,CC,363-U1482A-11H-CC,0,12,98.0,...,,,,,,,10,,,
3,363,U1482,A,12,H,CC,363-U1482A-12H-CC,0,21,107.63,...,,6.0,1.0,,,,30,,,
4,363,U1482,A,13,H,CC,363-U1482A-13H-CC,0,34,117.18,...,,,,,,,24,,,
5,363,U1482,A,14,H,CC,363-U1482A-14H-CC,0,24,126.63,...,,,,1.0,,,12,,,


In [5]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the tmp output folder exists before writing the database-derived CSV.
new_csv_path = Path(OUTPUT_DIR/'tmp'/'new.csv')
new_csv_path.parent.mkdir(parents=True, exist_ok=True)
db_pivot.to_csv(new_csv_path, index=False)

## Edit original file to match the database export

In [6]:
# Load the cleaned source CSV with every column as a string, then discard
# rows and columns that contain no values at all.
df = (
    pd.read_csv(CLEAN_DATA_DIR / file, dtype=str)
    .dropna(how="all", axis=0)
    .dropna(how="all", axis=1)
)
log_df(df)

(57, 73)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Preservation,Group abundance,Percentage of non-calcareous agglutinated forams in total foram assemblage [%],Anomalinoides globulosus,Bigenerina nodosaria,...,Vaginulina spp.,Vaginulinopsis spp.,Vulvulina pennatula,Exp,Site,Hole,Core,Type,Section,eodp_id
0,363-U1482A-1H-CC,0,14,2.84,2.98,E [P46],R,1,,,...,,,,363,U1482,A,1,H,CC,ec548545f580fd8394585428eec88ad6
1,363-U1482A-2H-CC,0,14,12.53,12.67,E [P46],R,1,,,...,,,,363,U1482,A,2,H,CC,3f4a05429dd4dff0cf98485e487f78b2
2,363-U1482A-3H-CC,0,14,21.9,22.04,E [P46],R,1,,,...,,,,363,U1482,A,3,H,CC,f1699ff6e7353f55d92f41d1a39458d6
3,363-U1482A-4H-CC,0,23,30.46,30.69,E [P46],R,1,,,...,,,,363,U1482,A,4,H,CC,818173d566a0e920c3c1e354c954af03
4,363-U1482A-5H-CC,0,23,40.79,41.02,E [P46],R,1,,,...,,,,363,U1482,A,5,H,CC,d8119ae357cd70aafa4ae3e046ece93e


In [7]:
# Remove the eodp_id column, which has no counterpart in db_pivot.
# drop(..., errors='ignore') keeps this cell re-runnable: `del df['eodp_id']`
# raises KeyError when the cell is executed a second time.
df = df.drop(columns=['eodp_id'], errors='ignore')
df.shape

(57, 72)

In [8]:
# Column names present in the original CSV but absent from the database pivot.
set(df.columns).difference(db_pivot.columns)

{'Group abundance',
 'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
 'Preservation',
 'Pseudoglandulina glanduliformis'}

In [9]:
# Column names present in the database pivot but absent from the original CSV.
set(db_pivot.columns).difference(df.columns)

{'Pseudoglandulina glanduliniformis'}

In [10]:
# Bring the CSV's column names in line with the names used in db_pivot.
df = df.rename(
    columns={
        'Group abundance': 'Group Abundance',
        'Preservation': 'Group Preservation',
        'Pseudoglandulina glanduliformis': 'Pseudoglandulina glanduliniformis',
    }
)

In [11]:
# Keep only the columns that db_pivot has, in db_pivot's column order.
df = df.loc[:, db_pivot.columns]

In [12]:
# Order rows by sample name (db_pivot is sorted on the same column).
df = df.sort_values(by='Sample')
df.shape

(57, 69)

In [13]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the tmp output folder exists before writing the edited original CSV.
original_csv_path = Path(OUTPUT_DIR/'tmp'/'original.csv')
original_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(original_csv_path, index=False)