# QA columns

Run some basic QA checks on the cleaned LIMS CSV files.

In [3]:
import sys
sys.path.append('../../')
import glob
import shutil
from pathlib import Path
import os

import pandas as pd
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    check_duplicate_columns
)

In [4]:
# Root directory of the cleaned CSVs; the metadata 'path' values are joined onto it.
clean_data_path = CLEAN_DATA_DIR

# Metadata CSV whose 'path' column lists the files to inspect.
# 'Micropal_changes.csv' in the same directory is the alternative input; swap the
# filename here to inspect that set instead.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv'


## Check whether any CSV has duplicate column names

In [13]:
def duplicate_columns(directories, file_extension='csv'):
    """Collect duplicate-column reports for every matching file under ``directories``.

    Each directory is searched recursively for ``*.<file_extension>`` files. Every
    file is loaded with pandas, columns that are entirely empty are dropped, and
    the resulting frame is passed to ``check_duplicate_columns`` with its path.

    Args:
        directories: iterable of directory paths to search.
        file_extension: extension (without the dot) of the files to check.

    Returns:
        list containing every truthy result returned by ``check_duplicate_columns``.
    """
    reports = []
    for folder in directories:
        pattern = f"{folder}/**/*.{file_extension}"
        for csv_path in glob.glob(pattern, recursive=True):
            frame = pd.read_csv(csv_path).dropna(axis=1, how='all')
            report = check_duplicate_columns(frame, csv_path)
            if report:
                reports.append(report)
    return reports
                

In [14]:
# Cleaned LIMS directories to scan for duplicate column names.
LIMS_data_paths = [
    CLEAN_DATA_DIR/'LIMS'/folder
    for folder in (
        'Lithology_CSV',
        'Micropal_CSV_1',
        'Micropal_CSV_2',
        'Micropal_CSV_3',
        'Micropal_CSV_4',
        'Micropal_CSV_revised',
    )
]

In [15]:
# Scan every LIMS directory for duplicate columns. The per-file messages in the
# cell output are not printed by this notebook's own code; presumably
# check_duplicate_columns emits them (TODO confirm).
bad_files = duplicate_columns(LIMS_data_paths)

../../output/cleaned_data/LIMS/Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns have different values
../../output/cleaned_data/LIMS/Lithology_CSV/344_sediment_U1412A.csv, Other mineral : duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_1/320_U1334A_Radiolarians_2.csv, Acrocubus octopylus : duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_3/341_benthic_forams_U1417B.csv, Type: duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_4/323_U1344E_diatoms.csv, Thalassiosira hyalina: duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_4/323_U1344E_diatoms.csv, Thalassiosira jouseae: duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_4/323_U1344E_diatoms.csv, Thalassiosira latimarginata: duplicate columns have different values
../../output/cleaned_data/LIMS/Micropal_CSV_4/323_U1344E_diat

In [39]:
# Display the collected reports: one list of report dicts per flagged file.
bad_files

[[{'filename': '../../output/cleaned_data/LIMS/Lithology_CSV/323 Core Description Template_U1341A.csv',
   'bad_column': 'GRAVEL SIZE CLAST.1',
   'same_value': False}],
 [{'filename': '../../output/cleaned_data/LIMS/Lithology_CSV/344_sediment_U1412A.csv',
   'bad_column': 'Other mineral ',
   'same_value': False}],
 [{'filename': '../../output/cleaned_data/LIMS/Micropal_CSV_1/320_U1334A_Radiolarians_2.csv',
   'bad_column': 'Acrocubus octopylus ',
   'same_value': False}],
 [{'filename': '../../output/cleaned_data/LIMS/Micropal_CSV_3/341_benthic_forams_U1417B.csv',
   'bad_column': 'Type.1',
   'same_value': False}]]

## Create a file listing all columns in the LIMS taxa files

In [17]:
# Load the metadata table; its 'path' column is used to locate each cleaned CSV.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,change_file_encoding,remove_empty_rows,remove_spaces_from_columns,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,remove_bad_characters
0,320_U1334_benthic_forams.csv,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,benthic_forams,False,False,False,False,False,False,False
1,361_U1479B_nannofossils.csv,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,nannofossils,False,False,False,False,False,False,False
2,372_U1517C_planktic_forams.csv,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,planktic_forams,False,False,False,False,False,False,False
3,317_U1352_planktic_forams.csv,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,planktic_forams,False,False,True,False,False,False,False
4,323_U1339_palynology.csv,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,palynology,False,False,False,False,False,False,False


In [18]:
# Build one {"path", "column"} record for every non-empty column of every file
# listed in the metadata.
data = []
for path in metadata['path']:
    df = pd.read_csv(clean_data_path/path, dtype=str).dropna(axis="columns", how='all')
    data.extend({"path": path, "column": col} for col in df.columns)

In [19]:
# One row per (file path, column name) record.
all_df = pd.DataFrame(data)
all_df.shape

(6176, 2)

In [20]:
# Persist the column inventory; the "search for column" section reads this same file.
all_df.to_csv('../../output/tmp/LIMS_taxa_4_columns_2020-02-23.csv')

## Search for a column

In [5]:
def find_column_value(all_columns_path, column, copy_files=False):
    """Print the distinct non-null values of ``column`` for every file that has it.

    Args:
        all_columns_path: CSV with ``path`` and ``column`` fields, one row per
            (file, column name) pair. ``path`` is joined onto ``CLEAN_DATA_DIR``.
        column: exact column name to look up.
        copy_files: when True, also copy each matching file into
            ``temp/<column>`` under the current working directory.

    Returns:
        None. The matching files and their values are printed.
    """
    all_df = pd.read_csv(all_columns_path, dtype=str)
    files = list(all_df.loc[all_df['column'] == column, 'path'].unique())

    print(f'"{column}" found in {len(files)} files\n')

    temp_path = Path('temp', column)

    for file in files:
        path = CLEAN_DATA_DIR/file
        df = pd.read_csv(path, dtype=str, usecols=[column])
        # Use the Series returned by dropna(): calling dropna(inplace=True) on
        # df[column] acts on a temporary and is not guaranteed to change df.
        values = df[column].dropna().unique()

        print(file, '\n')
        if len(values) > 0:
            print(' | '.join(values))
            print('----')

        if copy_files:
            # exist_ok avoids a separate existence check before creating the folder.
            temp_path.mkdir(parents=True, exist_ok=True)
            shutil.copy(path, temp_path)


In [10]:


# Column name to look up (exact match against the inventory's 'column' field).
column = 'Other taxa'

# Inventory CSV to search; the commented-out line is an alternative inventory file.
# all_columns_path = '../../output/tmp/all_LIMS_taxa_columns_2020-02-23.csv'
all_columns_path = '../../output/tmp/LIMS_taxa_4_columns_2020-02-23.csv'

# copy_files=False: only print the values, do not copy the matching files.
find_column_value(all_columns_path, column, False)

"Other taxa" found in 1 files

LIMS/Micropal_CSV_4/323_U1339B_nannofossils.csv 

unidentified spec (1)
----


In [4]:
# Load the metadata table again; the following loop iterates over its 'path' column.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,change_file_encoding,remove_empty_rows,remove_spaces_from_columns,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,remove_bad_characters
0,320_U1334_benthic_forams.csv,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,benthic_forams,False,False,False,False,False,False,False
1,361_U1479B_nannofossils.csv,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,nannofossils,False,False,False,False,False,False,False
2,372_U1517C_planktic_forams.csv,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,planktic_forams,False,False,False,False,False,False,False
3,317_U1352_planktic_forams.csv,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,planktic_forams,False,False,True,False,False,False,False
4,323_U1339_palynology.csv,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,palynology,False,False,False,False,False,False,False


In [14]:
# Combine the rows of every file listed in the metadata into one list of
# records, tagging each record with the file it came from.
data = []
for path in metadata['path']:
    # 317_U1351_planktic_forams.csv is read with its second line as the header
    # (header=1); every other file uses the first line.
    header = 1 if '317_U1351_planktic_forams.csv' in str(path) else 0
    df = pd.read_csv(clean_data_path/path, dtype=str, header=header)
    df = df.dropna(axis="columns", how='all')
    # to_dict('records') builds the per-row dicts in one call, replacing the
    # much slower iterrows() + per-column Python loop.
    data.extend({"path": path, **record} for record in df.to_dict('records'))

In [15]:
# Total number of row records collected across all files.
len(data)

6818

In [16]:
# Build one frame from the records; its columns are the union of all record keys,
# and a key missing from a record becomes NaN in that row.
new_df = pd.DataFrame(data)
new_df.shape

(6818, 1712)

In [17]:
# Reorder the columns by sorted column name.
new_df = new_df[sorted(new_df.columns)]


In [18]:
# Write the combined table to disk.
# NOTE(review): the filename says "mircopal", likely a typo for "micropal";
# confirm nothing else reads this path before renaming it.
new_df.to_csv('../../output/tmp/combine_mircopal_4.csv')