# Merge Thinsection Datasets From Multiple Files

## Parameters

In [None]:
# Folder scanned for the per-file thin section CSVs; the merged
# 'dataset.csv' is written back into this same folder.
INPUT_DIRECTORY = "../datasets/MargemEquatorial/backup/"

## Imports

In [None]:
import os
import pandas as pd
import numpy as np
from os.path import isfile, join
from IPython.display import display
from functools import partial


## Gathering files from input directory

In [None]:
# Collect the names of the CSV files sitting directly inside INPUT_DIRECTORY.
# 'dataset.csv' is skipped because it is the merged output this notebook
# writes into the same folder and must not be fed back in as an input.
print('GATHERING FILES at ' + INPUT_DIRECTORY)
# os.listdir() returns entries in arbitrary order; sorting keeps the layout of
# the merged dataset reproducible across runs and machines.
csv_file_names = sorted(
    file_name for file_name in os.listdir(INPUT_DIRECTORY)
    if file_name.endswith('.csv')
    and file_name != 'dataset.csv'
    and isfile(join(INPUT_DIRECTORY, file_name))
)
print('DONE')

## Read all CSV files and put generated DataFrames on a list

In [None]:
print('READING THIN SECTION FILES')


def _to_numeric_or_keep(value):
    """Return *value* converted by ``pd.to_numeric``, or unchanged if it cannot be parsed."""
    # Same outcome as ``pd.to_numeric(value, errors='ignore')`` for the scalar
    # cells handled here, without relying on the ``errors='ignore'`` option,
    # which recent pandas releases deprecate.
    try:
        return pd.to_numeric(value)
    except (ValueError, TypeError):
        return value


# ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in newer pandas; use
# whichever one the installed version provides.
_elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

# Prefix added to a feature name, keyed by how many ' - '-separated
# attributes the name contains. Any other count leaves the name untouched.
_PREFIX_BY_ATTRIBUTE_COUNT = {
    3: '[primary]',
    7: '[diagenetic]',
    6: '[porosity]',
}

csv_data_files = []
for csv_file_name in csv_file_names:
    # A context manager guarantees the handle is closed once the file has been
    # parsed (the file is still opened with the platform default encoding).
    with open(join(INPUT_DIRECTORY, csv_file_name)) as csv_file:
        frame = pd.read_csv(csv_file, index_col=0)

    # Convert every cell that parses as a number; leave the others as read.
    frame = _elementwise(frame, _to_numeric_or_keep)

    # Tag each feature (index label) with its group prefix.
    frame.index = [
        _PREFIX_BY_ATTRIBUTE_COUNT.get(feature.count(' - ') + 1, '') + feature
        for feature in frame.index.values
    ]
    csv_data_files.append(frame)

print('DONE')

## Duplicated features
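Lists, under each file name, every feature that appears more than once in that file. Ideally nothing is listed below any file name.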

In [None]:
# Report every index label that occurs more than once within a single file.
# Each file name is always printed; duplicated labels are listed beneath it,
# tab-indented, one line per occurrence.
print('DUPLICATES:')
df_list = []  # every duplicated label found, across all files
for df, file_name in zip(csv_data_files, csv_file_names):
    print(file_name)
    # Flag all occurrences of repeated labels in a single pass instead of
    # rebuilding a list and re-counting it for every label.
    repeated_labels = df.index.values[df.index.duplicated(keep=False)]
    for index in repeated_labels:
        df_list.append(index)
        print('\t' + str(index))

## Removing duplicates

### Helper that checks whether a value can be interpreted as a number

In [None]:
def is_number(s):
    """Tell whether *s* can be read as a number.

    Returns True when ``float(s)`` succeeds or, failing that, when
    ``unicodedata.numeric(s)`` succeeds; returns False otherwise.
    """
    import unicodedata

    # Try each parser in turn; the first one that accepts the value wins.
    for parse in (float, unicodedata.numeric):
        try:
            parse(s)
        except (ValueError, TypeError):
            continue
        return True
    return False

In [None]:
# Row-wise filter applied to every DataFrame: the callback yields True for the
# row labelled 'petrofacie' and, for any other row, the per-cell result of
# is_number(). The outcome of df.apply() is then used to index df.
# NOTE(review): presumably this blanks the non-numeric cells (as NaN) so that a
# later dropna can discard those rows — confirm with the pandas version in use.
csv_data_files = [df[df.apply(lambda x: x.name == 'petrofacie' or x.apply(is_number), axis=1)] for df in csv_data_files]

In [None]:
# for df in csv_data_files:
#     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#         display(df)

In [None]:
# Discard every row that holds at least one missing value.
csv_data_files = [frame.dropna(how='any', axis='index') for frame in csv_data_files]

In [None]:
# Collapse rows that share the same index label into a single row by summing them.
csv_data_files = [frame.groupby(by=frame.index).sum() for frame in csv_data_files]

In [None]:
# for df in csv_data_files:
#     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#         display(df)

## Merge DataFrames cohesively

In [None]:
# Total number of rows, then total number of columns, summed over all DataFrames.
print(sum(df.shape[0] for df in csv_data_files))
print(sum(df.shape[1] for df in csv_data_files))
print('MERGING DATA')

# Place the DataFrames side by side (axis=1); cells left empty by the merge
# are then set to 0.
# No placeholder DataFrame is assigned beforehand: if concat fails, full_csv
# stays undefined instead of being a silently empty frame that the following
# cells would transpose and save.
full_csv = pd.concat(csv_data_files, axis=1).fillna(value=0)

## Display merged DataFrame

In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
# display(full_csv)

## Eliminate unused features
- Non-numeric features (except petrofacies)

### Apply function to DataFrame

## Transpose dataset
Converts the dataset from one instance per column to one instance per row.

In [None]:
# Swap the rows and columns of the merged table.
full_csv = full_csv.T

In [None]:
# display(full_csv)

## Saves merged DataFrame as CSV file

In [None]:
import csv

# Write the merged table as 'dataset.csv' inside INPUT_DIRECTORY. Non-numeric
# fields are quoted and floats are written with ten decimal places.
output_path = join(INPUT_DIRECTORY, 'dataset.csv')
full_csv.to_csv(
    output_path,
    float_format='%.10f',
    quoting=csv.QUOTE_NONNUMERIC,
    encoding='utf-8',
)

print('DONE!')