# Formatting

In [None]:
# import pandas as pd
# pd.options.display.max_columns = None
# pd.options.display.max_rows = None

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
# display(HTML("<style>.pre { width:100% !important; }</style>"))

# Options

In [None]:
# Directory that holds the input `dataset.csv`; the subtotal-augmented CSV is
# written back into the same directory.
TARGET_PATH = "../datasets/Petrofácies (compilação de dados do De Ros)/Tabelas Petrográficas Margem Equatorial/"
# Language suffix selecting the `VALUE_<IDIOM>` column of the diagenetic
# location mapping table.
IDIOM = "ENUS"
# When True, the columns summed into a subtotal are dropped from the result;
# when False, the subtotal columns are added alongside them.
REPLACE_SUBTOTALS = False

# Data Collection

## Dataset files

In [None]:
import pandas as pd
from IPython.display import display
from os.path import join

# Load the source table from the configured directory.
dataset_path = join(TARGET_PATH, 'dataset.csv')
dataset = pd.read_csv(dataset_path, delimiter=',')

# Lower-case every header; the lower-cased names are kept as `feature_names`
# and also become the DataFrame's column labels.
feature_names = [column.lower() for column in dataset.columns.values]
dataset.columns = feature_names

# Independent deep copy that receives the subtotal columns, so `dataset`
# itself keeps only the original columns.
result_dataset = dataset.copy(deep=True)

## Attribute separation

In [None]:
import pandas as pd
import numpy as np
import re

# Split every header on ' - ' into its attribute parts and right-pad each list
# with NaN up to 7 entries so that all lists have the same length.
padded_parts = {}
for position, column_name in enumerate(feature_names):
    parts = column_name.split(' - ')
    # A non-positive repeat count yields an empty list, so lists that already
    # hold 7 or more parts are left unchanged.
    parts.extend([np.nan] * (7 - len(parts)))
    padded_parts[position] = parts

# One row per dataset column (row label = column position in `feature_names`),
# one column per attribute slot.
attributes = pd.DataFrame(padded_parts).T

def extract_compositional_type(s):
    """Classify a feature name by its number of ' - '-separated attributes.

    Returns 'primary' for 3 attributes, 'porosity' for 6, 'diagenetic' for 7,
    and an empty string for any other count.
    """
    type_by_attribute_count = {3: 'primary', 6: 'porosity', 7: 'diagenetic'}
    return type_by_attribute_count.get(s.count(' - ') + 1, '')

# Tag every dataset column with its compositional type, derived from the
# number of attributes in its header.
compositional_type = pd.Series([extract_compositional_type(name) for name in feature_names])

# Strip bracketed text from the first attribute of every header. The pattern is
# a regular expression, so `regex=True` is passed explicitly: since pandas 2.0
# `Series.str.replace` treats the pattern as a literal string by default, which
# would silently leave the bracketed text in place. The raw string avoids the
# invalid `\[` / `\]` escape sequences of a plain string literal.
attributes[0] = attributes[0].str.replace(r"\[.*\]", "", regex=True)

# Prepend the type as column -1 so the attribute slots keep their 0..6 labels.
attributes.insert(0, -1, compositional_type)

# Subtotals grouping

In [None]:
def _read_lowercased_table(path):
    """Read a comma-separated table and return it with every cell as a lower-case string.

    Cells are converted with `astype(str)` first, so missing values become the
    text 'nan'. Column headers are left as they are in the file.
    """
    table = pd.read_csv(path, delimiter=',')
    return table.apply(lambda column: column.astype(str).str.lower())


# Lookup tables for diagenetic and pore locations.
diagenese_mapping = _read_lowercased_table('./subtotals_instructive_tables/Categorias de Localização Diagenética revDeRos.csv')
pore_mapping = _read_lowercased_table('./subtotals_instructive_tables/Categorias de Localização de Poros revDeRos.csv')

# Location values that are used directly as a macro location, without going
# through the mapping table.
macro_locations = ['interstitial', 'framework', 'framework and interstitial']

## Primary Subtotals Grouping

In [None]:
# Primary constituents: the columns tagged 'primary', keeping their first
# three attribute slots.
primary = attributes[attributes[-1] == 'primary'].iloc[:, 1:4]
primary.columns = ['constituent', 'location', 'modification']

# Columns that share constituent and location are summed into one subtotal.
grouped_primary = primary.groupby(['constituent', 'location'])

for name, group in grouped_primary:
    if len(group) <= 1:
        continue  # a single column needs no subtotal

    # Row labels of `attributes` are the positions of the columns in `dataset`.
    column_positions = group.index.values
    result_dataset['[primary-subtotal]' + ' - '.join(name)] = dataset.iloc[:, column_positions].sum(axis=1)

    if REPLACE_SUBTOTALS:
        # Drop by the real header names taken from `dataset`. Re-joining the
        # parsed attributes is not reliable, because bracketed text is stripped
        # from the first attribute and the rebuilt label may no longer match
        # an existing column.
        result_dataset = result_dataset.drop(list(dataset.columns[column_positions]), axis=1)

## Diagenesis Subtotals Grouping

In [None]:
# Diagenetic constituents: the columns tagged 'diagenetic', keeping every
# attribute slot after the leading type column.
is_diagenetic = attributes[-1] == 'diagenetic'
diagenese = attributes.loc[is_diagenetic].iloc[:, 1:]
diagenese.columns = [
    'consituent',
    'habit',
    'location',
    'modification',
    'paragenetic relation',
    'paragenetic relation constituents',
    'paragenetic relation constituent location',
]

def define_diagenetic_macro_location(line):
    """Return the diagenetic attribute row with its macro location appended.

    The macro location is resolved in this order:

    1. the row's 'paragenetic relation constituent location', when it is
       already one of `macro_locations`;
    2. the `location` of the `diagenese_mapping` rows whose `VALUE_<IDIOM>`
       equals the row's last attribute;
    3. the same lookup using the row's third attribute (named 'location' by
       the caller).

    Args:
        line: pandas Series holding the seven diagenetic attributes of one
            dataset column; it must carry the label
            'paragenetic relation constituent location'.

    Returns:
        pandas Series with the seven original values followed by the macro
        location.

    Raises:
        ValueError: if the lookups do not yield exactly one macro location
            (no match at all, or more than one matching mapping row).
    """
    return_line = line.tolist()
    # Attributes are read by position with `.iloc`: integer keys on a
    # string-labelled Series rely on a deprecated positional fallback.
    if line['paragenetic relation constituent location'] in macro_locations:
        return_line.append(line.iloc[6])
    else:
        value_column = 'VALUE_' + IDIOM
        query = diagenese_mapping[diagenese_mapping[value_column] == line.iloc[-1]]
        if not query.empty:
            return_line += query['location'].tolist()
        else:
            query = diagenese_mapping[diagenese_mapping[value_column] == line.iloc[2]]
            if not query.empty:
                # Trace of rows resolved through the fallback lookup.
                print(query['location'].tolist())
                return_line += query['location'].tolist()

    if len(return_line) != 8:
        raise ValueError('Something is wrong in line: '+str(line.tolist())+', generated:'+str(return_line))

    return pd.Series(return_line)
        
# Append the resolved macro location to every diagenetic row; the result has
# positional column labels, so the names are assigned again with the extra
# 'macro location' column.
diagenese = pd.DataFrame(diagenese.apply(define_diagenetic_macro_location, axis=1))
diagenese.columns = [
    'consituent',
    'habit',
    'location',
    'modification',
    'paragenetic relation',
    'paragenetic relation constituents',
    'paragenetic relation constituent location',
    'macro location',
]

# Columns that share constituent, habit and macro location are summed into one
# subtotal.
grouped_diagenese = diagenese.groupby(['consituent', 'habit', 'macro location'])
for name, group in grouped_diagenese:
    if len(group) <= 1:
        continue  # a single column needs no subtotal

    # Row labels of `attributes` (kept through the filtering and `apply`) are
    # the positions of the columns in `dataset`.
    column_positions = group.index.values
    result_dataset['[diagenese-subtotal]' + ' - '.join(name)] = dataset.iloc[:, column_positions].sum(axis=1)

    if REPLACE_SUBTOTALS:
        # Drop by the real header names taken from `dataset`. Re-joining the
        # parsed attributes is not reliable, because bracketed text is stripped
        # from the first attribute and the rebuilt label may no longer match
        # an existing column.
        result_dataset = result_dataset.drop(list(dataset.columns[column_positions]), axis=1)

## Porosity Subtotals Grouping

In [None]:
# Porosity features: the columns tagged 'porosity', keeping their six
# attribute slots.
porosity = attributes[attributes[-1] == 'porosity'].iloc[:, 1:7]
# Exactly six names for the six selected columns (assigning seven raises a
# length-mismatch error). The first column is named 'porosity' because the
# grouping below keys on it.
# NOTE(review): only 'porosity' and 'location' are used below; the other four
# labels assume the diagenetic layout without 'habit'. Confirm against the
# dataset's porosity header schema.
porosity.columns = [
    'porosity',
    'location',
    'modification',
    'paragenetic relation',
    'paragenetic relation constituents',
    'paragenetic relation constituent location',
]

# Columns that share porosity and location are summed into one subtotal.
grouped_porosity = porosity.groupby(['porosity', 'location'])
for name, group in grouped_porosity:
    if len(group) <= 1:
        continue  # a single column needs no subtotal

    # Row labels of `attributes` are the positions of the columns in `dataset`.
    column_positions = group.index.values
    result_dataset['[porosity-subtotal]' + ' - '.join(name)] = dataset.iloc[:, column_positions].sum(axis=1)

    if REPLACE_SUBTOTALS:
        # Drop by the real header names taken from `dataset`. Re-joining the
        # parsed attributes is not reliable, because bracketed text is stripped
        # from the first attribute and the rebuilt label may no longer match
        # an existing column.
        result_dataset = result_dataset.drop(list(dataset.columns[column_positions]), axis=1)

# Output File

In [None]:
import csv
# Write the dataset with its subtotal columns next to the input file. Every
# non-numeric field is quoted and the row index is not written.
output_path = join(TARGET_PATH, 'dataset_subtotals.csv')
result_dataset.to_csv(
    output_path,
    sep=',',
    encoding='utf-8',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)