From 2437daea478fb225acdef25260f67295ea99548b Mon Sep 17 00:00:00 2001
From: "G. H. Erharter" <32653899+geograz@users.noreply.github.com>
Date: Thu, 10 Aug 2023 18:11:34 +0200
Subject: [PATCH] removing experimental code from main branch

---
 src/E_rasterizer.py                  | 143 -------------
 src/F_feature_engineering.py         | 120 ----------
 src/X_feature_engineering_library.py | 288 --------------------------
 3 files changed, 551 deletions(-)
 delete mode 100644 src/E_rasterizer.py
 delete mode 100644 src/F_feature_engineering.py
 delete mode 100644 src/X_feature_engineering_library.py

diff --git a/src/E_rasterizer.py b/src/E_rasterizer.py
deleted file mode 100644
index 8ecbc22..0000000
--- a/src/E_rasterizer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that performs different raster based analyses on the dataset
-(results not included in first paper).
-"""
-
-import gc
-import numpy as np
-import open3d as o3d
-from os import listdir
-
-from X_library import parameters, utilities
-
-
-#############################
-# static variables and constants
-
-N_SETS_TO_PROCESS = 142  # max number of sets to process in this run
-
-TOT_BBOX_SIZE = 10  # total bounding box size [m]
-UNIT_BOX_SIZE = 1  # size of a measurement box [m]
-RESOLUTION = 256  # 3D grid resolution, 256
-voxel_size = TOT_BBOX_SIZE / RESOLUTION  # .05
-color = 1
-DICE_THRESH = 0.75  # threshold of dice coefficient that indicates similarity
-# run code for random- or sequential unprocessed samples -> multiprocessing
-MODE = 'sequential'  # 'random', 'sequential'
-COMPUTE_SIMILARITY = False  # whether or not self similarity should be computed
-
-#############################
-# processed variables and constants and instantiations
-
-print(f'Rasteranalysis in {MODE} mode')
-params = parameters()
-utils = utilities()
-
-# collect all discontinuity ids
-ids = [c.split('_')[0] for c in listdir(r'../combinations') if 'discontinuities' in c]
-
-#############################
-# main loop
-
-processed_sets = 0
-while processed_sets < N_SETS_TO_PROCESS:
-
-    already_processed = [c.split('_')[0] for c in listdir(r'../combinations') if '_RasterAnalysis' in c]
-    ids_unprocessed = np.where(np.in1d(ids, already_processed) == False)[0]
-
-    if MODE == 'random':
-        set_id = np.random.choice(ids_unprocessed, size=1)[0]
-    elif MODE == 'sequential':
-        set_id = ids_unprocessed[0]
-
-    name = f'{ids[set_id]}_RasterAnalysis.txt'
-
-    print(f'\nprocessing set {ids[set_id]}')
-    discontinuities = o3d.io.read_triangle_mesh(fr'../combinations/{ids[set_id]}_discontinuities.stl')
-    print('\tdiscontinuities loaded')
-
-    boxes_mesh = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(
-        input=discontinuities, voxel_size=voxel_size,
-        min_bound=np.array([0, 0, 0]),
-        max_bound=np.array([TOT_BBOX_SIZE, TOT_BBOX_SIZE, TOT_BBOX_SIZE]))
-    del discontinuities
-    print('\tdiscontinuity voxels created')
-
-    boxes_all = o3d.geometry.VoxelGrid.create_dense(width=TOT_BBOX_SIZE,
-                                                    height=TOT_BBOX_SIZE,
-                                                    depth=TOT_BBOX_SIZE,
-                                                    voxel_size=voxel_size,
-                                                    origin=np.array([0, 0, 0]),
-                                                    color=[color, color, color])
-    print('\toverall voxels created')
-
-    combined = boxes_mesh + boxes_all
-    del boxes_mesh
-    del boxes_all
-    gc.collect()
-
-    # convert voxels to 3D numpy array
-    gridded_voxels = utils.voxel2grid(combined, RESOLUTION, color)
-    gc.collect()
-
-    # compute structural complexity acc. to Bagrov et al. (2020)
-    c = params.structural_complexity(gridded_voxels, mode='3Dgrid')
-    print(f'\tstructural complexity: {c}')
-
-    if COMPUTE_SIMILARITY is True:
-        # compute number of voxels per unit box, rounded down to avoid boundary
-        # issues
-        resolution = int(UNIT_BOX_SIZE/voxel_size)
-        intervalls = np.arange(0, TOT_BBOX_SIZE*resolution, resolution)
-
-        # count how many blocks are similar to each other
-        similarity_count = []
-        for x1 in intervalls:
-            for y1 in intervalls:
-                for z1 in intervalls:
-                    test_block1 = gridded_voxels[x1:x1+resolution,
-                                                 y1:y1+resolution,
-                                                 z1:z1+resolution]
-                    dices = []
-                    for x2 in intervalls:
-                        for y2 in intervalls:
-                            for z2 in intervalls:
-                                test_block2 = gridded_voxels[x2:x2+resolution,
-                                                             y2:y2+resolution,
-                                                             z2:z2+resolution]
-                                # n elements same in array 1 and 2
-                                n_same = (test_block1 == test_block2).sum()
-                                # compute dice coefficient of similarity
-                                dice = (2*n_same) / (resolution**3 + resolution**3)
-                                dices.append(dice)
-                    # -1 due to the perfect self similarity
-                    similarity_count.append(len(np.where(np.array(dices) > DICE_THRESH)[0])-1)
-
-        n_zeros = len(np.where(np.array(similarity_count) == 0)[0])
-        max_ = max(similarity_count)
-        min_ = min(similarity_count)
-        mean = np.mean(similarity_count)
-        med = np.median(similarity_count)
-        del similarity_count
-    else:
-        n_zeros, max_, min_, mean, med = np.nan, np.nan, np.nan, np.nan, np.nan
-
-    del gridded_voxels
-    gc.collect()
-
-    with open(fr'../combinations/{name}', 'w') as f:
-        f.write(f'{n_zeros},{max_},{min_},{mean},{med},{c}')
-
-    print(f'\t{name} finished')
-    processed_sets += 1
-
-    # check if all files were processed
-    n_finished = [d for d in listdir(r'../combinations') if '_RasterAnalysis' in d]
-    if len(n_finished) == len(ids):
-        print('!!ALL FILES PROCESSED!!')
-        break
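
For reference, the self-similarity test in the deleted script reduces to a Dice-style overlap between two equally sized binary voxel blocks. Below is a minimal, self-contained sketch of that comparison in plain NumPy; the random blocks stand in for the voxelized unit boxes, since parameters and utilities from X_library are not part of this patch:

import numpy as np

def dice_overlap(block1: np.ndarray, block2: np.ndarray) -> float:
    # written as in the deleted loop:
    # 2 * (number of equal cells) / (cells in block1 + cells in block2),
    # i.e. the fraction of agreeing cells
    n_same = (block1 == block2).sum()
    return (2 * n_same) / (block1.size + block2.size)

rng = np.random.default_rng(42)
a = rng.integers(0, 2, size=(25, 25, 25))  # 25 = int(1 m / 0.0390625 m) voxels per unit box
b = rng.integers(0, 2, size=(25, 25, 25))
print(dice_overlap(a, a))  # 1.0 -> the perfect self similarity the code subtracts
print(dice_overlap(a, b))  # ~0.5 for two independent random blocks

Every unit box is compared against every other one, so a 10 m box with 1 m units means 10^3 x 10^3 = 10^6 block comparisons per sample, which presumably is why COMPUTE_SIMILARITY defaults to False.
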
diff --git a/src/F_feature_engineering.py b/src/F_feature_engineering.py
deleted file mode 100644
index 1331062..0000000
--- a/src/F_feature_engineering.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that investigates different ways of developing new parameters for rock
-mass characterization
-(results not included in first paper).
-"""
-
-import numpy as np
-import pandas as pd
-from os import listdir
-
-from X_library import utilities, plotter
-from X_feature_engineering_library import feature_engineer
-
-
-###########################################
-# fixed and constant variables
-###########################################
-
-BATCH_SIZE = 10_000_000  # batch size for the third level feature analysis
-TARGETS = ['structural complexity', 'Jv measured [discs/m³]', 'Minkowski',
-           'P32']
-BASE_FEATURES = [  # 'set 1 - radius [m]', 'set 2 - radius [m]',
-                 # 'set 3 - radius [m]', 'random set - radius [m]',
-                 # 'meas. spacing set 1 [m]', 'meas. spacing set 2 [m]',
-                 # 'meas. spacing set 3 [m]',
-                 'm_length', 'n_intersections',
-                 # 'n_discs',
-                 # 'avg. RQD',
-                 # 'avg. P10', 'avg. app. spacing [m]',
-                 ]
-TARGET_3RD_LEVEL = 'Jv'  # 'struct', 'mink', 'Jv', 'P32'
-MODE = 'evaluation'  # 'structure', 'scores', 'evaluation'
-N_TOP_SCORES = 20

-###########################################
-# data loading and other preprocessing
-###########################################
-
-# instantiation
-fe = feature_engineer()
-utils = utilities()
-pltr = plotter()
-
-# load data
-df = pd.read_excel(r'../output/dataset1.xlsx')
-print(len(df))
-
-###########################################
-# first and second level feature generation
-###########################################
-
-# make first level features
-df = fe.make_1st_level_features(df, features=BASE_FEATURES,
-                                operations=['log', 'sqrt', 'sqr', 'power_3',
-                                            'mult10', 'div10', '1div', 'div2'],
-                                drop_empty=True)
-l1_features = [f for f in df.columns if '-l1' in f]
-
-# make second level features
-df = fe.make_2nd_level_features(df, features=BASE_FEATURES + l1_features,
-                                drop_empty=True)
-l2_features = [f for f in df.columns if '-l2' in f]
-
-all_features = BASE_FEATURES + l1_features + l2_features
-print(len(all_features))
-
-# # check how well all features so far fit to the given targets
-# scores_struct, scores_Jv, scores_mink, scores_P32 = utils.assess_fits(
-#     df, features=all_features, targets=TARGETS)
-# struct_best, struct_max = utils.get_best_feature(scores_struct, all_features)
-# Jv_best, Jv_max = utils.get_best_feature(scores_Jv, all_features)
-# mink_best, mink_max = utils.get_best_feature(scores_mink, all_features)
-# P32_best, P32_max = utils.get_best_feature(scores_P32, all_features)
-
-# print(f'struct: {round(struct_max, 3)}, Jv: {round(Jv_max, 3)}, Mink: {round(mink_max, 3)}, P32: {round(P32_max, 3)}')
-
-###########################################
-# third level feature processing
-###########################################
-
-# generate all possible combinations of third level features
-if MODE == 'structure':
-    fe.gen_3rd_level_structure(all_features, list(fe.fusions3.keys()),
-                               batch_size=BATCH_SIZE)
-# compute fitting score for several batches -> can be done in parallel
-elif MODE == 'scores':
-    for batch in np.arange(90000000, 1000000000, step=10_000_000)[0:]:
-        print(f'process batch {batch}')
-        filename = f'{batch}_{TARGET_3RD_LEVEL}_score.gzip'
-        if filename in listdir(r'../features'):
-            print(f'{filename} already processed -> skip')
-            pass
-        else:
-            print(f'process {filename}')
-            savepath = fr'../features/{filename}'
-            fe.assess_3rd_level_features(batch, df, all_features,
-                                         target=TARGET_3RD_LEVEL,
-                                         savepath=savepath)
-# find best performing combinations of parameters
-elif MODE == 'evaluation':
-
-    filepaths = [fr'../features/{f}' for f in listdir(r'../features') if f'{TARGET_3RD_LEVEL}_score' in f]
-
-    result = fe.get_top_n_scores_in_files(n=N_TOP_SCORES, file_paths=filepaths)
-
-    best_comb = result[0]
-
-    best_comb_equation = fe.decode_combination(best_comb, all_features)
-
-    print(TARGET_3RD_LEVEL, best_comb['scores'])
-    print(best_comb_equation)
-
-    pltr.top_x_barplot(values=[r['scores'] for r in result],
-                       labels=[fe.decode_combination(r, all_features) for r in result],
-                       title='test')
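
The 'structure' mode above enumerates every ordered triple of features combined by one of the four third-level operations, so the search space is len(all_features)**3 * 4. A quick back-of-the-envelope check (the feature count is illustrative only; the real number depends on the dataset and on how many level 1/2 features survive drop_empty):

n_features = 1500        # assumed count of base + level 1 + level 2 features
n_operations = 4         # fusions3: plus, times, minus, dividedby
BATCH_SIZE = 10_000_000

tot_combs = n_features**3 * n_operations
print(f'{round(tot_combs / 1_000_000_000, 3)} billion combinations')  # 13.5
print(f'{int(tot_combs / BATCH_SIZE)} batch files')                   # 1350

Numbers of this magnitude are why the scoring step is batched into parquet files that can be processed in parallel, and why only combinations scoring above 0.5 are kept on disk.
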
diff --git a/src/X_feature_engineering_library.py b/src/X_feature_engineering_library.py
deleted file mode 100644
index e56bb70..0000000
--- a/src/X_feature_engineering_library.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that contains functions to develop new parameters for rock mass
-characterization.
-(results not included in first paper)
-"""
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from X_library import utilities
-
-
-class operations:
-
-    def __init__(self):
-        pass
-
-    def power_3(self, x):
-        return x**3
-
-    def multiply2(self, x):
-        return x*2
-
-    def multiply3(self, x):
-        return x*3
-
-    def multiply10(self, x):
-        return x*10
-
-    def div2(self, x):
-        return x/2
-
-    def div10(self, x):
-        return x/10
-
-    def _1div(self, x):
-        return 1/x
-
-    def add_2(self, x, y):
-        return x + y
-
-    def subtract_2(self, x, y):
-        return x - y
-
-    def multiply_2(self, x, y):
-        return x * y
-
-    def divide_2(self, x, y):
-        return x / y
-
-    def power(self, x, y):
-        return x**y
-
-    def add_3(self, x, y, z):
-        return x + y + z
-
-    def minus_3(self, x, y, z):
-        return x - y - z
-
-    def multiply_3(self, x, y, z):
-        return x * y * z
-
-    def div_3(self, x, y, z):
-        return x / y / z
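
# The methods above all broadcast element-wise over NumPy arrays and pandas
# Series, which is what lets feature_engineer fuse whole dataframe columns
# without explicit loops. A quick illustration, assuming the class above is
# importable:
import pandas as pd

ops = operations()
x = pd.Series([1.0, 2.0, 3.0])
y = pd.Series([4.0, 5.0, 6.0])
print(ops.power_3(x).tolist())      # [1.0, 8.0, 27.0]
print(ops.divide_2(x, y).tolist())  # [0.25, 0.4, 0.5]
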
-
-
-class feature_engineer(operations):
-
-    def __init__(self):
-
-        self.utils = utilities()
-
-        # collection of operations to apply to make first level features
-        self.transformations = {'log': np.log, 'sqrt': np.sqrt,
-                                'sqr': np.square, 'exp': np.exp,
-                                'power_3': self.power_3,
-                                'mult2': self.multiply2,
-                                'mult3': self.multiply3,
-                                'mult10': self.multiply10,
-                                'div2': self.div2, 'div10': self.div10,
-                                '1div': self._1div}
-        # collection of operations to apply to make second level features
-        self.fusions2 = {'plus': self.add_2, 'minus': self.subtract_2,
-                         'times': self.multiply_2, 'dividedby': self.divide_2,
-                         'power': self.power}
-        # collection of operations to apply to make third level features
-        self.fusions3 = {'plus': self.add_3, 'times': self.multiply_3,
-                         'minus': self.minus_3, 'dividedby': self.div_3}
-        # operations that are commutative e.g. a + b = b + a -> redundant
-        self.commutative = ('plus', 'times')
-
-    def drop_no_information_cols(self, df: pd.DataFrame,
-                                 nan_threshold: int = 300) -> pd.DataFrame:
-        '''function removes columns from dataframe without information or too
-        many NAN'''
-        id_0 = np.where(df.sum(axis=0).values == 0)[0]
-        df.drop(columns=df.columns[id_0], inplace=True)
-        id_nan = np.where(df.isna().sum().values > nan_threshold)[0]
-        df.drop(columns=df.columns[id_nan], inplace=True)
-        return df
-
-    def gen_3rd_level_structure(self, features, operations, batch_size):
-        n_features = tuple(range(len(features)))
-        n_operations = tuple(range(len(operations)))
-
-        tot_combs = len(features)*len(features)*len(features)*len(operations)
-        print(f'{round(tot_combs / 1_000_000_000, 3)} billion combinations')
-        print(f'{int(tot_combs/batch_size)} files will be saved')
-
-        counter = 0
-        i_s, j_s, k_s, l_s = [], [], [], []
-        while counter < batch_size:
-            for i in n_features:
-                for j in n_features:
-                    for k in n_features:
-                        for l in n_operations:
-                            i_s.append(i)
-                            j_s.append(j)
-                            k_s.append(k)
-                            l_s.append(l)
-                            counter += 1
-                            if counter % batch_size == 0 or counter == tot_combs-1:
-                                df = pd.DataFrame({'feature i': np.array(i_s).astype(np.int16),
-                                                   'feature j': np.array(j_s).astype(np.int16),
-                                                   'feature k': np.array(k_s).astype(np.int16),
-                                                   'operation': np.array(l_s).astype(np.int8)})
-                                df.to_parquet(fr'../features/{counter}.gzip',
-                                              index=False)
-                                i_s.clear()
-                                j_s.clear()
-                                k_s.clear()
-                                l_s.clear()
-
-    def assess_3rd_level_features(self, filename, df, features, target,
-                                  savepath):
-        df_indices = pd.read_parquet(fr'../features/{filename}.gzip')
-        operations = list(self.fusions3.keys())
-
-        scores = []
-        for i in tqdm(range(len(df_indices))):
-            x, y, z, f = df_indices.iloc[i]
-            x, y, z, f = features[x], features[y], features[z], operations[f]
-            new_feature = self.fusions3[f](df[x], df[y], df[z]).values
-
-            new_feature = self.utils.convert_inf(new_feature)
-            if np.isnan(new_feature).sum() > 0:
-                # pass if data contains nan
-                score = np.nan
-            else:
-                if target == 'struct':
-                    score = self.utils.assess_fit2(
-                        df['structural complexity'].values, y=new_feature,
-                        scale_indiv=True)
-                elif target == 'mink':
-                    score = self.utils.assess_fit2(
-                        df['Minkowski'].values, y=new_feature,
-                        scale_indiv=True)
-                elif target == 'Jv':
-                    score = self.utils.assess_fit2(
-                        df['Jv measured [discs/m³]'].values, y=new_feature,
-                        scale_indiv=False)
-                elif target == 'P32':
-                    score = self.utils.assess_fit2(
-                        df['P32'].values, y=new_feature,
-                        scale_indiv=False)
-                else:
-                    raise ValueError(f'{target} as target not available!')
-            scores.append(score)
-        df_indices['scores'] = np.array(scores).astype(np.float32)
-        # only save results with a score > R2 = 0.5 to save space
-        df_indices = df_indices[df_indices['scores'] > 0.5]
-        df_indices.to_parquet(savepath, index=False)
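
# assess_fit2 lives in X_library, which this patch does not touch. For
# readers without that module, a plausible stand-in is a plain coefficient
# of determination, consistent with the "score > R2 = 0.5" filter above;
# the scale_indiv handling here is an assumption, not the confirmed
# implementation:
import numpy as np

def assess_fit2_stub(x: np.ndarray, y: np.ndarray,
                     scale_indiv: bool = False) -> float:
    if scale_indiv:
        # min-max scale both series to [0, 1] individually
        x = (x - x.min()) / (x.max() - x.min())
        y = (y - y.min()) / (y.max() - y.min())
    ss_res = np.sum((x - y)**2)         # residuals of the candidate feature
    ss_tot = np.sum((x - x.mean())**2)  # total variance of the target
    return 1 - ss_res / ss_tot
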
-
-    def make_1st_level_features(self, df, features: list = None,
-                                operations: list = None,
-                                drop_empty: bool = False) -> pd.DataFrame:
-        '''function that computes new features based on single features only.
-        Unless specified the function computes new features for all columns of
-        the dataframe and also uses all possible operations.'''
-        if features is None:
-            features = df.columns
-        if operations is None:
-            operations = self.transformations.keys()
-
-        new_headers = []
-        new_cols = []
-        for f in features:
-            for t in operations:
-                new_headers.append(f'{t}_{f}-l1')
-                new_cols.append(self.transformations[t](df[f]))
-        df_temp = pd.DataFrame(columns=new_headers,
-                               data=np.array(new_cols).T,
-                               index=df.index)
-        df = pd.concat([df, df_temp], axis=1)
-
-        if drop_empty is True:
-            df = self.drop_no_information_cols(df)
-        print('level 1 features computed', len(df.columns))
-        return df
-
-    def make_2nd_level_features(self, df, features: list = None,
-                                operations: list = None,
-                                drop_empty: bool = False) -> pd.DataFrame:
-        '''function that computes new features based on all unique combinations
-        of 2 features in the dataframe.
-        Unless specified the function computes new features for all columns of
-        the dataframe and also uses all possible operations.'''
-        if features is None:
-            features = df.columns
-        if operations is None:
-            operations = self.fusions2.keys()
-
-        new_headers = []
-        new_cols = []
-        for i, x in enumerate(features):
-            for j, y in enumerate(features):
-                if i == j:  # avoid duplicates
-                    pass
-                else:
-                    for f in operations:
-                        if f in self.commutative and j > i:
-                            # avoid making duplicate features due to
-                            # commutative operations
-                            pass
-                        else:
-                            new_headers.append(f'{x}_{f}_{y}-l2')
-                            new_cols.append(self.fusions2[f](df[x], df[y]))
-        # check if duplicates were generated
-        if len(new_headers) != len(set(new_headers)):
-            raise ValueError('duplicate features generated in second level')
-        else:
-            df_temp = pd.DataFrame(columns=new_headers,
-                                   data=np.array(new_cols).T,
-                                   index=df.index)
-            df = pd.concat([df, df_temp], axis=1)
-
-        if drop_empty is True:
-            df = self.drop_no_information_cols(df)
-        print('level 2 features computed', len(df.columns))
-        return df
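
# How many level 2 features the method above generates from n inputs,
# before drop_no_information_cols prunes them: the two commutative
# operations (plus, times) run over unordered pairs only, the other three
# (minus, dividedby, power) over all ordered pairs.
n = 20
commutative = 2 * (n * (n - 1) // 2)
ordered = 3 * n * (n - 1)
print(commutative + ordered)  # 1520, i.e. 4 * n * (n - 1) in total
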
-
-    def get_top_n_scores_in_files(self, n, file_paths):
-        '''function iterates files with scores and only saves the top n scores
-        and combinations for further analyses.
-        original function made by ChatGPT'''
-        top_scores = []
-        for file_path in file_paths:
-            df = pd.read_parquet(file_path)
-            for idx, row in df.iterrows():
-                score = row['scores']
-                if len(top_scores) < n:
-                    top_scores.append((score, row))
-                    top_scores.sort(reverse=True, key=lambda x: x[0])
-                elif score > top_scores[-1][0]:
-                    top_scores[-1] = (score, row)
-                    top_scores.sort(reverse=True, key=lambda x: x[0])
-        return [x[1] for x in top_scores]
-
-    def decode_combination(self, combination: pd.Series, all_features: list):
-        f1 = all_features[int(combination['feature i'])]
-        f2 = all_features[int(combination['feature j'])]
-        f3 = all_features[int(combination['feature k'])]
-        operation = list(self.fusions3.keys())[int(combination['operation'])]
-        comb_string = f'{f1} {operation} {f2} {operation} {f3}'
-        return comb_string
-
-
-# example usage of code
-if __name__ == '__main__':
-
-    # create dummy data
-    df = pd.DataFrame({'x': np.random.uniform(0, 1, 100),
-                       'y': np.random.uniform(0, 1, 100),
-                       'z': np.random.uniform(0, 1, 100)})
-
-    # instantiate feature engineer
-    fe = feature_engineer()
-    # create first level features
-    df = fe.make_1st_level_features(df, features=['x', 'y'],
-                                    operations=['log', 'sqrt'])
-    # create second level features of all previous ones
-    df = fe.make_2nd_level_features(df)
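
Since get_top_n_scores_in_files re-sorts its running list after every accepted row, a fixed-size heap performs the same selection with O(log n) work per row instead. A sketch of that alternative, assuming the same parquet layout with a 'scores' column; the counter is a tiebreaker so equal scores never force a comparison between pandas rows:

import heapq
import itertools
import pandas as pd

def get_top_n_scores_heap(n, file_paths):
    heap, counter = [], itertools.count()
    for file_path in file_paths:
        df = pd.read_parquet(file_path)
        for _, row in df.iterrows():
            item = (row['scores'], next(counter), row)
            if len(heap) < n:
                heapq.heappush(heap, item)     # fill the heap up to n items
            elif item[0] > heap[0][0]:
                heapq.heapreplace(heap, item)  # evict the current minimum
    # sort descending by score for the same output ordering as the original
    return [row for _, _, row in sorted(heap, reverse=True)]

For fully vectorized code, pandas' DataFrame.nlargest(n, 'scores') applied per file, with the concatenated result reduced once more, would avoid iterrows() entirely.
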