From 2437daea478fb225acdef25260f67295ea99548b Mon Sep 17 00:00:00 2001
From: "G. H. Erharter" <32653899+geograz@users.noreply.github.com>
Date: Thu, 10 Aug 2023 18:11:34 +0200
Subject: [PATCH] removing experimental code from main branch

---
 src/E_rasterizer.py                  | 143 -------------
 src/F_feature_engineering.py         | 120 ----------
 src/X_feature_engineering_library.py | 288 --------------------------
 3 files changed, 551 deletions(-)
 delete mode 100644 src/E_rasterizer.py
 delete mode 100644 src/F_feature_engineering.py
 delete mode 100644 src/X_feature_engineering_library.py

diff --git a/src/E_rasterizer.py b/src/E_rasterizer.py
deleted file mode 100644
index 8ecbc22..0000000
--- a/src/E_rasterizer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that performs different raster based analyses on the dataset
-(results not included in first paper).
-"""
-
-import gc
-import numpy as np
-import open3d as o3d
-from os import listdir
-
-from X_library import parameters, utilities
-
-
-#############################
-# static variables and constants
-
-N_SETS_TO_PROCESS = 142  # max number of sets to process in this run
-
-TOT_BBOX_SIZE = 10  # total bounding box size [m]
-UNIT_BOX_SIZE = 1  # size of a measurement box [m]
-RESOLUTION = 256  # 3D grid resolution, 256
-voxel_size = TOT_BBOX_SIZE / RESOLUTION  # .05
-color = 1
-DICE_THRESH = 0.75  # threshold of dice coefficient that indicates similarity
-# run code for random- or sequential unprocessed samples -> multiprocessing
-MODE = 'sequential'  # 'random', 'sequential'
-COMPUTE_SIMILARITY = False  # whether or not self similarity should be computed
-
-#############################
-# processed variables and constants and instantiations
-
-print(f'Rasteranalysis in {MODE} mode')
-params = parameters()
-utils = utilities()
-
-# collect all discontinuity ids
-ids = [c.split('_')[0] for c in listdir(r'../combinations') if 'discontinuities' in c]
-
-#############################
-# main loop
-
-processed_sets = 0
-while processed_sets < N_SETS_TO_PROCESS:
-
-    already_processed = [c.split('_')[0] for c in listdir(r'../combinations') if '_RasterAnalysis' in c]
-    ids_unprocessed = np.where(np.in1d(ids, already_processed) == False)[0]
-
-    if MODE == 'random':
-        set_id = np.random.choice(ids_unprocessed, size=1)[0]
-    elif MODE == 'sequential':
-        set_id = ids_unprocessed[0]
-
-    name = f'{ids[set_id]}_RasterAnalysis.txt'
-
-    print(f'\nprocessing set {ids[set_id]}')
-    discontinuities = o3d.io.read_triangle_mesh(fr'../combinations/{ids[set_id]}_discontinuities.stl')
-    print('\tdiscontinuities loaded')
-
-    boxes_mesh = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(
-        input=discontinuities, voxel_size=voxel_size,
-        min_bound=np.array([0, 0, 0]),
-        max_bound=np.array([TOT_BBOX_SIZE, TOT_BBOX_SIZE, TOT_BBOX_SIZE]))
-    del discontinuities
-    print('\tdiscontinuity voxels created')
-
-    boxes_all = o3d.geometry.VoxelGrid.create_dense(width=TOT_BBOX_SIZE,
-                                                    height=TOT_BBOX_SIZE,
-                                                    depth=TOT_BBOX_SIZE,
-                                                    voxel_size=voxel_size,
-                                                    origin=np.array([0, 0, 0]),
-                                                    color=[color, color, color])
-    print('\toverall voxels created')
-
-    combined = boxes_mesh + boxes_all
-    del boxes_mesh
-    del boxes_all
-    gc.collect()
-
-    # convert voxels to 3D numpy array
-    gridded_voxels = utils.voxel2grid(combined, RESOLUTION, color)
-    gc.collect()
-
-    # compute structural complexity acc. to Bagrov et al. (2020)
-    c = params.structural_complexity(gridded_voxels, mode='3Dgrid')
-    print(f'\tstructural complexity: {c}')
-
-    if COMPUTE_SIMILARITY is True:
-        # compute number of voxels per unit box, rounded down to avoid boundary
-        # issues
-        resolution = int(UNIT_BOX_SIZE/voxel_size)
-        intervalls = np.arange(0, TOT_BBOX_SIZE*resolution, resolution)
-
-        # count how many blocks are similar to each other
-        similarity_count = []
-        for x1 in intervalls:
-            for y1 in intervalls:
-                for z1 in intervalls:
-                    test_block1 = gridded_voxels[x1:x1+resolution,
-                                                 y1:y1+resolution,
-                                                 z1:z1+resolution]
-                    dices = []
-                    for x2 in intervalls:
-                        for y2 in intervalls:
-                            for z2 in intervalls:
-                                test_block2 = gridded_voxels[x2:x2+resolution,
-                                                             y2:y2+resolution,
-                                                             z2:z2+resolution]
-                                # n elements same in array 1 and 2
-                                n_same = (test_block1 == test_block2).sum()
-                                # compute dice coefficient of similarity
-                                dice = (2*n_same) / (resolution**3 + resolution**3)
-                                dices.append(dice)
-                    # -1 due to the perfect self similarity
-                    similarity_count.append(len(np.where(np.array(dices) > DICE_THRESH)[0])-1)
-
-        n_zeros = len(np.where(np.array(similarity_count) == 0)[0])
-        max_ = max(similarity_count)
-        min_ = min(similarity_count)
-        mean = np.mean(similarity_count)
-        med = np.median(similarity_count)
-        del similarity_count
-    else:
-        n_zeros, max_, min_, mean, med = np.nan, np.nan, np.nan, np.nan, np.nan
-
-    del gridded_voxels
-    gc.collect()
-
-    with open(fr'../combinations/{name}', 'w') as f:
-        f.write(f'{n_zeros},{max_},{min_},{mean},{med},{c}')
-
-    print(f'\t{name} finished')
-    processed_sets += 1
-
-    # check if all files were processed
-    n_finished = [d for d in listdir(r'../combinations') if '_RasterAnalysis' in d]
-    if len(n_finished) == len(ids):
-        print('!!ALL FILES PROCESSED!!')
-        break
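
For reference, the self-similarity test in the deleted script reduces to a Dice-style overlap between two equally sized binary voxel blocks. Below is a minimal, self-contained sketch of that comparison in plain NumPy; the random blocks stand in for the voxelized unit boxes, since parameters and utilities from X_library are not part of this patch:

import numpy as np

def dice_overlap(block1: np.ndarray, block2: np.ndarray) -> float:
    # written as in the deleted loop:
    # 2 * (number of equal cells) / (cells in block1 + cells in block2),
    # i.e. the fraction of agreeing cells
    n_same = (block1 == block2).sum()
    return (2 * n_same) / (block1.size + block2.size)

rng = np.random.default_rng(42)
a = rng.integers(0, 2, size=(25, 25, 25))  # 25 = int(1 m / 0.0390625 m) voxels per unit box
b = rng.integers(0, 2, size=(25, 25, 25))
print(dice_overlap(a, a))  # 1.0 -> the perfect self similarity the code subtracts
print(dice_overlap(a, b))  # ~0.5 for two independent random blocks

Every unit box is compared against every other one, so a 10 m box with 1 m units means 10^3 x 10^3 = 10^6 block comparisons per sample, which presumably is why COMPUTE_SIMILARITY defaults to False.
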
diff --git a/src/F_feature_engineering.py b/src/F_feature_engineering.py
deleted file mode 100644
index 1331062..0000000
--- a/src/F_feature_engineering.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that investigates different ways of developing new parameters for rock
-mass characterization
-(results not included in first paper).
-"""
-
-import numpy as np
-import pandas as pd
-from os import listdir
-
-from X_library import utilities, plotter
-from X_feature_engineering_library import feature_engineer
-
-
-###########################################
-# fixed and constant variables
-###########################################
-
-BATCH_SIZE = 10_000_000  # batch size for the third level feature analysis
-TARGETS = ['structural complexity', 'Jv measured [discs/m³]', 'Minkowski',
-           'P32']
-BASE_FEATURES = [  # 'set 1 - radius [m]', 'set 2 - radius [m]',
-                 # 'set 3 - radius [m]', 'random set - radius [m]',
-                 # 'meas. spacing set 1 [m]', 'meas. spacing set 2 [m]',
-                 # 'meas. spacing set 3 [m]',
-                 'm_length', 'n_intersections',
-                 # 'n_discs',
-                 # 'avg. RQD',
-                 # 'avg. P10', 'avg. app. spacing [m]',
-                 ]
-TARGET_3RD_LEVEL = 'Jv'  # 'struct', 'mink', 'Jv', 'P32'
-MODE = 'evaluation'  # 'structure', 'scores', 'evaluation'
-N_TOP_SCORES = 20

-###########################################
-# data loading and other preprocessing
-###########################################
-
-# instantiation
-fe = feature_engineer()
-utils = utilities()
-pltr = plotter()
-
-# load data
-df = pd.read_excel(r'../output/dataset1.xlsx')
-print(len(df))
-
-###########################################
-# first and second level feature generation
-###########################################
-
-# make first level features
-df = fe.make_1st_level_features(df, features=BASE_FEATURES,
-                                operations=['log', 'sqrt', 'sqr', 'power_3',
-                                            'mult10', 'div10', '1div', 'div2'],
-                                drop_empty=True)
-l1_features = [f for f in df.columns if '-l1' in f]
-
-# make second level features
-df = fe.make_2nd_level_features(df, features=BASE_FEATURES + l1_features,
-                                drop_empty=True)
-l2_features = [f for f in df.columns if '-l2' in f]
-
-all_features = BASE_FEATURES + l1_features + l2_features
-print(len(all_features))
-
-# # check how well all features so far fit to the given targets
-# scores_struct, scores_Jv, scores_mink, scores_P32 = utils.assess_fits(
-#     df, features=all_features, targets=TARGETS)
-# struct_best, struct_max = utils.get_best_feature(scores_struct, all_features)
-# Jv_best, Jv_max = utils.get_best_feature(scores_Jv, all_features)
-# mink_best, mink_max = utils.get_best_feature(scores_mink, all_features)
-# P32_best, P32_max = utils.get_best_feature(scores_P32, all_features)
-
-# print(f'struct: {round(struct_max, 3)}, Jv: {round(Jv_max, 3)}, Mink: {round(mink_max, 3)}, P32: {round(P32_max, 3)}')
-
-###########################################
-# third level feature processing
-###########################################
-
-# generate all possible combinations of third level features
-if MODE == 'structure':
-    fe.gen_3rd_level_structure(all_features, list(fe.fusions3.keys()),
-                               batch_size=BATCH_SIZE)
-# compute fitting score for several batches -> can be done in parallel
-elif MODE == 'scores':
-    for batch in np.arange(90000000, 1000000000, step=10_000_000)[0:]:
-        print(f'process batch {batch}')
-        filename = f'{batch}_{TARGET_3RD_LEVEL}_score.gzip'
-        if filename in listdir(r'../features'):
-            print(f'{filename} already processed -> skip')
-            pass
-        else:
-            print(f'process {filename}')
-            savepath = fr'../features/{filename}'
-            fe.assess_3rd_level_features(batch, df, all_features,
-                                         target=TARGET_3RD_LEVEL,
-                                         savepath=savepath)
-# find best performing combinations of parameters
-elif MODE == 'evaluation':
-
-    filepaths = [fr'../features/{f}' for f in listdir(r'../features') if f'{TARGET_3RD_LEVEL}_score' in f]
-
-    result = fe.get_top_n_scores_in_files(n=N_TOP_SCORES, file_paths=filepaths)
-
-    best_comb = result[0]
-
-    best_comb_equation = fe.decode_combination(best_comb, all_features)
-
-    print(TARGET_3RD_LEVEL, best_comb['scores'])
-    print(best_comb_equation)
-
-    pltr.top_x_barplot(values=[r['scores'] for r in result],
-                       labels=[fe.decode_combination(r, all_features) for r in result],
-                       title='test')
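
The 'structure' mode above enumerates every ordered triple of features combined by one of the four third-level operations, so the search space is len(all_features)**3 * 4. A quick back-of-the-envelope check (the feature count is illustrative only; the real number depends on the dataset and on how many level 1/2 features survive drop_empty):

n_features = 1500        # assumed count of base + level 1 + level 2 features
n_operations = 4         # fusions3: plus, times, minus, dividedby
BATCH_SIZE = 10_000_000

tot_combs = n_features**3 * n_operations
print(f'{round(tot_combs / 1_000_000_000, 3)} billion combinations')  # 13.5
print(f'{int(tot_combs / BATCH_SIZE)} batch files')                   # 1350

Numbers of this magnitude are why the scoring step is batched into parquet files that can be processed in parallel, and why only combinations scoring above 0.5 are kept on disk.
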
diff --git a/src/X_feature_engineering_library.py b/src/X_feature_engineering_library.py
deleted file mode 100644
index e56bb70..0000000
--- a/src/X_feature_engineering_library.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Experimental code to the paper XXXX
-Dr. Georg H. Erharter - 2023
-DOI: XXXXXXXXXXX
-
-Script that contains functions to develop new parameters for rock mass
-characterization.
-(results not included in first paper)
-"""
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from X_library import utilities
-
-
-class operations:
-
-    def __init__(self):
-        pass
-
-    def power_3(self, x):
-        return x**3
-
-    def multiply2(self, x):
-        return x*2
-
-    def multiply3(self, x):
-        return x*3
-
-    def multiply10(self, x):
-        return x*10
-
-    def div2(self, x):
-        return x/2
-
-    def div10(self, x):
-        return x/10
-
-    def _1div(self, x):
-        return 1/x
-
-    def add_2(self, x, y):
-        return x + y
-
-    def subtract_2(self, x, y):
-        return x - y
-
-    def multiply_2(self, x, y):
-        return x * y
-
-    def divide_2(self, x, y):
-        return x / y
-
-    def power(self, x, y):
-        return x**y
-
-    def add_3(self, x, y, z):
-        return x + y + z
-
-    def minus_3(self, x, y, z):
-        return x - y - z
-
-    def multiply_3(self, x, y, z):
-        return x * y * z
-
-    def div_3(self, x, y, z):
-        return x / y / z
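
# The methods above all broadcast element-wise over NumPy arrays and pandas
# Series, which is what lets feature_engineer fuse whole dataframe columns
# without explicit loops. A quick illustration, assuming the class above is
# importable:
import pandas as pd

ops = operations()
x = pd.Series([1.0, 2.0, 3.0])
y = pd.Series([4.0, 5.0, 6.0])
print(ops.power_3(x).tolist())      # [1.0, 8.0, 27.0]
print(ops.divide_2(x, y).tolist())  # [0.25, 0.4, 0.5]
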
-
-
-class feature_engineer(operations):
-
-    def __init__(self):
-
-        self.utils = utilities()
-
-        # collection of operations to apply to make first level features
-        self.transformations = {'log': np.log, 'sqrt': np.sqrt,
-                                'sqr': np.square, 'exp': np.exp,
-                                'power_3': self.power_3,
-                                'mult2': self.multiply2,
-                                'mult3': self.multiply3,
-                                'mult10': self.multiply10,
-                                'div2': self.div2, 'div10': self.div10,
-                                '1div': self._1div}
-        # collection of operations to apply to make second level features
-        self.fusions2 = {'plus': self.add_2, 'minus': self.subtract_2,
-                         'times': self.multiply_2, 'dividedby': self.divide_2,
-                         'power': self.power}
-        # collection of operations to apply to make third level features
-        self.fusions3 = {'plus': self.add_3, 'times': self.multiply_3,
-                         'minus': self.minus_3, 'dividedby': self.div_3}
-        # operations that are commutative e.g. a + b = b + a -> redundant
-        self.commutative = ('plus', 'times')
-
-    def drop_no_information_cols(self, df: pd.DataFrame,
-                                 nan_threshold: int = 300) -> pd.DataFrame:
-        '''function removes columns from dataframe without information or too
-        many NAN'''
-        id_0 = np.where(df.sum(axis=0).values == 0)[0]
-        df.drop(columns=df.columns[id_0], inplace=True)
-        id_nan = np.where(df.isna().sum().values > nan_threshold)[0]
-        df.drop(columns=df.columns[id_nan], inplace=True)
-        return df
-
-    def gen_3rd_level_structure(self, features, operations, batch_size):
-        n_features = tuple(range(len(features)))
-        n_operations = tuple(range(len(operations)))
-
-        tot_combs = len(features)*len(features)*len(features)*len(operations)
-        print(f'{round(tot_combs / 1_000_000_000, 3)} billion combinations')
-        print(f'{int(tot_combs/batch_size)} files will be saved')
-
-        counter = 0
-        i_s, j_s, k_s, l_s = [], [], [], []
-        while counter < batch_size:
-            for i in n_features:
-                for j in n_features:
-                    for k in n_features:
-                        for l in n_operations:
-                            i_s.append(i)
-                            j_s.append(j)
-                            k_s.append(k)
-                            l_s.append(l)
-                            counter += 1
-                            if counter % batch_size == 0 or counter == tot_combs-1:
-                                df = pd.DataFrame({'feature i': np.array(i_s).astype(np.int16),
-                                                   'feature j': np.array(j_s).astype(np.int16),
-                                                   'feature k': np.array(k_s).astype(np.int16),
-                                                   'operation': np.array(l_s).astype(np.int8)})
-                                df.to_parquet(fr'../features/{counter}.gzip',
-                                              index=False)
-                                i_s.clear()
-                                j_s.clear()
-                                k_s.clear()
-                                l_s.clear()
-
-    def assess_3rd_level_features(self, filename, df, features, target,
-                                  savepath):
-        df_indices = pd.read_parquet(fr'../features/{filename}.gzip')
-        operations = list(self.fusions3.keys())
-
-        scores = []
-        for i in tqdm(range(len(df_indices))):
-            x, y, z, f = df_indices.iloc[i]
-            x, y, z, f = features[x], features[y], features[z], operations[f]
-            new_feature = self.fusions3[f](df[x], df[y], df[z]).values
-
-            new_feature = self.utils.convert_inf(new_feature)
-            if np.isnan(new_feature).sum() > 0:
-                # pass if data contains nan
-                score = np.nan
-            else:
-                if target == 'struct':
-                    score = self.utils.assess_fit2(
-                        df['structural complexity'].values, y=new_feature,
-                        scale_indiv=True)
-                elif target == 'mink':
-                    score = self.utils.assess_fit2(
-                        df['Minkowski'].values, y=new_feature,
-                        scale_indiv=True)
-                elif target == 'Jv':
-                    score = self.utils.assess_fit2(
-                        df['Jv measured [discs/m³]'].values, y=new_feature,
-                        scale_indiv=False)
-                elif target == 'P32':
-                    score = self.utils.assess_fit2(
-                        df['P32'].values, y=new_feature,
-                        scale_indiv=False)
-                else:
-                    raise ValueError(f'{target} as target not available!')
-            scores.append(score)
-        df_indices['scores'] = np.array(scores).astype(np.float32)
-        # only save results with a score > R2 = 0.5 to save space
-        df_indices = df_indices[df_indices['scores'] > 0.5]
-        df_indices.to_parquet(savepath, index=False)
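
# assess_fit2 lives in X_library, which this patch does not touch. For
# readers without that module, a plausible stand-in is a plain coefficient
# of determination, consistent with the "score > R2 = 0.5" filter above;
# the scale_indiv handling here is an assumption, not the confirmed
# implementation:
import numpy as np

def assess_fit2_stub(x: np.ndarray, y: np.ndarray,
                     scale_indiv: bool = False) -> float:
    if scale_indiv:
        # min-max scale both series to [0, 1] individually
        x = (x - x.min()) / (x.max() - x.min())
        y = (y - y.min()) / (y.max() - y.min())
    ss_res = np.sum((x - y)**2)         # residuals of the candidate feature
    ss_tot = np.sum((x - x.mean())**2)  # total variance of the target
    return 1 - ss_res / ss_tot
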
-
-    def make_1st_level_features(self, df, features: list = None,
-                                operations: list = None,
-                                drop_empty: bool = False) -> pd.DataFrame:
-        '''function that computes new features based on single features only.
-        Unless specified the function computes new features for all columns of
-        the dataframe and also uses all possible operations.'''
-        if features is None:
-            features = df.columns
-        if operations is None:
-            operations = self.transformations.keys()
-
-        new_headers = []
-        new_cols = []
-        for f in features:
-            for t in operations:
-                new_headers.append(f'{t}_{f}-l1')
-                new_cols.append(self.transformations[t](df[f]))
-        df_temp = pd.DataFrame(columns=new_headers,
-                               data=np.array(new_cols).T,
-                               index=df.index)
-        df = pd.concat([df, df_temp], axis=1)
-
-        if drop_empty is True:
-            df = self.drop_no_information_cols(df)
-        print('level 1 features computed', len(df.columns))
-        return df
-
-    def make_2nd_level_features(self, df, features: list = None,
-                                operations: list = None,
-                                drop_empty: bool = False) -> pd.DataFrame:
-        '''function that computes new features based on all unique combinations
-        of 2 features in the dataframe.
-        Unless specified the function computes new features for all columns of
-        the dataframe and also uses all possible operations.'''
-        if features is None:
-            features = df.columns
-        if operations is None:
-            operations = self.fusions2.keys()
-
-        new_headers = []
-        new_cols = []
-        for i, x in enumerate(features):
-            for j, y in enumerate(features):
-                if i == j:  # avoid duplicates
-                    pass
-                else:
-                    for f in operations:
-                        if f in self.commutative and j > i:
-                            # avoid making duplicate features due to
-                            # commutative operations
-                            pass
-                        else:
-                            new_headers.append(f'{x}_{f}_{y}-l2')
-                            new_cols.append(self.fusions2[f](df[x], df[y]))
-        # check if duplicates were generated
-        if len(new_headers) != len(set(new_headers)):
-            raise ValueError('duplicate features generated in second level')
-        else:
-            df_temp = pd.DataFrame(columns=new_headers,
-                                   data=np.array(new_cols).T,
-                                   index=df.index)
-            df = pd.concat([df, df_temp], axis=1)
-
-        if drop_empty is True:
-            df = self.drop_no_information_cols(df)
-        print('level 2 features computed', len(df.columns))
-        return df
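
# How many level 2 features the method above generates from n inputs,
# before drop_no_information_cols prunes them: the two commutative
# operations (plus, times) run over unordered pairs only, the other three
# (minus, dividedby, power) over all ordered pairs.
n = 20
commutative = 2 * (n * (n - 1) // 2)
ordered = 3 * n * (n - 1)
print(commutative + ordered)  # 1520, i.e. 4 * n * (n - 1) in total
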
-
-    def get_top_n_scores_in_files(self, n, file_paths):
-        '''function iterates files with scores and only saves the top n scores
-        and combinations for further analyses.
-        original function made by ChatGPT'''
-        top_scores = []
-        for file_path in file_paths:
-            df = pd.read_parquet(file_path)
-            for idx, row in df.iterrows():
-                score = row['scores']
-                if len(top_scores) < n:
-                    top_scores.append((score, row))
-                    top_scores.sort(reverse=True, key=lambda x: x[0])
-                elif score > top_scores[-1][0]:
-                    top_scores[-1] = (score, row)
-                    top_scores.sort(reverse=True, key=lambda x: x[0])
-        return [x[1] for x in top_scores]
-
-    def decode_combination(self, combination: pd.Series, all_features: list):
-        f1 = all_features[int(combination['feature i'])]
-        f2 = all_features[int(combination['feature j'])]
-        f3 = all_features[int(combination['feature k'])]
-        operation = list(self.fusions3.keys())[int(combination['operation'])]
-        comb_string = f'{f1} {operation} {f2} {operation} {f3}'
-        return comb_string
-
-
-# example usage of code
-if __name__ == '__main__':
-
-    # create dummy data
-    df = pd.DataFrame({'x': np.random.uniform(0, 1, 100),
-                       'y': np.random.uniform(0, 1, 100),
-                       'z': np.random.uniform(0, 1, 100)})
-
-    # instantiate feature engineer
-    fe = feature_engineer()
-    # create first level features
-    df = fe.make_1st_level_features(df, features=['x', 'y'],
-                                    operations=['log', 'sqrt'])
-    # create second level features of all previous ones
-    df = fe.make_2nd_level_features(df)
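
Since get_top_n_scores_in_files re-sorts its running list after every accepted row, a fixed-size heap performs the same selection with O(log n) work per row instead. A sketch of that alternative, assuming the same parquet layout with a 'scores' column; the counter is a tiebreaker so equal scores never force a comparison between pandas rows:

import heapq
import itertools
import pandas as pd

def get_top_n_scores_heap(n, file_paths):
    heap, counter = [], itertools.count()
    for file_path in file_paths:
        df = pd.read_parquet(file_path)
        for _, row in df.iterrows():
            item = (row['scores'], next(counter), row)
            if len(heap) < n:
                heapq.heappush(heap, item)     # fill the heap up to n items
            elif item[0] > heap[0][0]:
                heapq.heapreplace(heap, item)  # evict the current minimum
    # sort descending by score for the same output ordering as the original
    return [row for _, _, row in sorted(heap, reverse=True)]

For fully vectorized code, pandas' DataFrame.nlargest(n, 'scores') applied per file, with the concatenated result reduced once more, would avoid iterrows() entirely.
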