In [1]:
import logging
import argparse
import sys
import json
import os
import pandas as pd 
import time
import numpy as np

from bigml.api import BigML
from datetime import datetime

from shapsplain.forest import ShapForest

# HTTPS WARNINGS workaround https://github.com/influxdata/influxdb-python/issues/240
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
#### INIT CONFIG ################################################################################
def init_config(json_file_path):
     """Initializes environment variables from given JSON file into a dictionnary"""
     with open(json_file_path, "r") as f:
      config_dict = json.load(f)
     return config_dict


In [3]:
#### INIT LOGGER ################################################################################
def init_logger(log_level):
     """Initializes log structure. """
     LOGGER = logging.getLogger()
     LOGGER.setLevel(log_level)
     ch = logging.StreamHandler(sys.stdout)
     # ch.setLevel('INFO')
     formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
     ch.setFormatter(formatter)
     LOGGER.addHandler(ch)
     return LOGGER

In [4]:
#### INIT PARAMS ################################################################################
def init_params(json_file_path):
     """Initializes parameter variables from given JSON file into a dictionnary"""
     with open(json_file_path, "r") as f:
      param_dict = json.load(f)
     return param_dict

In [5]:
#### EXECUTE WHIZZML ################################################################################
def execute_whizzml(whizzml_script_id, script_inputs, api, log):
     """Executes whizzml script and returns results """
     log.info("Executing WhizzML script %s" % whizzml_script_id)
     log.info("Script inputs: %s" % script_inputs)     
     # execute WhizzML script to generate ensemble and evaluation
     execution = api.create_execution(whizzml_script_id, script_inputs)
     if not api.ok(execution,wait_time=60):
        log.error("WhizzML execution error %s" % execution["resource"])
        sys.exit("WhizzML execution could not be performed")
     log.info("WhizzML execution ended %s" % execution["resource"])
     # if error found, raise and exit
     if execution["object"]["status"]["code"] == -1:
        log.error("WhizzML execution error: %s. Execution id: %s" % (execution["object"]["status"]["message"],execution["object"]["resource"]))
        sys.exit("Error found while executing WhizzML script")
     return execution["object"]["execution"]["result"]

In [6]:
#### CREATE SOURCE ################################################################################
def create_source(source_path, api, log, args=None):
    """Creates a source."""
    log.info("Creating source from file %s" % source_path)
    # check if file exists
    if not os.path.exists(source_path):
        log.error("Provided file does not exist %s" % source_path)
        sys.exit("Source file not found")
    if args is None:
        args = {}
    source = api.create_source(source_path, args)
    if not api.ok(source):
        log.error("Could not create source %s from file %s" % (source["resource"],source_path))
        sys.exit("Source couldn't be created or retrieved.")
    log.info("Source created successfuly %s" % source["resource"])
    return source

In [7]:
###  CREATE TRAIN DATASET #############################################################
def create_joined_dataset(input_file, repairs_dataset, log, api):
        # create training source
        source = create_source(input_file, api, log)
        # create training dataset
        api.ok(source)
        ds = api.create_dataset(source)
        # join datasets
        api.ok(ds)
        ds_flags = api.create_dataset(
                             [{
                         
                                 "id": ds["resource"],
                                 "name": "A"
                              },
                              {
                                 "id": repairs_dataset,
                                 "name": "B"
                              }
                             ],
                             {
                                 "sql_query": "SELECT `A`.*, `B`.`tse`, `B`.`score`, `B`.`body_repair`, `B`.`assembly_repair`, `B`.`repaired`, `B`.`alert` FROM `A` LEFT JOIN `B` ON `A`.`fingerprint` = `B`.`fingerprint`"
                             }
                         )
        log.info("Dataset ready %s" % ds_flags)
        return ds_flags

In [8]:
json_config_file = '/Users/guillem/Projects/feature-engineering-utils/anomaly-importances-util/cfg/importances-test-config.json'
json_params_file = '/Users/guillem/Projects/feature-engineering-utils/anomaly-importances-util/cfg/importances-test-params.json'

In [9]:
config_dict = init_config(json_config_file)
params_dict = init_params(json_params_file)

In [10]:
log = init_logger(config_dict["log_level"])
all_input_features=params_dict["input-features"]
importances_whizzml_script_id = config_dict["extract_importances_whizzml_id"]

tse = params_dict["tse-files-list"][1]

In [11]:
api = BigML(config_dict["bigml_username"],config_dict["bigml_apikey"],project=config_dict["bigml_project"],domain=config_dict["bigml_domain"])
     
# define sources paths
#training_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_train.csv'
#test_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_test.csv'
#repairs_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/repair_flags_all_features.csv'

training_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_2_1_1_train.csv'
test_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_2_1_1_test.csv'
repairs_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/repair_flags.csv'


# create sources
train_source = create_source(training_file_path, api, log)
test_source = create_source(test_file_path, api, log)
repairs_source = create_source(repairs_file_path, api, log)

2021-03-04 10:42:40,687 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_2_1_1_train.csv
2021-03-04 10:43:05,702 - root - INFO - Source created successfuly source/6040ab9979b77d6ecf0019b9
2021-03-04 10:43:05,703 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_2_1_1_test.csv
2021-03-04 10:43:24,189 - root - INFO - Source created successfuly source/6040abaf79b77d6ecf0019bd
2021-03-04 10:43:24,190 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/repair_flags.csv
2021-03-04 10:44:10,105 - root - INFO - Source created successfuly source/6040abe779b77d6eaa000e72


In [12]:
repairs_dataset = api.create_dataset(repairs_source)
log.info("Repairs dataset created %s" % repairs_dataset)

2021-03-04 10:44:40,580 - root - INFO - Repairs dataset created {'code': 201, 'resource': 'dataset/6040ac0879b77d6ecf0019c9', 'location': 'http://apian-sisyphus.bigml.local/andromeda/dataset/6040ac0879b77d6ecf0019c9', 'object': {'all_fields': True, 'category': 0, 'cluster': None, 'cluster_status': False, 'code': 201, 'columns': 0, 'configuration': None, 'configuration_status': False, 'correlations': {}, 'created': '2021-03-04T09:44:40.430856', 'creator': 'bigml09', 'credits': 44.94744396209717, 'dataset_origin_status': True, 'description': '', 'download': {'code': 0, 'excluded_input_fields': [], 'header': True, 'input_fields': [], 'message': '', 'preview': [], 'separator': ','}, 'evaluation': None, 'excluded_fields': [], 'field_types': {'categorical': 0, 'datetime': 0, 'items': 0, 'numeric': 0, 'preferred': 0, 'text': 0, 'total': 0}, 'fields_meta': {'count': 0, 'limit': 1000, 'offset': 0, 'total': 0}, 'input_fields': [], 'juxtapose': False, 'locale': 'en-US', 'missing_numeric_rows': 0,

In [13]:
log.info("Train dataset treatment...")
train_ds = create_joined_dataset(training_file_path, repairs_dataset["resource"], log, api)
log.info("Test dataset treatment...")
test_ds = create_joined_dataset(test_file_path, repairs_dataset["resource"], log, api)

2021-03-04 10:44:45,031 - root - INFO - Train dataset treatment...
2021-03-04 10:44:45,032 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_2_1_1_train.csv
2021-03-04 10:45:02,263 - root - INFO - Source created successfuly source/6040ac1479b77d6eb4001611
2021-03-04 10:45:15,995 - root - INFO - Dataset ready {'code': 201, 'resource': 'dataset/6040ac2b79b77d6ea90008bb', 'location': 'http://apian-sisyphus.bigml.local/andromeda/dataset/6040ac2b79b77d6ea90008bb', 'object': {'all_fields': True, 'category': 0, 'cluster': None, 'cluster_status': False, 'code': 201, 'columns': 0, 'configuration': None, 'configuration_status': False, 'correlations': {}, 'created': '2021-03-04T09:45:15.807215', 'creator': 'bigml09', 'credits': 55.71978187561035, 'dataset_origin_status': True, 'description': '', 'download': {'code': 0, 'excluded_input_fields': [], 'header': Tru

In [14]:
print(all_input_features)

['CleanTimeActual', 'DropTimeActual', 'LMLiftHeightActual', 'PilotVoltageActual', 'StickoutActual', 'WeldCurrentActualPositive', 'WeldEnergyActual', 'WeldTimeActual', 'WeldVoltageActual']


In [15]:
print(train_ds['resource'])

dataset/6040ac2b79b77d6ea90008bb


In [16]:
# get all features anomaly detector
api.ok(train_ds)
anomaly_all_features = api.create_anomaly(train_ds, {"input_fields": all_input_features})
api.ok(anomaly_all_features)
log.info("All features anomaly detector ready %s" % anomaly_all_features["resource"])

2021-03-04 10:48:24,059 - root - INFO - All features anomaly detector ready anomaly/6040acbd79b77d6ecf0019d8


In [17]:
train_export_file_path = config_dict["tmp_datasets_directory"] + "/" + tse["name"] + "_train_dataset.csv"
api.download_dataset(train_ds,train_export_file_path)
log.info("Train dataset downloaded %s" % train_export_file_path)
train_df = pd.read_csv(train_export_file_path)

2021-03-04 10:53:41,152 - root - INFO - Train dataset downloaded /Users/guillem/Data/Customers/Daimler/anomalies-analysis/tmp_datasets/TSE2_train_dataset.csv


In [18]:
train_df.head()

Unnamed: 0,location,series,line,plant,station,robot,controller,tool,StudID,MeasurementData.MeasurementParameter.StudID.value,...,MeasurementData.MeasurementParameter.ProtectiveGasFlowDuringWeldprocessActive.value,MeasurementData.MeasurementParameter.ProtectiveGasPostFlowActive.value,MeasurementData.MeasurementParameter.ProtectiveGasPreFlowActive.value,MeasurementData.MeasurementParameter.WeldProcess.value,tse,score,body_repair,assembly_repair,repaired,alert
0,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.33642,f,f,f,f
1,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.56865,f,f,f,f
2,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.38022,f,f,f,f
3,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.35821,f,f,f,f
4,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.33855,f,f,f,f


In [19]:
# init shap forest
forest = ShapForest(anomaly_all_features)

In [47]:
print(all_input_features)

['CleanTimeActual', 'DropTimeActual', 'LMLiftHeightActual', 'PilotVoltageActual', 'StickoutActual', 'WeldCurrentActualPositive', 'WeldEnergyActual', 'WeldTimeActual', 'WeldVoltageActual']


In [48]:
forest.predict({'CleanTimeActual': 20, 
                'DropTimeActual': 10, 
                'LMLiftHeightActual': 3, 
                'PilotVoltageActual': 0,
                'StickoutActual': 3.7,
                'WeldCurrentActualPositive': 750,
                'WeldEnergyActual': 1682,
                'WeldTimeActual': 105}, explanation=True)

# repairs keep only positive importances
# normal welds use all positive and negative importances

[[0.7729790942082068,
  ['000035', 0.26335720269947427],
  ['000030', 0.10835482670643148],
  ['000031', 0.02234214514423616],
  ['000032', 0.0228046459849216],
  ['000016', -0.010987886961260185],
  ['000017', -0.008738472836425704],
  ['000033', -0.001962886246872686],
  ['000034', -0.0017075335610332587],
  ['000015', -1.5324461875132478e-05]]]

In [49]:
print(train_ds)

{'code': 200, 'resource': 'dataset/6040ac2b79b77d6ea90008bb', 'location': 'https://daimler.dev.bigml.com/io/andromeda/dataset/6040ac2b79b77d6ea90008bb', 'object': {'all_fields': True, 'category': 0, 'cluster': None, 'cluster_status': False, 'code': 200, 'columns': 112, 'configuration': None, 'configuration_status': False, 'correlations': {}, 'created': '2021-03-04T09:45:15.807000', 'creator': 'bigml09', 'credits': 55.71978187561035, 'dataset_origin_status': True, 'description': '', 'download': {'code': 0, 'excluded_input_fields': [], 'header': True, 'input_fields': [], 'message': '', 'preview': [], 'separator': ','}, 'evaluation': None, 'excluded_fields': [], 'field_types': {'categorical': 29, 'datetime': 0, 'items': 0, 'numeric': 80, 'preferred': 78, 'text': 3, 'total': 112}, 'fields': {'000000': {'column_number': 0, 'datatype': 'int8', 'name': 'location', 'optype': 'numeric', 'order': 0, 'preferred': False, 'provenance': 'query', 'summary': {'counts': [[50, 10000]], 'exact_histogram'

In [39]:
# build train candidates dataframe to make explained predictions:

train_df['score_rank']= train_df['score'].rank(method='max',ascending=False)

# train candidates are repairs and highly scored normal welds >= 100
candidates_train_df = train_df[(train_df['repaired'] == 't') | (train_df['score_rank'] <= 100)]

print(candidates_train_df.shape)

(100, 113)


In [68]:
candidates_train_df = candidates_train_df.reset_index(drop=True)
candidates_train_df.head()

Unnamed: 0,location,series,line,plant,station,robot,controller,tool,StudID,MeasurementData.MeasurementParameter.StudID.value,...,MeasurementData.MeasurementParameter.ProtectiveGasPostFlowActive.value,MeasurementData.MeasurementParameter.ProtectiveGasPreFlowActive.value,MeasurementData.MeasurementParameter.WeldProcess.value,tse,score,body_repair,assembly_repair,repaired,alert,score_rank
0,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.56865,f,f,f,f,57.0
1,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.57879,f,f,f,f,41.0
2,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.61145,f,f,f,f,13.0
3,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.55635,f,f,f,f,95.0
4,50,213,Z1,UB63,130,100,101,1.1,620556,620556_213_1_1_1_2_1_1,...,1,1,6,050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1...,0.57889,f,f,f,f,40.0


In [43]:
#calculate_importances(imp_anomaly_detector_id, train_ds,  params_dict, api, log)
#anomaly_all_features 


imp_df = pd.DataFrame()

# loop over train dataframe and make/store explained predictions
for index, row in candidates_train_df.iterrows():
    print("treating row %s" % index)

    # build current predictions dictionnary looping over features list
    input_values = {} 
    for i in all_input_features: 
        # build input_values dynamically
        log.debug("Adding %s information to input_values, value: %s" % (i,row[i]))
        input_values[i] = row[i]
        

    # append score and importances in both lists
    log.debug("Making prediction for row: %s" % row["fingerprint"])
    current_pred_res = forest.predict(input_values, explanation=True)

    cur_importances = current_pred_res[0][1:]
    
    # build current prediction dictionnary
    cur_prediction = {}
    cur_prediction['score'] = current_pred_res[0][0]
    # add importances keys in loop
    for imp in cur_importances:
        # each imp is an importance with format ['000031', 0.03384744428211511] field + importance value
        cur_prediction[train_ds["object"]["fields"][imp[0]]["name"]+"_importance"]=[imp[1]]
    
    col_list = list(cur_prediction.keys())
    cur_df = pd.DataFrame(cur_prediction)
    imp_df = imp_df.append(cur_df, sort = True)


treating row 1
treating row 121
treating row 509
treating row 656
treating row 696
treating row 738
treating row 956
treating row 973
treating row 1002
treating row 1059
treating row 1127
treating row 1217
treating row 1241
treating row 1363
treating row 1364
treating row 1534
treating row 1662
treating row 1673
treating row 1823
treating row 1969
treating row 2234
treating row 2324
treating row 2349
treating row 2443
treating row 2574
treating row 2595
treating row 2807
treating row 2945
treating row 2983
treating row 3295
treating row 3412
treating row 3436
treating row 3733
treating row 3751
treating row 4034
treating row 4053
treating row 4184
treating row 4186
treating row 4207
treating row 4218
treating row 4263
treating row 4438
treating row 4467
treating row 4510
treating row 4537
treating row 4588
treating row 4639
treating row 4659
treating row 4665
treating row 4803
treating row 4874
treating row 5020
treating row 5072
treating row 5256
treating row 5318
treating row 5483
tr

In [71]:
imp_df = imp_df.rename(columns={'score': 'imp_score'})
imp_df = imp_df.reset_index(drop=True)
imp_df['fingerprint'] = candidates_train_df['fingerprint']
imp_df['repaired'] = candidates_train_df['repaired']
print(imp_df)

    CleanTimeActual_importance  DropTimeActual_importance  \
0                     0.000014                   0.013461   
1                     0.000015                   0.023540   
2                     0.000011                   0.013020   
3                     0.000015                   0.012796   
4                     0.000016                   0.011742   
..                         ...                        ...   
95                    0.000015                   0.010052   
96                    0.000016                   0.012781   
97                    0.000015                   0.012210   
98                    0.000013                   0.012851   
99                    0.000015                   0.011815   

    LMLiftHeightActual_importance  PilotVoltageActual_importance  \
0                        0.009605                       0.009049   
1                        0.058255                       0.010326   
2                        0.008552                       0.00664

In [73]:
#### BUILD_IMPORTANCES_DATAFRAMES #################################################################
def build_importances_dataframes(imp_df, log):
    log.info("Starting to build importances dataframe...")

    # init empty arrays as temporary variables in further loop
    field_names=[]
    importances_means=[]
    importances_medians=[]
    importances_maxes=[]
    importances_mins=[]
    importances_mean_diffs=[]
    importances_median_diffs=[]
    
    # loop over fields, retrieve name and stats + build dictionnary for further dataframe
    for column in imp_df:
        # for importances fields only:
        if 'importance' in column:
            # if importance is greater than 0
            if imp_df[imp_df.repaired == 't'][column].mean() > 0:
                log.debug("Adding stats for %s ..." % column)

                field_names.append(column)
                # mean
                repair_imp_field_mean = imp_df[imp_df.repaired == 't'][column].mean()
                importances_means.append(repair_imp_field_mean)
                # median
                repair_imp_field_median = imp_df[imp_df.repaired == 't'][column].median()
                importances_medians.append(repair_imp_field_median)
                #max + min
                importances_maxes.append(imp_df[imp_df.repaired == 't'][column].max())
                importances_mins.append(imp_df[imp_df.repaired == 't'][column].min())
                # mean diff
                importances_mean_diffs.append(repair_imp_field_mean - imp_df[imp_df.repaired == 'f'][column].mean())
                # median diff
                importances_median_diffs.append(repair_imp_field_median - imp_df[imp_df.repaired == 'f'][column].median())
                
    
    log.info("Building dataframe...")

    data = {'field_names': [sub.replace(' importance','') for sub in field_names],  # removes importance string from current field name
            'imp_means': importances_means,
            'imp_medians': importances_medians,
            'imp_maxes': importances_maxes,
            'imp_mins': importances_mins,
            'imp_mean_diffs': importances_mean_diffs,
            'imp_median_diffs': importances_median_diffs}
    
    importances_df = pd.DataFrame(data, columns = ['field_names','imp_means','imp_medians','imp_maxes','imp_mins','imp_mean_diffs','imp_median_diffs'])
                
    return(importances_df)

In [74]:
importances_df = build_importances_dataframes(imp_df, log)
importances_df.head()

2021-03-08 12:03:46,792 - root - INFO - Starting to build importances dataframe...
2021-03-08 12:03:46,872 - root - INFO - Building dataframe...


Unnamed: 0,field_names,imp_means,imp_medians,imp_maxes,imp_mins,imp_mean_diffs,imp_median_diffs
0,CleanTimeActual_importance,0.065596,0.065596,0.131177,1.4e-05,0.065581,0.065581
1,DropTimeActual_importance,0.007216,0.007216,0.007773,0.00666,-0.008342,-0.005297
2,LMLiftHeightActual_importance,0.005705,0.005705,0.006174,0.005236,-0.037263,-0.003784
3,PilotVoltageActual_importance,0.064808,0.064808,0.068925,0.06069,0.046425,0.052841
4,StickoutActual_importance,0.058464,0.058464,0.063883,0.053045,0.012114,0.037516


In [14]:
print("Loading repairs importances dataset...")
repairs_importances_dataset = api.get_dataset(repairs_importances_dataset_id)
print("Loading normal importances dataset...")
normal_importances_dataset = api.get_dataset(normal_importances_dataset_id)

Loading repairs importances dataset...
Loading normal importances dataset...


In [15]:
print(repairs_importances_dataset)

{'code': 200, 'resource': 'dataset/6018403179b77d363b00e5fe', 'location': 'https://daimler.dev.bigml.com/io/andromeda/dataset/6018403179b77d363b00e5fe', 'object': {'all_fields': True, 'category': 0, 'cluster': None, 'cluster_status': False, 'code': 200, 'columns': 185, 'configuration': None, 'configuration_status': False, 'correlations': {}, 'created': '2021-02-01T17:53:53.949000', 'creator': 'bigml09', 'credits': 1.4428863525390625, 'dataset_origin_status': True, 'description': '', 'download': {'code': 0, 'excluded_input_fields': [], 'header': True, 'input_fields': [], 'message': '', 'preview': [], 'separator': ','}, 'evaluation': None, 'excluded_fields': [], 'execution_id': '6018401079b77d364700fb2d', 'execution_status': True, 'field_types': {'categorical': 17, 'datetime': 0, 'items': 0, 'numeric': 164, 'preferred': 125, 'text': 4, 'total': 185}, 'fields': {'000000': {'column_number': 0, 'datatype': 'int8', 'name': 'location', 'optype': 'numeric', 'order': 0, 'preferred': False, 'pro

In [36]:
import pandas as pd

## Importances dataframes

In [41]:
# init empty arrays as temporary variables in further loop
field_names=[]
importances_means=[]
importances_medians=[]
importances_maxes=[]
importances_mean_diffs=[]
importances_median_diffs=[]


# loop over fields, retrieve name and stats + build dictionnary for further dataframe
for field_id in repairs_importances_dataset["object"]["fields"]:
    field_name = repairs_importances_dataset["object"]["fields"][field_id]["name"]
    # for importances fields only:
    if 'importance' in field_name:
        # if importance is greater than 0
        if repairs_importances_dataset["object"]["fields"][field_id]["summary"]["mean"] > 0:
            field_names.append(field_name)
            # mean
            repair_imp_field_mean = repairs_importances_dataset["object"]["fields"][field_id]["summary"]["mean"]
            importances_means.append(repair_imp_field_mean)
            # median
            repair_imp_field_median = repairs_importances_dataset["object"]["fields"][field_id]["summary"]["median"]
            importances_medians.append(repair_imp_field_median)
            #max
            importances_maxes.append(repairs_importances_dataset["object"]["fields"][field_id]["summary"]["maximum"])
            # mean diff
            importances_mean_diffs.append(repair_imp_field_mean - normal_importances_dataset["object"]["fields"][field_id]["summary"]["mean"])
            # median diff
            importances_median_diffs.append(repair_imp_field_median - normal_importances_dataset["object"]["fields"][field_id]["summary"]["median"])
            

data = {'field_names': field_names,
        'imp_means': importances_means,
        'imp_medians': importances_medians,
        'imp_maxes': importances_maxes,
        'imp_mean_diffs': importances_mean_diffs,
        'imp_median_diffs': importances_median_diffs}


rep_importances_df = pd.DataFrame (data, columns = ['field_names','imp_means','imp_medians','imp_maxes','imp_mean_diffs','imp_median_diffs'])
            
print(rep_importances_df)

                                          field_names  imp_means  imp_medians  \
0   MeasurementData.MeasurementParameter.LMPositio...    0.00883      0.00853   
1   MeasurementData.MeasurementParameter.PilotCurr...    0.00007      0.00007   
2   MeasurementData.MeasurementParameter.WeldToolM...    0.02570      0.02337   
3   MeasurementData.MeasurementParameter.WeldToolS...    0.01790      0.01574   
4   MeasurementData.MeasurementParameter.WeldToolS...    0.02470      0.02355   
5   MeasurementData.MeasurementParameter.WeldToolS...    0.03192      0.02897   
6                       PilotVoltageActual importance    0.01123      0.01184   
7                           StickoutActual importance    0.00881      0.00935   
8                         WeldEnergyActual importance    0.00936      0.00958   
9                           WeldTimeActual importance    0.01439      0.01036   
10                       WeldVoltageActual importance    0.01360      0.01310   
11                      LMLi

In [43]:
print(rep_importances_df.head())

                                         field_names  imp_means  imp_medians  \
0  MeasurementData.MeasurementParameter.LMPositio...    0.00883      0.00853   
1  MeasurementData.MeasurementParameter.PilotCurr...    0.00007      0.00007   
2  MeasurementData.MeasurementParameter.WeldToolM...    0.02570      0.02337   
3  MeasurementData.MeasurementParameter.WeldToolS...    0.01790      0.01574   
4  MeasurementData.MeasurementParameter.WeldToolS...    0.02470      0.02355   

   imp_maxes  imp_mean_diffs  imp_median_diffs  
0    0.01086         0.00002           0.00145  
1    0.00008        -0.00003           0.00001  
2    0.03829        -0.00332          -0.00276  
3    0.02422         0.00127           0.00092  
4    0.02993         0.00186           0.00398  
