In [1]:
import logging
import argparse
import sys
import json
import os
import pandas as pd 
import time
import numpy as np

from bigml.api import BigML
from datetime import datetime

# HTTPS WARNINGS workaround https://github.com/influxdata/influxdb-python/issues/240
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
#### INIT CONFIG ################################################################################
def init_config(json_file_path):
    """Load the environment configuration stored in a JSON file.

    Args:
        json_file_path: Path of the JSON configuration file to read.

    Returns:
        The decoded JSON document (a dictionary for an object-shaped file).
    """
    with open(json_file_path, "r") as config_file:
        return json.load(config_file)


In [3]:
#### INIT LOGGER ################################################################################
def init_logger(log_level):
    """Configure the root logger to write formatted records to stdout.

    Safe to call repeatedly (for example when the notebook cell is re-run):
    the stdout handler is attached only once, so records are not printed
    several times. The level is updated on every call.

    Args:
        log_level: Level name or number accepted by ``Logger.setLevel``.

    Returns:
        The configured root logger.
    """
    LOGGER = logging.getLogger()
    LOGGER.setLevel(log_level)

    # The handler installed by this function is tagged so that later calls can
    # recognise it instead of stacking a new StreamHandler each time.
    already_attached = any(
        getattr(handler, "_importances_stdout_handler", False)
        for handler in LOGGER.handlers
    )
    if not already_attached:
        ch = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        ch._importances_stdout_handler = True
        LOGGER.addHandler(ch)

    return LOGGER

In [4]:
#### INIT PARAMS ################################################################################
def init_params(json_file_path):
    """Read the run parameters from a JSON file.

    Args:
        json_file_path: Path of the JSON parameters file to read.

    Returns:
        The decoded JSON document (a dictionary for an object-shaped file).
    """
    with open(json_file_path, "r") as params_file:
        loaded_params = json.load(params_file)
    return loaded_params

In [5]:
#### EXECUTE WHIZZML ################################################################################
def execute_whizzml(whizzml_script_id, script_inputs, api, log):
    """Run a WhizzML script and return the result of the execution.

    Args:
        whizzml_script_id: Identifier of the script to execute.
        script_inputs: Inputs passed unchanged to ``api.create_execution``.
        api: Client exposing ``create_execution`` and ``ok``.
        log: Logger exposing ``info`` and ``error``.

    Returns:
        The value stored under ``["object"]["execution"]["result"]`` of the
        execution resource.

    Raises:
        SystemExit: When ``api.ok`` reports a failure or the execution status
            code is -1.
    """
    log.info("Executing WhizzML script %s" % whizzml_script_id)
    log.info("Script inputs: %s" % script_inputs)

    # Launch the script, then ask the client whether it finished properly.
    execution = api.create_execution(whizzml_script_id, script_inputs)
    finished_ok = api.ok(execution, wait_time=60)
    if not finished_ok:
        log.error("WhizzML execution error %s" % execution["resource"])
        sys.exit("WhizzML execution could not be performed")

    log.info("WhizzML execution ended %s" % execution["resource"])

    # A status code of -1 is treated as a failed execution.
    execution_info = execution["object"]
    status = execution_info["status"]
    if status["code"] == -1:
        log.error("WhizzML execution error: %s. Execution id: %s" % (status["message"], execution_info["resource"]))
        sys.exit("Error found while executing WhizzML script")

    return execution_info["execution"]["result"]

In [6]:
#### CREATE SOURCE ################################################################################
def create_source(source_path, api, log, args=None):
    """Create a source from a local file.

    Args:
        source_path: Path of the local file to upload.
        api: Client exposing ``create_source`` and ``ok``.
        log: Logger exposing ``info`` and ``error``.
        args: Optional creation arguments; an empty dict is sent when omitted.

    Returns:
        The source resource returned by ``api.create_source``.

    Raises:
        SystemExit: When the file does not exist or ``api.ok`` reports failure.
    """
    log.info("Creating source from file %s" % source_path)

    # Fail early when there is nothing to upload.
    file_is_missing = not os.path.exists(source_path)
    if file_is_missing:
        log.error("Provided file does not exist %s" % source_path)
        sys.exit("Source file not found")

    creation_args = {} if args is None else args
    source = api.create_source(source_path, creation_args)

    if api.ok(source):
        log.info("Source created successfuly %s" % source["resource"])
        return source

    log.error("Could not create source %s from file %s" % (source["resource"],source_path))
    sys.exit("Source couldn't be created or retrieved.")

In [7]:
###  CREATE TRAIN DATASET #############################################################
def create_joined_dataset(input_file, repairs_dataset, log, api):
    """Upload ``input_file`` and left-join it with the repairs dataset.

    A source and a dataset are created from ``input_file``; that dataset
    (aliased ``A``) is then joined with ``repairs_dataset`` (aliased ``B``)
    on the ``fingerprint`` column through a SQL query. Every creation step is
    checked with ``api.ok``, like the other helpers in this file, so a failure
    stops the run instead of being logged as "Dataset ready".

    Args:
        input_file: Path of the local file to upload.
        repairs_dataset: Identifier of the repairs dataset used as ``B``.
        log: Logger exposing ``info`` and ``error``.
        api: Client exposing ``create_dataset`` and ``ok``.

    Returns:
        The joined dataset resource returned by ``api.create_dataset``.

    Raises:
        SystemExit: When the source, the dataset or the join cannot be created.
    """
    # create training source (create_source already validates it with api.ok)
    source = create_source(input_file, api, log)

    # create training dataset
    ds = api.create_dataset(source)
    if not api.ok(ds):
        log.error("Could not create dataset from source %s" % source["resource"])
        sys.exit("Dataset couldn't be created or retrieved.")

    # join datasets: keep every row of A and add the repair columns of B
    ds_flags = api.create_dataset(
        [
            {"id": ds["resource"], "name": "A"},
            {"id": repairs_dataset, "name": "B"},
        ],
        {
            "sql_query": "SELECT `A`.*, `B`.`tse`, `B`.`score`, `B`.`body_repair`, `B`.`assembly_repair`, `B`.`repaired`, `B`.`alert` FROM `A` LEFT JOIN `B` ON `A`.`fingerprint` = `B`.`fingerprint`"
        },
    )
    if not api.ok(ds_flags):
        log.error("Could not join dataset %s with %s" % (ds["resource"], repairs_dataset))
        sys.exit("Joined dataset couldn't be created or retrieved.")

    # Log only the identifier: the full resource dict is far too large for a log line.
    log.info("Dataset ready %s" % ds_flags["resource"])

    return ds_flags

In [8]:
#### BUILD_IMPORTANCES_DATAFRAMES #################################################################
def build_importances_dataframes(repairs_importances_dataset, normal_importances_dataset, log):
    """Build a table of importance statistics for the repairs dataset.

    Only fields whose name contains ``'importance'`` and whose mean in the
    repairs dataset is greater than 0 are kept. For each of them the mean,
    median, maximum and minimum of the repairs dataset are stored, together
    with the mean and median differences against the same field id in the
    normal dataset (repairs minus normal).

    Args:
        repairs_importances_dataset: Dataset resource read through
            ``["object"]["fields"][field_id]`` with ``name`` and ``summary``.
        normal_importances_dataset: Dataset resource read through
            ``["object"]["fields"][field_id]["summary"]``.
        log: Logger exposing ``info`` and ``debug``.

    Returns:
        A DataFrame with one row per kept field; ``field_names`` has the
        ``' importance'`` substring removed.
    """
    log.info("Starting to build importances dataframe...")

    column_order = ['field_names','imp_means','imp_medians','imp_maxes','imp_mins','imp_mean_diffs','imp_median_diffs']
    # one list per output column, filled in parallel while scanning the fields
    stats = {column: [] for column in column_order}

    for field_id, repair_field in repairs_importances_dataset["object"]["fields"].items():
        field_name = repair_field["name"]
        # importance fields only
        if 'importance' not in field_name:
            continue
        repair_summary = repair_field["summary"]
        # skip fields that carry no importance at all
        if not repair_summary["mean"] > 0:
            continue

        log.debug("Adding stats for %s ..." % field_name)

        repair_mean = repair_summary["mean"]
        repair_median = repair_summary["median"]
        repair_max = repair_summary["maximum"]
        repair_min = repair_summary["minimum"]
        normal_summary = normal_importances_dataset["object"]["fields"][field_id]["summary"]
        mean_diff = repair_mean - normal_summary["mean"]
        median_diff = repair_median - normal_summary["median"]

        # the column holds the bare feature name, without the importance marker
        stats['field_names'].append(field_name.replace(' importance',''))
        stats['imp_means'].append(repair_mean)
        stats['imp_medians'].append(repair_median)
        stats['imp_maxes'].append(repair_max)
        stats['imp_mins'].append(repair_min)
        stats['imp_mean_diffs'].append(mean_diff)
        stats['imp_median_diffs'].append(median_diff)

    log.info("Building dataframe...")

    return pd.DataFrame(stats, columns = column_order)

In [9]:
# Local JSON files holding the environment configuration and the run parameters.
# NOTE(review): absolute, user-specific paths — the notebook only runs on a machine that has them.
json_config_file = '/Users/guillem/Projects/feature-engineering-utils/anomaly-importances-util/cfg/importances-test-config.json'
json_params_file = '/Users/guillem/Projects/feature-engineering-utils/anomaly-importances-util/cfg/importances-test-params.json'

In [22]:
# Load both JSON files; the resulting dictionaries are read by the cells below.
config_dict = init_config(json_config_file)
params_dict = init_params(json_params_file)

In [None]:
# Set up logging and pull the settings used by the rest of the notebook.
log = init_logger(config_dict["log_level"])
# Same list object as params_dict["input-features"] (no copy is made here).
all_input_features=params_dict["input-features"]
importances_whizzml_script_id = config_dict["extract_importances_whizzml_id"]

# Work on a single entry of the TSE file list; index 1 is hard-coded.
tse = params_dict["tse-files-list"][1]

In [11]:
# Connect to BigML with the username, API key, project and domain read from the config file.
api = BigML(config_dict["bigml_username"],config_dict["bigml_apikey"],project=config_dict["bigml_project"],domain=config_dict["bigml_domain"])

# define sources paths
# NOTE(review): absolute, user-specific paths — consider moving them to the params file.
training_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_train.csv'
test_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_test.csv'
repairs_file_path = '/Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/repair_flags_all_features.csv'

# create sources (create_source calls sys.exit when a file is missing or the creation fails)
train_source = create_source(training_file_path, api, log)
test_source = create_source(test_file_path, api, log)
repairs_source = create_source(repairs_file_path, api, log)

2021-02-26 13:04:42,202 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_train.csv
2021-02-26 13:04:48,567 - root - INFO - Source created successfuly source/6038e3dc79b77d73690031dd
2021-02-26 13:04:48,568 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/#3_all_features_test.csv
2021-02-26 13:04:54,506 - root - INFO - Source created successfuly source/6038e3e279b77d73750048a0
2021-02-26 13:04:54,507 - root - INFO - Creating source from file /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features/repair_flags_all_features.csv
2021-02-26 13:05:09,424 - root - INFO - Source created successfuly source/6038e3f279b77d73690031e0


In [12]:
# The importances WhizzML script must be configured before anything can run.
importances_whizzml_script_id = config_dict["extract_importances_whizzml_id"]
if importances_whizzml_script_id == "":
    log.error("The WhizzML script is not defined in the configuration please deploy the WhizzML code before carrying on")
    sys.exit("WhizzML script not configured")
# init total importances dataframe
all_importances_df = pd.DataFrame()

# init worse features list
# Work on a copy: names are removed from this list further down, and aliasing
# all_input_features would also shrink the feature list sent to the WhizzML
# script (and params_dict["input-features"]) on any later run.
useless_features_list = list(all_input_features)

In [13]:
# Announce which TSE entry (name and training file) is being processed.
log.info("Starting treatment for TSE dataset %s , file: %s" % (tse["name"], tse["train_file"]))

2021-02-26 13:05:50,989 - root - INFO - Starting treatment for TSE dataset TSE2 , file: /Users/guillem/Data/Customers/Daimler/anomalies-analysis/importances_fs/golden_lrg_all_features2/050-213-Z1-UB63-130-100-101-1.1-620556_213_1_1_1_1_1_1_train.csv


In [14]:
# WHIZZML train anomaly detector and extract importances
log.info("Building WhizzML inputs")
script_inputs = {
   "inputs": [
    ["source_repair_flags", repairs_source["resource"]],
    ["source_train", train_source["resource"]],
    ["all_input_features", all_input_features]
   ]
}

importances_execution_result = execute_whizzml(importances_whizzml_script_id, script_inputs, api, log)

# get whizzml results
repairs_importances_dataset_id = importances_execution_result["repairs-importances"]
normal_importances_dataset_id = importances_execution_result["normal-importances"]

# get structured datasets in JSON format
log.info("Loading repairs importances dataset...")
repairs_importances_dataset = api.get_dataset(repairs_importances_dataset_id)
log.info("Loading normal importances dataset...")
normal_importances_dataset = api.get_dataset(normal_importances_dataset_id)

# build importances dataframes
importances_df = build_importances_dataframes(repairs_importances_dataset, normal_importances_dataset, log)

2021-02-26 13:05:58,529 - root - INFO - Building WhizzML inputs
2021-02-26 13:05:58,531 - root - INFO - Executing WhizzML script script/60183d0c79b77d363a00a4cb
2021-02-26 13:05:58,531 - root - INFO - Script inputs: {'inputs': [['source_repair_flags', 'source/6038e3f279b77d73690031e0'], ['source_train', 'source/6038e3dc79b77d73690031dd'], ['all_input_features', ['CleanTimeActual', 'DropTimeActual', 'LMLiftHeightActual', 'PilotVoltageActual', 'StickoutActual', 'WeldCurrentActualPositive', 'WeldEnergyActual', 'WeldTimeActual', 'WeldVoltageActual']]]}
2021-02-26 13:07:01,003 - root - INFO - WhizzML execution ended execution/6038e42779b77d73750048a3
2021-02-26 13:07:01,004 - root - INFO - Loading repairs importances dataset...
2021-02-26 13:07:01,720 - root - INFO - Loading normal importances dataset...
2021-02-26 13:07:02,580 - root - INFO - Starting to build importances dataframe...
2021-02-26 13:07:02,581 - root - INFO - Building dataframe...


In [15]:
# Preview the first rows of the importances table.
importances_df.head()

Unnamed: 0,field_names,imp_means,imp_medians,imp_maxes,imp_mins,imp_mean_diffs,imp_median_diffs
0,PilotVoltageActual,0.10268,0.10995,0.16858,0.04312,-0.02687,-0.01056
1,StickoutActual,0.1514,0.16697,0.1942,0.10064,-0.01035,0.01216
2,WeldEnergyActual,0.12384,0.13179,0.15152,0.0716,-0.02931,-0.01443
3,WeldTimeActual,0.14115,0.13482,0.21638,0.06132,-0.00186,-0.00242
4,WeldVoltageActual,0.16138,0.17219,0.19526,0.09822,-0.03545,-0.01733


In [16]:
# UPDATE USELESS FEATURES LIST by removing current dataset useful features
# loop over current dataframe
# Each feature judged useful below is removed from useless_features_list in place.
for index, row in importances_df.iterrows():
    # a feature counts as useful when both its mean diff and its median diff
    # are above config_dict["useless_importance_limit"]
    if row["imp_mean_diffs"] > config_dict["useless_importance_limit"] and row["imp_median_diffs"] > config_dict["useless_importance_limit"]:
        # only remove names still present in the list (list.remove raises on a missing value)
        if row["field_names"] in useless_features_list:
            log.info("Useful field found: %s" % row["field_names"])
            useless_features_list.remove(row["field_names"])

In [17]:
# export into CSV file
# The file is named after the current TSE entry and written to the configured temp directory,
# without the dataframe index and with a header row.
export_file_path = config_dict["tmp_datasets_directory"] + "/" + tse["name"] + "_field_importances_stats.csv"
importances_df.to_csv(export_file_path, index = False, header=True)
log.info("Importances TSE detail file exported: %s" % export_file_path)

2021-02-26 13:07:42,144 - root - INFO - Importances TSE detail file exported: /Users/guillem/Data/Customers/Daimler/anomalies-analysis/tmp_datasets/TSE2_field_importances_stats.csv


In [42]:
# Retrieve new input fields
#importances_df_sorted = importances_df.sort_values('imp_median_diffs', ascending=False).reset_index(drop=True)

# Names of the features with the largest imp_median_diffs.
new_input_fields = []
# loop over first N optimal features and store feature names
# Rows are visited in descending imp_median_diffs order; the index is reset so it counts rows from 0.
for index, row in importances_df.sort_values('imp_median_diffs', ascending=False).reset_index(drop=True).iterrows():
    # stop once config_dict["optimal_field_num"] names have been collected
    if index == config_dict["optimal_field_num"]:
        break  # exit loop
    new_input_fields.append(row["field_names"])

In [23]:
# create anomaly detector and retrieve BAS
# WHIZZML train anomaly detector and extract BAS
log.info("Building WhizzML inputs")
# Inputs of the anomaly-detector script: the three sources plus the optimal and original feature lists.
script_inputs = {
   "inputs": [
    ["source_repair_flags", repairs_source["resource"]],
    ["source_train", train_source["resource"]],
    ["source_test", test_source["resource"]],
    ["optimal_input_features", new_input_fields],
    ["original_input_features", params_dict["original-input-features"]]
   ]
}
# execute_whizzml calls sys.exit if the execution fails
bas_execution_result = execute_whizzml(config_dict["anomaly_detector_whizzml_id"], script_inputs, api, log)

# get whizzml results: ids of the test datasets scored with each feature set
test_BAS_optimal_ds_id = bas_execution_result["ds_test_optimal_BAS"]
test_BAS_original_ds_id = bas_execution_result["ds_test_original_BAS"]

2021-02-26 13:36:42,499 - root - INFO - Building WhizzML inputs
2021-02-26 13:36:42,500 - root - INFO - Executing WhizzML script script/6038e9c079b77d73750048c7
2021-02-26 13:36:42,501 - root - INFO - Script inputs: {'inputs': [['source_repair_flags', 'source/6038e3f279b77d73690031e0'], ['source_train', 'source/6038e3dc79b77d73690031dd'], ['source_test', 'source/6038e3e279b77d73750048a0'], ['optimal_input_features', ['StickoutActual', 'LMLiftHeightActual', 'DropTimeActual', 'WeldTimeActual']], ['original_input_features', ['MeasurementData.MeasurementParameter.CleanVoltageActual.value', 'MeasurementData.MeasurementParameter.LMPositionActual.value', 'WeldCurrentActualPositive', 'LMPenetrationActual', 'WeldEnergyActual', 'WeldVoltageActual', 'PilotVoltageActual', 'MeasurementData.MeasurementParameter.WeldCurrentActualNegative.value', 'resistance_part2_range', 'lift_pos_min', 'lift_pos_mean', 'voltage_SNR', 'resistance_part3_SNR']]]}
2021-02-26 13:37:46,512 - root - INFO - WhizzML executio

In [36]:
# get BAS datasets into dataframes
# Local CSV targets, named after the current TSE entry, in the configured temp directory.
export_file_path_opti = config_dict["tmp_datasets_directory"] + "/" + tse["name"] + "_optimal_BAS.csv"
export_file_path_orig = config_dict["tmp_datasets_directory"] + "/" + tse["name"] + "_original_BAS.csv"


# Download both scored datasets to disk.
api.download_dataset(test_BAS_optimal_ds_id,export_file_path_opti)
log.info("BAS optimal dataset downloaded: %s" % export_file_path_opti)
api.download_dataset(test_BAS_original_ds_id,export_file_path_orig)
log.info("BAS original dataset downloaded: %s" % export_file_path_orig)

# Read the downloaded files back as dataframes.
optimal_bas_df = pd.read_csv(export_file_path_opti)
original_bas_df = pd.read_csv(export_file_path_orig)

optimal_bas_df.head()

2021-02-26 17:32:37,727 - root - INFO - BAS optimal dataset downloaded: /Users/guillem/Data/Customers/Daimler/anomalies-analysis/tmp_datasets/TSE2_optimal_BAS.csv
2021-02-26 17:32:39,322 - root - INFO - BAS original dataset downloaded: /Users/guillem/Data/Customers/Daimler/anomalies-analysis/tmp_datasets/TSE2_original_BAS.csv


Unnamed: 0,location,series,line,plant,station,robot,controller,tool,StudID,MeasurementData.MeasurementParameter.StudID.value,...,resistance_part1_max,resistance_part2_max,resistance_part3_min,tse,score,body_repair,assembly_repair,repaired,alert,std_anomaly_score
0,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,0.758862,0.770908,0.0,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,0.58701,f,f,f,f,0.70085
1,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,0.750194,0.764378,0.0,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,0.5827,f,f,f,f,0.72571
2,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,0.75432,0.7648,0.0,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,0.59661,f,f,f,f,0.7158
3,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,0.745632,0.764589,0.0,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,0.58418,f,f,f,f,0.71383
4,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,0.77771,0.782471,0.0,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,0.58415,f,f,f,f,0.72771


In [37]:
# Rank the scores obtained with the optimal features: absolute rank (ties take the
# highest rank of their group) and percentile rank.
# NOTE(review): the percentile rank uses rank()'s default tie method, not method='max' — confirm the mix is intended.
optimal_bas_df['score_rank'] = optimal_bas_df['std_anomaly_score'].rank(method='max')
optimal_bas_df['score_pct_rank'] = optimal_bas_df['std_anomaly_score'].rank(pct=True)

# Add the scores and ranks obtained with the original feature set as extra columns.
# NOTE(review): column assignment aligns on the row index, so this presumes both CSVs
# list the same rows in the same order — verify.
optimal_bas_df['orig_score'] = original_bas_df['std_anomaly_score']
optimal_bas_df['orig_score_rank'] = original_bas_df['std_anomaly_score'].rank(method='max')
optimal_bas_df['orig_score_pct_rank'] = original_bas_df['std_anomaly_score'].rank(pct=True)

optimal_bas_df.head()

Unnamed: 0,location,series,line,plant,station,robot,controller,tool,StudID,MeasurementData.MeasurementParameter.StudID.value,...,body_repair,assembly_repair,repaired,alert,std_anomaly_score,score_rank,score_pct_rank,orig_score,orig_score_rank,orig_score_pct_rank
0,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,f,f,f,f,0.70085,486.0,0.918715,0.70894,500.0,0.94518
1,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,f,f,f,f,0.72571,495.0,0.935728,0.72905,506.0,0.956522
2,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,f,f,f,f,0.7158,492.0,0.930057,0.72738,505.0,0.953686
3,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,f,f,f,f,0.71383,491.0,0.928166,0.72938,507.0,0.958412
4,50,213,Z1,UB64,130,400,401,1.1,620168,620168_213_1_2_1_2_1_2,...,f,f,f,f,0.72771,497.0,0.938563,0.71121,501.0,0.94707


In [41]:
# Columns of the per-repair ranking table, in display order.
rank_stats_columns = ['dataset_name','TSE','fingerprint','timestamp','original_score','optimal_score','original_rank','optimal_rank','original_pct_rank','optimal_pct_rank','assembly_repair']

# One single-row frame per repaired measurement. They are concatenated once
# after the loop: DataFrame.append is gone from recent pandas releases and
# re-copied the whole table on every iteration.
repair_frames = []

# gather current DS repairs stats
for index, row in optimal_bas_df[optimal_bas_df.repaired=='t'].iterrows():
    current_data_dict = {'dataset_name': [tse["name"]],
                         'TSE': [row["tse"]],
                         'fingerprint': [row["fingerprint"]],
                         'timestamp': [row["timestamp"]],
                         'original_score': [row["orig_score"]],
                         'optimal_score': [row["std_anomaly_score"]],
                         'original_rank': [row["orig_score_rank"]],
                         'optimal_rank': [row["score_rank"]],
                         'original_pct_rank': [row["orig_score_pct_rank"]],
                         'optimal_pct_rank': [row["score_pct_rank"]],
                         'assembly_repair': [row["assembly_repair"]]}

    # current_data_dict / current_data_df stay defined after the loop because
    # later cells of this notebook read them.
    current_data_df = pd.DataFrame(current_data_dict, columns = rank_stats_columns)
    repair_frames.append(current_data_df)

if repair_frames:
    rank_stats_df = pd.concat(repair_frames, ignore_index=True)
else:
    # no repaired row: keep an empty table that still has the expected columns
    rank_stats_df = pd.DataFrame(columns = rank_stats_columns)

rank_stats_df.head()

Unnamed: 0,dataset_name,TSE,fingerprint,timestamp,original_score,optimal_score,original_rank,optimal_rank,original_pct_rank,optimal_pct_rank,assembly_repair
0,TSE2,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,3cf3e81634e702ded214481702429bda47b9db9a,2020-10-27T18:56:09.000000+0100,0.48404,0.56728,430.0,370.0,0.812854,0.698488,f
1,TSE2,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,bddcb3043709d606761122ce326e78e4578318aa,2020-10-27T17:45:07.000000+0100,0.52594,0.55953,480.0,354.0,0.907372,0.663516,f
2,TSE2,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,6b8e2fb4607f914d93cdb319bc7648d8bd65e205,2020-10-27T16:02:45.000000+0100,0.51709,0.50513,471.0,277.0,0.890359,0.521739,f
3,TSE2,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,dc6deeb80f1bda49c625bf3de91b36bb00bda916,2020-10-27T14:14:17.000000+0100,0.42337,0.57779,338.0,387.0,0.638941,0.729679,f
4,TSE2,050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2...,6d96773a343618ee4b658e7f865c9d604e8e4e3a,2020-10-27T11:03:22.000000+0100,0.44691,0.55315,374.0,337.0,0.706994,0.634216,f


In [54]:
# Debug check: show the TSE value stored at index label 0 of current_data_df.
print(current_data_df['TSE'].loc[0])

050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2_1_2_1_2


In [57]:
# Summarise the per-repair table into one row of dataset-level statistics.
current_data_df = rank_stats_df
# gather ds rank stats
# NOTE(review): the max/min "diff" entries are differences between the two
# extremes (max - max, min - min), not the extreme of the per-row differences —
# confirm this is the intended metric.
cur_ds_stats_dict = {'dataset_name': [current_data_df['dataset_name'].iloc[0]],
                     'TSE': [current_data_df['TSE'].iloc[0]],
                     'median_rank_diff': [current_data_df['optimal_rank'].median() - current_data_df['original_rank'].median()],
                     'avg_rank_diff': [current_data_df['optimal_rank'].mean() - current_data_df['original_rank'].mean()],
                     'max_rank_diff': [current_data_df['optimal_rank'].max() - current_data_df['original_rank'].max()],
                     'min_rank_diff': [current_data_df['optimal_rank'].min() - current_data_df['original_rank'].min()],
                     'median_pct_rank_diff': [current_data_df['optimal_pct_rank'].median() - current_data_df['original_pct_rank'].median()],
                     'avg_pct_rank_diff': [current_data_df['optimal_pct_rank'].mean() - current_data_df['original_pct_rank'].mean()],
                     'max_pct_rank_diff': [current_data_df['optimal_pct_rank'].max() - current_data_df['original_pct_rank'].max()],
                     'min_pct_rank_diff': [current_data_df['optimal_pct_rank'].min() - current_data_df['original_pct_rank'].min()],
                     'median_optimal_rank': [current_data_df['optimal_rank'].median()],
                     'median_original_rank': [current_data_df['original_rank'].median()],
                     'median_optimal_score': [current_data_df['optimal_score'].median()],
                     'median_original_score': [current_data_df['original_score'].median()],
                     'total_repaired': [current_data_df.shape[0]],
                     'total_assembly': [current_data_df[current_data_df.assembly_repair == 't'].shape[0]]}

# Turn the summary into its own one-row dataframe. The previous code rebuilt a
# frame from the stale per-row current_data_dict and appended it to
# rank_stats_df, which duplicated the last repaired row on every run and never
# used the statistics computed above.
cur_ds_stats_df = pd.DataFrame(cur_ds_stats_dict, columns = list(cur_ds_stats_dict))



In [58]:
# Inspect the dataset-level statistics computed in the previous cell.
print(cur_ds_stats_dict)

{'dataset_name': ['TSE2'], 'TSE': ['050-213-Z1-UB64-130-400-401-1.1-620168_213_1_2_1_2_1_2'], 'median_rank_diff': [-1.0], 'avg_rank_diff': [46.81818181818181], 'max_rank_diff': [-54.0], 'min_rank_diff': [194.0], 'median_pct_rank_diff': [-0.00472589792060496], 'avg_pct_rank_diff': [0.08755799965629851], 'max_pct_rank_diff': [-0.10207939508506614], 'min_pct_rank_diff': [0.3667296786389414], 'median_optimal_rank': [337.0], 'median_original_rank': [338.0], 'median_optimal_score': [0.55315], 'median_original_score': [0.42336999999999997], 'total_repaired': [11], 'total_assembly': [6]}


In [62]:
# Count the rows of rank_stats_df whose original score is above 0.2.
rank_stats_df[rank_stats_df.original_score > 0.2].shape[0]

11

In [63]:
# Show the feature names selected as optimal inputs.
print(new_input_fields)

['StickoutActual', 'LMLiftHeightActual', 'DropTimeActual', 'WeldTimeActual']


In [65]:
# Drop duplicate names while keeping first-seen order (dict keys are unique and ordered).
list(dict.fromkeys(new_input_fields))

['StickoutActual', 'LMLiftHeightActual', 'DropTimeActual', 'WeldTimeActual']

In [73]:
# Original feature list followed by the optimal one; list concatenation keeps any duplicates.
mylist = params_dict["original-input-features"]
print(mylist + new_input_fields)

['MeasurementData.MeasurementParameter.CleanVoltageActual.value', 'MeasurementData.MeasurementParameter.LMPositionActual.value', 'WeldCurrentActualPositive', 'LMPenetrationActual', 'WeldEnergyActual', 'WeldVoltageActual', 'PilotVoltageActual', 'MeasurementData.MeasurementParameter.WeldCurrentActualNegative.value', 'resistance_part2_range', 'lift_pos_min', 'lift_pos_mean', 'voltage_SNR', 'resistance_part3_SNR', 'StickoutActual', 'LMLiftHeightActual', 'DropTimeActual', 'WeldTimeActual']


In [75]:
# Same concatenation with duplicates removed, first occurrence kept.
list(dict.fromkeys(mylist + new_input_fields))

['MeasurementData.MeasurementParameter.CleanVoltageActual.value',
 'MeasurementData.MeasurementParameter.LMPositionActual.value',
 'WeldCurrentActualPositive',
 'LMPenetrationActual',
 'WeldEnergyActual',
 'WeldVoltageActual',
 'PilotVoltageActual',
 'MeasurementData.MeasurementParameter.WeldCurrentActualNegative.value',
 'resistance_part2_range',
 'lift_pos_min',
 'lift_pos_mean',
 'voltage_SNR',
 'resistance_part3_SNR',
 'StickoutActual',
 'LMLiftHeightActual',
 'DropTimeActual',
 'WeldTimeActual']

In [76]:
# Toy frame used to experiment with rank(): 8 cities x 2 columns of values
# drawn from a normal distribution (mean 100, standard deviation 50).
df = pd.DataFrame(
    data=np.random.normal(loc=100, scale=50, size=(8,2)),
    columns=('Parks', 'Schools'),
    index=[
        'San Francisco', 'San Diego', 'Los Angeles', 'New York',
        'Chicago', 'Denver', 'Seattle', 'Portland',
    ],
)

In [77]:
# Convert the sampled values to integers (rebinds df, so re-running needs the previous cell first).
df = df.astype(int)

In [78]:
# Display the toy frame.
df

Unnamed: 0,Parks,Schools
San Francisco,114,185
San Diego,113,45
Los Angeles,73,105
New York,103,49
Chicago,84,197
Denver,145,64
Seattle,97,115
Portland,118,128


In [82]:
# Rank Parks in descending order: the largest value gets rank 1; ties take the highest rank of their group.
df['score_rank'] = df['Parks'].rank(method='max',ascending=False)

In [83]:
# Display the toy frame with its new score_rank column.
df

Unnamed: 0,Parks,Schools,score_rank
San Francisco,114,185,3.0
San Diego,113,45,4.0
Los Angeles,73,105,8.0
New York,103,49,5.0
Chicago,84,197,7.0
Denver,145,64,1.0
Seattle,97,115,6.0
Portland,118,128,2.0


In [14]:
# Fetch both importances datasets again from their ids (same calls as the importances
# cell earlier in this notebook, with print instead of the logger).
print("Loading repairs importances dataset...")
repairs_importances_dataset = api.get_dataset(repairs_importances_dataset_id)
print("Loading normal importances dataset...")
normal_importances_dataset = api.get_dataset(normal_importances_dataset_id)

Loading repairs importances dataset...
Loading normal importances dataset...


In [15]:
# Dump the whole dataset resource to inspect its structure (very large output).
print(repairs_importances_dataset)

{'code': 200, 'resource': 'dataset/6018403179b77d363b00e5fe', 'location': 'https://daimler.dev.bigml.com/io/andromeda/dataset/6018403179b77d363b00e5fe', 'object': {'all_fields': True, 'category': 0, 'cluster': None, 'cluster_status': False, 'code': 200, 'columns': 185, 'configuration': None, 'configuration_status': False, 'correlations': {}, 'created': '2021-02-01T17:53:53.949000', 'creator': 'bigml09', 'credits': 1.4428863525390625, 'dataset_origin_status': True, 'description': '', 'download': {'code': 0, 'excluded_input_fields': [], 'header': True, 'input_fields': [], 'message': '', 'preview': [], 'separator': ','}, 'evaluation': None, 'excluded_fields': [], 'execution_id': '6018401079b77d364700fb2d', 'execution_status': True, 'field_types': {'categorical': 17, 'datetime': 0, 'items': 0, 'numeric': 164, 'preferred': 125, 'text': 4, 'total': 185}, 'fields': {'000000': {'column_number': 0, 'datatype': 'int8', 'name': 'location', 'optype': 'numeric', 'order': 0, 'preferred': False, 'pro

In [36]:
import pandas as pd

## Importances dataframes

In [41]:
# init empty arrays as temporary variables in further loop
# Inline version of the importances table: same loop as build_importances_dataframes,
# but without the imp_mins column and keeping the ' importance' suffix in field names.
# init empty arrays as temporary variables in further loop
field_names=[]
importances_means=[]
importances_medians=[]
importances_maxes=[]
importances_mean_diffs=[]
importances_median_diffs=[]


# loop over fields, retrieve name and stats + build dictionary for further dataframe
for field_id in repairs_importances_dataset["object"]["fields"]:
    field_name = repairs_importances_dataset["object"]["fields"][field_id]["name"]
    # for importances fields only:
    if 'importance' in field_name:
        # if importance is greater than 0
        if repairs_importances_dataset["object"]["fields"][field_id]["summary"]["mean"] > 0:
            field_names.append(field_name)
            # mean
            repair_imp_field_mean = repairs_importances_dataset["object"]["fields"][field_id]["summary"]["mean"]
            importances_means.append(repair_imp_field_mean)
            # median
            repair_imp_field_median = repairs_importances_dataset["object"]["fields"][field_id]["summary"]["median"]
            importances_medians.append(repair_imp_field_median)
            #max
            importances_maxes.append(repairs_importances_dataset["object"]["fields"][field_id]["summary"]["maximum"])
            # mean diff (repairs minus normal, looked up by the same field id)
            importances_mean_diffs.append(repair_imp_field_mean - normal_importances_dataset["object"]["fields"][field_id]["summary"]["mean"])
            # median diff (repairs minus normal)
            importances_median_diffs.append(repair_imp_field_median - normal_importances_dataset["object"]["fields"][field_id]["summary"]["median"])
            

# one list per column; all lists are filled in parallel, so they share the same length
data = {'field_names': field_names,
        'imp_means': importances_means,
        'imp_medians': importances_medians,
        'imp_maxes': importances_maxes,
        'imp_mean_diffs': importances_mean_diffs,
        'imp_median_diffs': importances_median_diffs}


rep_importances_df = pd.DataFrame (data, columns = ['field_names','imp_means','imp_medians','imp_maxes','imp_mean_diffs','imp_median_diffs'])
            
print(rep_importances_df)

                                          field_names  imp_means  imp_medians  \
0   MeasurementData.MeasurementParameter.LMPositio...    0.00883      0.00853   
1   MeasurementData.MeasurementParameter.PilotCurr...    0.00007      0.00007   
2   MeasurementData.MeasurementParameter.WeldToolM...    0.02570      0.02337   
3   MeasurementData.MeasurementParameter.WeldToolS...    0.01790      0.01574   
4   MeasurementData.MeasurementParameter.WeldToolS...    0.02470      0.02355   
5   MeasurementData.MeasurementParameter.WeldToolS...    0.03192      0.02897   
6                       PilotVoltageActual importance    0.01123      0.01184   
7                           StickoutActual importance    0.00881      0.00935   
8                         WeldEnergyActual importance    0.00936      0.00958   
9                           WeldTimeActual importance    0.01439      0.01036   
10                       WeldVoltageActual importance    0.01360      0.01310   
11                      LMLi

In [43]:
# Preview the first rows of the inline importances table as plain text.
print(rep_importances_df.head())

                                         field_names  imp_means  imp_medians  \
0  MeasurementData.MeasurementParameter.LMPositio...    0.00883      0.00853   
1  MeasurementData.MeasurementParameter.PilotCurr...    0.00007      0.00007   
2  MeasurementData.MeasurementParameter.WeldToolM...    0.02570      0.02337   
3  MeasurementData.MeasurementParameter.WeldToolS...    0.01790      0.01574   
4  MeasurementData.MeasurementParameter.WeldToolS...    0.02470      0.02355   

   imp_maxes  imp_mean_diffs  imp_median_diffs  
0    0.01086         0.00002           0.00145  
1    0.00008        -0.00003           0.00001  
2    0.03829        -0.00332          -0.00276  
3    0.02422         0.00127           0.00092  
4    0.02993         0.00186           0.00398  
