Imports

In [28]:
import logging
import ssl
import urllib3

from elasticsearch import Elasticsearch, JSONSerializer
from elasticsearch.connection import create_ssl_context
from elasticsearch.helpers import parallel_bulk
from elasticsearch.helpers import scan
import numpy as np
import pandas as pd

urllib3.disable_warnings() # This is insecure
logger = logging.getLogger(__name__)

Elastic search connection procedures:

In [2]:
def get_elastic_client(server="local", write=False):
    if server == "local":
        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        return Elasticsearch(host="localhost",
                             port=9200,
                             serializer=serializer)
    elif server in {"dev", "horizon", "prod"}:
        # All the other servers are remote hosts with similar configs
        if server == "prod":
            host = "daimler-elastic.vpc.bigml.com"
        elif server == "horizon":
            host = "daimler-elastic.horizon.bigml.com"
        else:
            host = "daimler-elastic.dev.bigml.com"

        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        # Set up ssl context to disable cert verification
        ssl_context = create_ssl_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        return Elasticsearch(host=host,
                             port=443,
                             http_auth=("dev", "paroafCa"),
                             serializer=serializer,
                             ssl_context=ssl_context,
                             use_ssl=True)
    else:
        logger.warning("unknown server '%s'", server)
        return None

Instanciate Elastic Search connection to dev:

In [3]:
ES = get_elastic_client("dev")
print(ES)

<Elasticsearch([{}])>


## 1) Generate referential dataframe

In [62]:
# DEFINE QUERY
# execute query to get referential welds data
ref_welds_query = {
  "size" : 1,
  "query" : {
    "bool" : {
      "must" : [
        {
          "terms" : {
            "uniqueID.keyword" : [
                  "h902-030tsb101-kf130.m050g9sub64sps1.1.1",
                  "h902-030tsb201-kf130.m050g9sub64sps1.2.1",
                  "h902-030tsb301-kf130.m050g9sub64sps1.1.1",
                  "h902-030tsb401-kf130.m050g9sub64sps1.2.1",
                  "h902-030tsb501-kf130.m050g9sub64sps1.1.1",
                  "h902-040tsb101-kf130.m050g9sub64sps1.1.1",
                  "h902-040tsb201-kf130.m050g9sub64sps1.1.1",
                  "h902-040tsb201-kf130.m050g9sub64sps1.2.1",
                  "h902-040tsb301-kf130.m050g9sub64sps1.1.1",
                  "h902-040tsb301-kf130.m050g9sub64sps1.2.1",
                  "h902-070tsb101-kf130.m050g9sub63sps2.1.1"
             ],
            "boost" : 1.0
          }
        },
        {
          "term" : {
            "FaultCode.keyword" : {
              "value" : "0",
              "boost" : 1.0
            }
          }
        }
      ],
      "adjust_pure_negative" : True,
      "boost" : 1.0
    }
  },
  "_source" : False,
  "stored_fields" : "_none_",
  "aggregations" : {
    "groupby" : {
      "composite" : {
        "size" : 10000,
        "sources" : [
          {
            "unique_id_b" : {
              "terms" : {
                "field" : "uniqueID.keyword",
                "missing_bucket" : True,
                "order" : "asc"
              }
            }
          },
          {
            "extension_b" : {
              "terms" : {
                "field" : "MeasurementData.MeasurementParameter.StudID.value.keyword",
                "missing_bucket" : True,
                "order" : "asc"
              }
            }
          },
          {
            "period_b" : {
              "date_histogram" : {
                "field" : "timestamp",
                "missing_bucket" : True,
                "calendar_interval": "month",
                "format": "yyyy-MM"
              }
            }
          }
        ]
      },
      "aggregations" : {
        "voltage_b" : {
          "extended_stats" : {
            "field" : "WeldVoltageActual",
            "sigma" : 2
          }
        },
        "time_b" : {
          "extended_stats" : {
            "field" : "WeldTimeActual",
            "sigma" : 2
          }
        },
        "current_b" : {
          "extended_stats" : {
            "field" : "WeldCurrentActualPositive",
            "sigma" : 2
          }
        },
        "penetration_b" : {
          "extended_stats" : {
            "field" : "LMPenetrationActual",
            "sigma" : 2
          }
        },
        "stickout_b" : {
          "extended_stats" : {
            "field" : "StickoutActual",
            "sigma" : 2
          }
        },
        "drop_time_b" : {
          "extended_stats" : {
            "field" : "DropTimeActual",
            "sigma" : 2
          }
        },
        "lift_b" : {
          "extended_stats" : {
            "field" : "LMLiftHeightActual",
            "sigma" : 2
          }
        },
        "energy_b" : {
          "extended_stats" : {
            "field" : "WeldEnergyActual",
            "sigma" : 2
          }
        }
      }
    }
  }
}

In [63]:
REQUEST_TIMEOUT = 100.  # Timeout for the elastic queries

In [64]:
elastic_response = ES.search(index="ml_toolbox_raw_data",
                           body=ref_welds_query,
                           request_timeout=REQUEST_TIMEOUT)

In [65]:
weld_stats_df = pd.DataFrame()

for bucket in elastic_response["aggregations"]["groupby"]["buckets"]:
    # gather current bucket key information
    key_tool = bucket["key"]["unique_id_b"]
    key_extension = bucket["key"]["extension_b"]
    key_period = bucket["key"]["period_b"]
    
    # gather current bucket stats information
    num_elements = bucket["doc_count"]
    penetration_ref = bucket["penetration_b"]["avg"]
    penetration_up_limit = bucket["penetration_b"]["std_deviation_bounds"]["upper"]
    penetration_low_limit = bucket["penetration_b"]["std_deviation_bounds"]["lower"]
    drop_time_ref = bucket["drop_time_b"]["avg"]
    drop_time_up_limit = bucket["drop_time_b"]["std_deviation_bounds"]["upper"]
    drop_time_low_limit = bucket["drop_time_b"]["std_deviation_bounds"]["lower"] 
    voltage_ref = bucket["voltage_b"]["avg"]
    voltage_up_limit = bucket["voltage_b"]["std_deviation_bounds"]["upper"]
    voltage_low_limit = bucket["voltage_b"]["std_deviation_bounds"]["lower"]
    time_ref = bucket["time_b"]["avg"]
    time_up_limit = bucket["time_b"]["std_deviation_bounds"]["upper"]
    time_low_limit = bucket["time_b"]["std_deviation_bounds"]["lower"]
    lift_ref = bucket["lift_b"]["avg"]
    lift_up_limit = bucket["lift_b"]["std_deviation_bounds"]["upper"]
    lift_low_limit = bucket["lift_b"]["std_deviation_bounds"]["lower"]
    current_ref = bucket["current_b"]["avg"]
    current_up_limit = bucket["current_b"]["std_deviation_bounds"]["upper"]
    current_low_limit = bucket["current_b"]["std_deviation_bounds"]["lower"]
    energy_ref = bucket["energy_b"]["avg"]
    energy_up_limit = bucket["energy_b"]["std_deviation_bounds"]["upper"]
    energy_low_limit = bucket["energy_b"]["std_deviation_bounds"]["lower"]
    stickout_ref = bucket["stickout_b"]["avg"]
    stickout_up_limit = bucket["stickout_b"]["std_deviation_bounds"]["upper"]
    stickout_low_limit = bucket["stickout_b"]["std_deviation_bounds"]["lower"]

    
    # gather results into a dictionnary to create a dataframe
    current_data_dict = {'key_tool': [key_tool], 'key_extension': [key_extension], 
                         'key_period': [key_period], 'num_elements': [num_elements], 
                         'penetration_ref': [penetration_ref], 'penetration_up_limit': [penetration_up_limit], 'penetration_low_limit': [penetration_low_limit],
                         'drop_time_ref': [drop_time_ref], 'drop_time_up_limit': [drop_time_up_limit], 'drop_time_low_limit': [drop_time_low_limit],
                         'voltage_ref': [voltage_ref], 'voltage_up_limit': [voltage_up_limit], 'voltage_low_limit': [voltage_low_limit],
                         'time_ref': [time_ref], 'time_up_limit': [time_up_limit], 'time_low_limit': [time_low_limit],
                         'lift_ref': [lift_ref], 'lift_up_limit': [lift_up_limit], 'lift_low_limit': [lift_low_limit],
                         'current_ref': [current_ref], 'current_up_limit': [current_up_limit], 'current_low_limit': [current_low_limit],
                         'enery_ref': [energy_ref], 'energy_up_limit': [energy_up_limit], 'energy_low_limit': [energy_low_limit],
                         'stickout_ref': [stickout_ref], 'stickout_up_limit': [stickout_up_limit], 'stickout_low_limit': [stickout_low_limit]}
     
    # create dataframe with current feature set results
    cur_bucket_df = pd.DataFrame(current_data_dict, columns = ['key_tool', 'key_extension', 'key_period', 'num_elements', 
                                                               'penetration_ref', 'penetration_up_limit', 'penetration_low_limit',
                                                               'drop_time_ref', 'drop_time_up_limit', 'drop_time_low_limit',
                                                               'voltage_ref', 'voltage_up_limit', 'voltage_low_limit',
                                                               'time_ref', 'time_up_limit', 'time_low_limit',
                                                               'lift_ref', 'lift_up_limit', 'lift_low_limit',
                                                               'current_ref', 'current_up_limit', 'current_low_limit',
                                                               'enery_ref', 'energy_up_limit', 'energy_low_limit',
                                                               'stickout_ref', 'stickout_up_limit', 'stickout_low_limit'])
    
    weld_stats_df = weld_stats_df.append(cur_bucket_df, ignore_index=True)

In [66]:
print(weld_stats_df.shape)

(9253, 28)


In [67]:
weld_stats_df.head()

Unnamed: 0,key_tool,key_extension,key_period,num_elements,penetration_ref,penetration_up_limit,penetration_low_limit,drop_time_ref,drop_time_up_limit,drop_time_low_limit,...,lift_low_limit,current_ref,current_up_limit,current_low_limit,enery_ref,energy_up_limit,energy_low_limit,stickout_ref,stickout_up_limit,stickout_low_limit
0,h902-030tsb101-kf130.m050g9sub64sps1.1.1,610061_213_1_1_1_1_1_1,2019-11,25,-0.562,-0.322801,-0.801199,9.408,10.233436,8.582564,...,1.492962,757.6,766.141663,749.058337,624.88,684.503673,565.256327,3.5876,3.712728,3.462472
1,h902-030tsb101-kf130.m050g9sub64sps1.1.1,610061_213_1_1_1_1_1_1,2019-12,1762,-0.71609,-0.602375,-0.829805,9.43479,10.279023,8.590557,...,1.489491,750.0,750.0,750.0,670.053348,709.892554,630.214143,3.648519,3.748114,3.548923
2,h902-030tsb101-kf130.m050g9sub64sps1.1.1,610061_213_1_1_1_1_1_1,2020-01,2420,-0.704624,-0.560547,-0.848701,9.463512,10.315916,8.611109,...,1.495087,750.0,750.0,750.0,668.416942,707.463816,629.370068,3.654182,3.777445,3.530918
3,h902-030tsb101-kf130.m050g9sub64sps1.1.1,610061_213_1_1_1_1_1_1,2020-02,2983,-0.691183,-0.56019,-0.822177,9.408683,10.256725,8.56064,...,1.494969,750.0,750.0,750.0,660.513242,695.18597,625.840513,3.707674,3.865725,3.549622
4,h902-030tsb101-kf130.m050g9sub64sps1.1.1,610061_213_1_1_1_1_1_1,2020-03,954,-0.668501,-0.543347,-0.793655,9.357442,10.142769,8.572115,...,1.496322,750.0,750.0,750.0,663.560797,691.505709,635.615884,3.768092,3.867872,3.668312


In [68]:
weld_stats_df.to_csv('/Users/guillem/Data/Customers/Daimler/references/tool_reference_stats_v2_1.csv', index = False, header=True)

#### Round 2!

In [69]:
# DEFINE QUERY
# execute query to get referential welds data
ref_welds_query_2 = {
  "size" : 1,
  "query" : {
    "bool" : {
      "must" : [
        {
          "terms" : {
            "uniqueID.keyword" : [
                  "h902-070tsb201-kf130.m050g9sub63sps2.1.1",
                  "h902-080tsb101-kf130.m050g9sub63sps2.1.1",
                  "h902-090tsb101-kf130.m050g9sub63sps2.1.1",
                  "h902-100tsb101-kf130.m050g9sub63sps2.1.1",
                  "h902-100tsb201-kf130.m050g9sub63sps2.1.1",
                  "h902-130tsb101-kf130.m050g9sub63sps3.1.1",
                  "h902-140tsb101-kf130.m050g9sub63sps3.1.1",
                  "h902-140tsb201-kf130.m050g9sub63sps3.1.1",
                  "h902-140tsb301-kf130.m050g9sub63sps3.1.1",
                  "h902-140tsb401-kf130.m050g9sub63sps3.1.1"
             ],
            "boost" : 1.0
          }
        },
        {
          "term" : {
            "FaultCode.keyword" : {
              "value" : "0",
              "boost" : 1.0
            }
          }
        }
      ],
      "adjust_pure_negative" : True,
      "boost" : 1.0
    }
  },
  "_source" : False,
  "stored_fields" : "_none_",
  "aggregations" : {
    "groupby" : {
      "composite" : {
        "size" : 10000,
        "sources" : [
          {
            "unique_id_b" : {
              "terms" : {
                "field" : "uniqueID.keyword",
                "missing_bucket" : True,
                "order" : "asc"
              }
            }
          },
          {
            "extension_b" : {
              "terms" : {
                "field" : "MeasurementData.MeasurementParameter.StudID.value.keyword",
                "missing_bucket" : True,
                "order" : "asc"
              }
            }
          },
          {
            "period_b" : {
              "date_histogram" : {
                "field" : "timestamp",
                "missing_bucket" : True,
                "calendar_interval": "month",
                "format": "yyyy-MM"
              }
            }
          }
        ]
      },
      "aggregations" : {
        "voltage_b" : {
          "extended_stats" : {
            "field" : "WeldVoltageActual",
            "sigma" : 2
          }
        },
        "time_b" : {
          "extended_stats" : {
            "field" : "WeldTimeActual",
            "sigma" : 2
          }
        },
        "current_b" : {
          "extended_stats" : {
            "field" : "WeldCurrentActualPositive",
            "sigma" : 2
          }
        },
        "penetration_b" : {
          "extended_stats" : {
            "field" : "LMPenetrationActual",
            "sigma" : 2
          }
        },
        "stickout_b" : {
          "extended_stats" : {
            "field" : "StickoutActual",
            "sigma" : 2
          }
        },
        "drop_time_b" : {
          "extended_stats" : {
            "field" : "DropTimeActual",
            "sigma" : 2
          }
        },
        "lift_b" : {
          "extended_stats" : {
            "field" : "LMLiftHeightActual",
            "sigma" : 2
          }
        },
        "energy_b" : {
          "extended_stats" : {
            "field" : "WeldEnergyActual",
            "sigma" : 2
          }
        }
      }
    }
  }
}

In [70]:
elastic_response = ES.search(index="ml_toolbox_raw_data",
                           body=ref_welds_query_2,
                           request_timeout=REQUEST_TIMEOUT)

In [71]:
weld_stats_df = pd.DataFrame()

for bucket in elastic_response["aggregations"]["groupby"]["buckets"]:
    # gather current bucket key information
    key_tool = bucket["key"]["unique_id_b"]
    key_extension = bucket["key"]["extension_b"]
    key_period = bucket["key"]["period_b"]
    
    # gather current bucket stats information
    num_elements = bucket["doc_count"]
    penetration_ref = bucket["penetration_b"]["avg"]
    penetration_up_limit = bucket["penetration_b"]["std_deviation_bounds"]["upper"]
    penetration_low_limit = bucket["penetration_b"]["std_deviation_bounds"]["lower"]
    drop_time_ref = bucket["drop_time_b"]["avg"]
    drop_time_up_limit = bucket["drop_time_b"]["std_deviation_bounds"]["upper"]
    drop_time_low_limit = bucket["drop_time_b"]["std_deviation_bounds"]["lower"] 
    voltage_ref = bucket["voltage_b"]["avg"]
    voltage_up_limit = bucket["voltage_b"]["std_deviation_bounds"]["upper"]
    voltage_low_limit = bucket["voltage_b"]["std_deviation_bounds"]["lower"]
    time_ref = bucket["time_b"]["avg"]
    time_up_limit = bucket["time_b"]["std_deviation_bounds"]["upper"]
    time_low_limit = bucket["time_b"]["std_deviation_bounds"]["lower"]
    lift_ref = bucket["lift_b"]["avg"]
    lift_up_limit = bucket["lift_b"]["std_deviation_bounds"]["upper"]
    lift_low_limit = bucket["lift_b"]["std_deviation_bounds"]["lower"]
    current_ref = bucket["current_b"]["avg"]
    current_up_limit = bucket["current_b"]["std_deviation_bounds"]["upper"]
    current_low_limit = bucket["current_b"]["std_deviation_bounds"]["lower"]
    energy_ref = bucket["energy_b"]["avg"]
    energy_up_limit = bucket["energy_b"]["std_deviation_bounds"]["upper"]
    energy_low_limit = bucket["energy_b"]["std_deviation_bounds"]["lower"]
    stickout_ref = bucket["stickout_b"]["avg"]
    stickout_up_limit = bucket["stickout_b"]["std_deviation_bounds"]["upper"]
    stickout_low_limit = bucket["stickout_b"]["std_deviation_bounds"]["lower"]

    
    # gather results into a dictionnary to create a dataframe
    current_data_dict = {'key_tool': [key_tool], 'key_extension': [key_extension], 
                         'key_period': [key_period], 'num_elements': [num_elements], 
                         'penetration_ref': [penetration_ref], 'penetration_up_limit': [penetration_up_limit], 'penetration_low_limit': [penetration_low_limit],
                         'drop_time_ref': [drop_time_ref], 'drop_time_up_limit': [drop_time_up_limit], 'drop_time_low_limit': [drop_time_low_limit],
                         'voltage_ref': [voltage_ref], 'voltage_up_limit': [voltage_up_limit], 'voltage_low_limit': [voltage_low_limit],
                         'time_ref': [time_ref], 'time_up_limit': [time_up_limit], 'time_low_limit': [time_low_limit],
                         'lift_ref': [lift_ref], 'lift_up_limit': [lift_up_limit], 'lift_low_limit': [lift_low_limit],
                         'current_ref': [current_ref], 'current_up_limit': [current_up_limit], 'current_low_limit': [current_low_limit],
                         'enery_ref': [energy_ref], 'energy_up_limit': [energy_up_limit], 'energy_low_limit': [energy_low_limit],
                         'stickout_ref': [stickout_ref], 'stickout_up_limit': [stickout_up_limit], 'stickout_low_limit': [stickout_low_limit]}
     
    # create dataframe with current feature set results
    cur_bucket_df = pd.DataFrame(current_data_dict, columns = ['key_tool', 'key_extension', 'key_period', 'num_elements', 
                                                               'penetration_ref', 'penetration_up_limit', 'penetration_low_limit',
                                                               'drop_time_ref', 'drop_time_up_limit', 'drop_time_low_limit',
                                                               'voltage_ref', 'voltage_up_limit', 'voltage_low_limit',
                                                               'time_ref', 'time_up_limit', 'time_low_limit',
                                                               'lift_ref', 'lift_up_limit', 'lift_low_limit',
                                                               'current_ref', 'current_up_limit', 'current_low_limit',
                                                               'enery_ref', 'energy_up_limit', 'energy_low_limit',
                                                               'stickout_ref', 'stickout_up_limit', 'stickout_low_limit'])
    
    weld_stats_df = weld_stats_df.append(cur_bucket_df, ignore_index=True)

In [72]:
print(weld_stats_df.shape)

(5205, 28)


In [73]:
weld_stats_df.to_csv('/Users/guillem/Data/Customers/Daimler/references/tool_reference_stats_v2_2.csv', index = False, header=True)

## 2) Obtain given tools historic welds relevant data

In [112]:
all_welds_list_query = {
  "size" : 1000,
  "query" : {
    "terms" : {
      "uniqueID.keyword" : [
        "h902-130tsb401-kf130.m050g9sub64sps4.1.1",
        "h902-170tsb101-kf130.m050g9sub64sps5.1.1",
        "h902-110tsb201-kf130.m050g9sub64sps3.2.1",
        "h902-060tsb401-kf130.m050g9sub64sps2.2.1",
        "h902-040tsb201-kf130.m050g9sub64sps1.1.1",
        "h902-110tsb101-kf130.m050g9sub64sps3.1.1",
        "h902-140tsb101-kf130.m050g9sub63sps3.1.1",
        "h902-140tsb301-kf130.m050g9sub63sps3.1.1",
        "h902-130tsb101-kf130.m050g9sub63sps3.1.1",
        "h902-090tsb101-kf130.m050g9sub63sps2.1.1",
        "h902-080tsb201-kf130.m050g9sub63sps2.1.1",
        "h902-080tsb101-kf130.m050g9sub63sps2.1.1",
        "h902-070tsb201-kf130.m050g9sub63sps2.1.1"
      ],
      "boost" : 1.0
    }
  },
  "_source" : {
    "includes" : [
      "uniqueID",
      "MeasurementData.MeasurementParameter.StudID.value",
      "WeldVoltageActual",
      "WeldTimeActual",
      "WeldCurrentActualPositive",
      "LMPenetrationActual",
      "StickoutActual",
      "DropTimeActual",
      "LMLiftHeightActual",
      "WeldEnergyActual",
      "FaultCode",
      "timestamp"
    ],
    "excludes" : [ ]
  },
  "sort" : [
    {
      "_doc" : {
        "order" : "asc"
      }
    }
  ]
}

In [113]:
elastic_response_all = ES.search(index="ml_toolbox_raw_data",
                           body=all_welds_list_query,
                           request_timeout=REQUEST_TIMEOUT)

This time we needed to scan over the ElasticSearch max size of 10k, this might take a few minutes... 

In [141]:
from elasticsearch.helpers import scan

all_welds_df = pd.DataFrame()

for elastic_response_all in scan(ES, index="ml_toolbox_raw_data", 
                                  query=all_welds_list_query, 
                                  scroll='10m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    # gather current bucket key information
    tool = elastic_response_all["_source"]["uniqueID"]
    extension = elastic_response_all["_source"]["MeasurementData.MeasurementParameter.StudID.value"]
    timestamp = elastic_response_all["_source"]["timestamp"]
    
    # gather current bucket stats information
    penetration = elastic_response_all["_source"]["LMPenetrationActual"]
    drop_time = elastic_response_all["_source"]["DropTimeActual"] 
    voltage = elastic_response_all["_source"]["WeldVoltageActual"]
    time = elastic_response_all["_source"]["WeldTimeActual"]
    lift = elastic_response_all["_source"]["LMLiftHeightActual"]
    current = elastic_response_all["_source"]["WeldCurrentActualPositive"]
    energy = elastic_response_all["_source"]["WeldEnergyActual"]
    stickout = elastic_response_all["_source"]["StickoutActual"]
    faultcode = elastic_response_all["_source"]["FaultCode"]

    # gather results into a dictionnary to create a dataframe
    current_data_dict = {'tool': [tool], 'extension': [extension], 'timestamp': [timestamp], 
                         'penetration': [penetration], 'drop_time': [drop_time],
                         'voltage': [voltage], 'time': [time], 'lift': [lift], 'current': [current], 
                         'enery': [energy_ref], 'stickout': [stickout], 'faultcode': [faultcode]}
     
    # create dataframe with current feature set results
    cur_iteration_df = pd.DataFrame(current_data_dict, columns = ['tool', 'extension', 'timestamp', 
                        'penetration', 'drop_time', 'voltage', 'time', 'lift', 'current', 
                        'enery', 'stickout', 'faultcode'])
    
    all_welds_df = all_welds_df.append(cur_iteration_df, ignore_index=True)

all_welds_df.to_csv('/home/bigml/guillem/anomaly-test-util/tmp/my_tools_all_welds_.csv', index = False, header=True)

NotFoundError: NotFoundError(404, 'search_phase_execution_exception', 'No search context found for id [78721774]')

In [115]:
all_welds_df = pd.DataFrame()

for bucket in elastic_response_all["hits"]["hits"]:
    # gather current bucket key information
    tool = bucket["_source"]["uniqueID"]
    extension = bucket["_source"]["MeasurementData.MeasurementParameter.StudID.value"]
    timestamp = bucket["_source"]["timestamp"]
    
    # gather current bucket stats information
    penetration = bucket["_source"]["LMPenetrationActual"]
    drop_time = bucket["_source"]["DropTimeActual"] 
    voltage = bucket["_source"]["WeldVoltageActual"]
    time = bucket["_source"]["WeldTimeActual"]
    lift = bucket["_source"]["LMLiftHeightActual"]
    current = bucket["_source"]["WeldCurrentActualPositive"]
    energy = bucket["_source"]["WeldEnergyActual"]
    stickout = bucket["_source"]["StickoutActual"]
    faultcode = bucket["_source"]["FaultCode"]

    # gather results into a dictionnary to create a dataframe
    current_data_dict = {'tool': [tool], 'extension': [extension], 'timestamp': [timestamp], 
                         'penetration': [penetration], 'drop_time': [drop_time],
                         'voltage': [voltage], 'time': [time], 'lift': [lift], 'current': [current], 
                         'enery': [energy_ref], 'stickout': [stickout], 'faultcode': [faultcode]}
     
    # create dataframe with current feature set results
    cur_bucket_df = pd.DataFrame(current_data_dict, columns = ['tool', 'extension', 'timestamp', 
                         'penetration', 'drop_time', 'voltage', 'time', 'lift', 'current', 
                         'enery', 'stickout', 'faultcode'])
    
    all_welds_df = all_welds_df.append(cur_bucket_df, ignore_index=True)

In [116]:
print(all_welds_df.shape)

(1000, 12)


In [77]:
all_welds_df.head()

Unnamed: 0,tool,extension,timestamp,penetration,drop_time,voltage,time,lift,current,enery,stickout,faultcode
0,h902-090tsb101-kf130.m050g9sub63sps2.1.1,610865_213_3_1_1_2_1_2,[2020-01-22 00:39:31],-0.69,8.9,20.3,73.9,2.5,830,1274.75,3.4,0
1,h902-090tsb101-kf130.m050g9sub63sps2.1.1,610583_213_1_1_1_2_1_2,[2020-04-118 12:08:24],-0.69,10.0,19.6,60.0,2.49,860,1274.75,2.98,0
2,h902-080tsb201-kf130.m050g9sub63sps2.1.1,620584_213_1_1_1_1_1_1,[2020-05-136 04:06:51],-0.69,10.0,21.1,75.0,2.5,840,1274.75,2.91,0
3,h902-090tsb101-kf130.m050g9sub63sps2.1.1,610583_213_3_1_1_2_1_2,[2020-06-161 11:00:34],-0.69,10.0,20.1,60.0,2.49,860,1274.75,2.96,0
4,h902-140tsb301-kf130.m050g9sub63sps3.1.1,620142_213_3_1_1_2_1_1,[2020-02-48 16:46:03],-1.15,9.5,26.4,45.5,0.9,840,1274.75,2.86,0


Simple scan use: