Imports

In [1]:
import logging
import ssl
import urllib3

from elasticsearch import Elasticsearch, JSONSerializer
from elasticsearch.connection import create_ssl_context
from elasticsearch.helpers import parallel_bulk
from elasticsearch.helpers import scan
import numpy as np
import pandas as pd

urllib3.disable_warnings() # This is insecure
logger = logging.getLogger(__name__)

Elastic search connection procedures:

In [2]:
def get_elastic_client(server="local", write=False):
    if server == "local":
        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        return Elasticsearch(host="localhost",
                             port=9200,
                             serializer=serializer)
    elif server in {"dev", "horizon", "prod"}:
        # All the other servers are remote hosts with similar configs
        if server == "prod":
            host = "daimler-elastic.vpc.bigml.com"
        elif server == "horizon":
            host = "daimler-elastic.horizon.bigml.com"
        else:
            host = "daimler-elastic.dev.bigml.com"

        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        # Set up ssl context to disable cert verification
        ssl_context = create_ssl_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        return Elasticsearch(host=host,
                             port=443,
                             http_auth=("dev", "paroafCa"),
                             serializer=serializer,
                             ssl_context=ssl_context,
                             use_ssl=True,
                             request_timeout=50,
                             timeout=50,
                             max_retries=5, 
                             retry_on_timeout=True)
    else:
        logger.warning("unknown server '%s'", server)
        return None

Instanciate Elastic Search connection to dev:

In [3]:
ES = get_elastic_client("dev")
print(ES)

<Elasticsearch([{}])>


## 1) Query

In [29]:
get_tool_info = {
  "size" : 10000,
  "query" : {
    "range" : {
      "timestamp" : {
        "from" : "2020-11-02",
        "to" : "2020-11-04",
        "include_lower" : True,
        "include_upper" : True,
        "boost" : 1.0
      }
    }
  },
  "_source" : {
    "includes" : [
      "location",
      "series",
      "line",
      "plant",
      "station",
      "robot",
      "controller",
      "tool",
      "MeasurementData.MeasurementParameter.StudID.value",
      "uniqueID",
      "MeasurementData.MeasurementParameter.Devicetype.value",
      "studType",
      "studSize",
      "toolType",
      "DeviceGeneration"
    ],
    "excludes" : [ ]
  },
  "sort" : [
    {
      "_doc" : {
        "order" : "asc"
      }
    }
  ]
}

In [30]:
REQUEST_TIMEOUT = 100.  # Timeout for the elastic queries

In [32]:
elastic_response = ES.search(index="ml_toolbox_raw_data",
                           body=get_tool_info,
                           request_timeout=REQUEST_TIMEOUT)

In [24]:
print(elastic_response)

{'took': 2356, 'timed_out': False, '_shards': {'total': 26, 'successful': 26, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': None, 'hits': [{'_index': 'raw-te1500-2020.11', '_type': '_doc', '_id': '79a986f7de9dc07f5928e56539fc2537d45dd007', '_score': None, '_source': {'MeasurementData.MeasurementParameter.StudID.value': '610455_213_3_1_1_2_1_2', 'studType': '6S17', 'controller': '101', 'line': 'Z1', 'toolType': 'TH510', 'tool': '1.1', 'DeviceGeneration': 'TE', 'robot': '100', 'MeasurementData.MeasurementParameter.Devicetype.value': 'TE1500', 'series': '213', 'plant': 'UB64', 'studSize': 'M6x21_MAT', 'station': '140', 'location': '050', 'uniqueID': 'h902-140tsb101-kf130.m050g9sub64sps4.1.1'}, 'sort': [221]}, {'_index': 'raw-te1500-2020.11', '_type': '_doc', '_id': '56a2b6c4497d582488d064cc05b32d7eb00b2252', '_score': None, '_source': {'MeasurementData.MeasurementParameter.StudID.value': '620508_213_3_1_1_1_1_2', 'studType': '6S17', 'contr

In [38]:
weld_stats_df = pd.DataFrame()

for bucket in elastic_response["hits"]["hits"]:
    # gather current bucket key information
    uniqueid = bucket["_source"]["uniqueID"]
    location = bucket["_source"]["location"]
    series = bucket["_source"]["series"]
    line = bucket["_source"]["line"]
    plant = bucket["_source"]["plant"]
    station = bucket["_source"]["station"]
    robot = bucket["_source"]["robot"]
    controller = bucket["_source"]["controller"]
    tool = bucket["_source"]["tool"]
    toolType = bucket["_source"]["toolType"]
    studType = bucket["_source"]["studType"]
    studSize = bucket["_source"]["studSize"]
    DeviceGeneration = bucket["_source"]["DeviceGeneration"]
    Devicetype = bucket["_source"]["MeasurementData.MeasurementParameter.Devicetype.value"]
    studid = bucket["_source"]["MeasurementData.MeasurementParameter.StudID.value"]
    
    
    current_data_dict = {"uniqueid" : [uniqueid],
                         "studid" : [studid],
                         "location" : [location],
                         "series" : [series],
                         "line": [line],
                         "plant" : [plant],
                         "station" : [station],
                         "robot" : [robot],
                         "controller" : [controller],
                         "tool" : [tool],
                         "toolType" : [toolType],
                         "studType" : [studType],
                         "studSize" : [studSize],
                         "deviceGeneration" : [DeviceGeneration],
                         "deviceType" : [Devicetype]}
    
    
    
    
    # create dataframe with current feature set results
    cur_bucket_df = pd.DataFrame(current_data_dict, columns = ["uniqueid",
                                                               "studid",
                                                               "location",
                                                               "series",
                                                               "line",
                                                               "plant",
                                                               "station",
                                                               "robot",
                                                               "controller",
                                                               "tool",
                                                               "toolType",
                                                               "studType",
                                                               "studSize",
                                                               "deviceGeneration",
                                                               "deviceType"])
    
    weld_stats_df = weld_stats_df.append(cur_bucket_df, ignore_index=True)

In [41]:
print(weld_stats_df.shape)

(10000, 15)


In [42]:
weld_stats_df.head()

Unnamed: 0,uniqueid,studid,location,series,line,plant,station,robot,controller,tool,toolType,studType,studSize,deviceGeneration,deviceType
0,h902-140tsb101-kf130.m050g9sub64sps4.1.1,610455_213_3_1_1_2_1_2,50,213,Z1,UB64,140,100,101,1.1,TH510,6S17,M6x21_MAT,TE,TE1500
1,h902-140tsb101-kf130.m050g9sub64sps4.1.1,620508_213_3_1_1_1_1_2,50,213,Z1,UB64,140,100,101,1.1,TH510,6S17,M6x21_MAT,TE,TE1500
2,h902-140tsb101-kf130.m050g9sub64sps4.1.1,620606_213_1_1_1_1_1_2,50,213,Z1,UB64,140,100,101,1.1,TH510,6S17,M6x21_MAT,TE,TE1500
3,h902-140tsb101-kf130.m050g9sub64sps4.1.1,610457_213_1_1_1_1_1_2,50,213,Z1,UB64,140,100,101,1.1,TH510,6S17,M6x21_MAT,TE,TE1500
4,h902-140tsb101-kf130.m050g9sub64sps4.1.1,610953_213_1_1_1_1_1_2,50,213,Z1,UB64,140,100,101,1.1,TH510,6S17,M6x21_MAT,TE,TE1500


In [43]:
weld_stats_df.to_csv('/Users/guillem/Data/Customers/Daimler/anomalies-analysis/215_tests/all_tool_welds.csv', index = False, header=True)

# round2!

In [44]:
num_batch = 1
welds = []

for current_weld in scan(ES, index="ml_toolbox_raw_data", 
                                  query=get_tool_info, 
                                  scroll='15m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    if num_batch % 10000 == 0:
      print("Current batch %s" % num_batch)
    
    num_batch = num_batch + 1

    # gather current bucket key information
    welds.append(current_weld["_source"])

print("Loop finished, building dataframe ")
print(len(welds))

Current batch 10000
Current batch 20000
Current batch 30000
Current batch 40000
Current batch 50000
Current batch 60000
Current batch 70000
Current batch 80000
Current batch 90000
Current batch 100000
Current batch 110000
Current batch 120000
Current batch 130000
Current batch 140000
Current batch 150000
Current batch 160000
Current batch 170000
Current batch 180000
Current batch 190000
Current batch 200000
Current batch 210000
Current batch 220000
Current batch 230000
Current batch 240000
Current batch 250000
Current batch 260000
Current batch 270000
Current batch 280000
Current batch 290000
Current batch 300000
Current batch 310000
Current batch 320000
Current batch 330000
Current batch 340000
Current batch 350000
Current batch 360000
Current batch 370000
Current batch 380000
Current batch 390000
Current batch 400000
Current batch 410000
Current batch 420000
Current batch 430000
Current batch 440000
Current batch 450000
Current batch 460000
Current batch 470000
Current batch 480000
C

In [46]:
## add welds list to dataframe
tool_info_welds_df = pd.DataFrame.from_records(welds)

In [47]:
tool_info_welds_df.to_csv('/Users/guillem/Data/Customers/Daimler/anomalies-analysis/215_tests/all_tool_welds.csv', index = False, header=True)