Imports

In [11]:
import logging
import ssl
import urllib3
import json

from elasticsearch import Elasticsearch, JSONSerializer
from elasticsearch.connection import create_ssl_context
from elasticsearch.helpers import parallel_bulk
from elasticsearch.helpers import scan
import numpy as np
import pandas as pd

urllib3.disable_warnings() # This is insecure
logger = logging.getLogger(__name__)

Elastic search connection procedures:

In [None]:
def get_elastic_client(server="local", write=False):
    if server == "local":
        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        return Elasticsearch(host="localhost",
                             port=9200,
                             serializer=serializer)
    elif server in {"dev", "horizon", "prod"}:
        # All the other servers are remote hosts with similar configs
        if server == "prod":
            host = "daimler-elastic.vpc.bigml.com"
        elif server == "horizon":
            host = "daimler-elastic.horizon.bigml.com"
        else:
            host = "daimler-elastic.dev.bigml.com"

        if write:
            serializer = NpJSONSerializer()
        else:
            serializer = JSONSerializer()
        # Set up ssl context to disable cert verification
        ssl_context = create_ssl_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        return Elasticsearch(host=host,
                             port=443,
                             http_auth=("dev", "paroafCa"),
                             serializer=serializer,
                             ssl_context=ssl_context,
                             use_ssl=True,
                             request_timeout=50,
                             timeout=50,
                             max_retries=5, 
                             retry_on_timeout=True)
    else:
        logger.warning("unknown server '%s'", server)
        return None

Instanciate Elastic Search connection to dev:

In [18]:
ES = get_elastic_client("dev")
print(ES)

<Elasticsearch([{}])>


## 1) Get Repaired Welds

In [21]:
# DEFINE QUERY
with open('elastic_query_all_repairs.json', "r") as f:
      query_repair_welds = json.load(f)

In [22]:
REQUEST_TIMEOUT = 100.  # Timeout for the elastic queries

In [20]:
repaired_welds_df = pd.DataFrame()

num_batch = 1
welds = []

for current_weld in scan(ES, index="ml_toolbox_raw_data", 
                                  query=query_repair_welds, 
                                  scroll='15m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    if num_batch % 10000 == 0:
      print("Current batch %s" % num_batch)
    
    num_batch = num_batch + 1

    # gather current bucket key information
    welds.append(current_weld["_source"])


## build dataframe from welds list
print("Loop finished, building dataframe ")
print(len(welds))

repaired_welds_df = pd.DataFrame.from_records(welds)

Current batch 10000
Current batch 20000
Loop finished, building dataframe 
22979


## 2) Get successful welds

In [28]:
# DEFINE QUERY
with open('elastic_query_random_welds.json', "r") as f:
      query_random_welds = json.load(f)

with open('elastic_query_random_welds2.json', "r") as f:
      query_random_welds2 = json.load(f)
        
with open('elastic_query_random_welds3.json', "r") as f:
      query_random_welds3 = json.load(f)

In [27]:
num_batch = 1
welds = []

for current_weld in scan(ES, index="ml_toolbox_raw_data", 
                                  query=query_random_welds, 
                                  scroll='15m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    if num_batch % 10000 == 0:
      print("Current batch %s" % num_batch)
    
    num_batch = num_batch + 1

    # gather current bucket key information
    welds.append(current_weld["_source"])

print("Loop finished, building dataframe ")
print(len(welds))

Current batch 10000
Current batch 20000
Current batch 30000
Loop finished, building dataframe 
32783


In [29]:
## append welds list to dataframe
repaired_welds_df = repaired_welds_df.append(pd.DataFrame.from_records(welds), ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [30]:
num_batch = 1
welds = []

for current_weld in scan(ES, index="ml_toolbox_raw_data", 
                                  query=query_random_welds2, 
                                  scroll='15m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    if num_batch % 10000 == 0:
      print("Current batch %s" % num_batch)
    
    num_batch = num_batch + 1

    # gather current bucket key information
    welds.append(current_weld["_source"])

print("Loop finished, building dataframe ")
print(len(welds))

Current batch 10000
Current batch 20000
Current batch 30000
Loop finished, building dataframe 
32805


In [31]:
## append welds list to dataframe
repaired_welds_df = repaired_welds_df.append(pd.DataFrame.from_records(welds), ignore_index=True)

In [32]:
num_batch = 1
welds = []

for current_weld in scan(ES, index="ml_toolbox_raw_data", 
                                  query=query_random_welds3, 
                                  scroll='15m',
                                  raise_on_error=True,
                                  size=5000,
                                  request_timeout=REQUEST_TIMEOUT):

    if num_batch % 10000 == 0:
      print("Current batch %s" % num_batch)
    
    num_batch = num_batch + 1

    # gather current bucket key information
    welds.append(current_weld["_source"])

print("Loop finished, building dataframe ")
print(len(welds))

Current batch 10000
Current batch 20000
Current batch 30000
Loop finished, building dataframe 
33827


In [33]:
## append welds list to dataframe
repaired_welds_df = repaired_welds_df.append(pd.DataFrame.from_records(welds), ignore_index=True)

In [35]:
print(repaired_welds_df.shape)

(122394, 199)


In [37]:
repaired_welds_df.to_csv('/Users/guillem/Data/Customers/Daimler/insights/repairs_insights_dataset.csv', index = False, header=True)