In [None]:
import json
import sys
from io import StringIO
from uuid import uuid4

import ipywidgets as widgets
import pandas as pd
from IPython.core.display import Markdown
from IPython.display import display

import rafal

# Config loading: edit the values in the config.json file, not in this cell.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
user = config['Rafal.user']
url = config['Rafal.url']

# When the config provides a password, the widget is disabled and only shows a
# placeholder; otherwise the user is expected to type the password in it.
pwd_exists = bool(config.get('Rafal.password'))
wPwd = widgets.Password(value='',
                        placeholder='from config' if pwd_exists else 'Enter password',
                        disabled=pwd_exists)

display(widgets.HBox([widgets.Label('Password for Rafal API :'), wPwd]))

# Proxy parameters: None unless at least one of the http/https entries is set.
# A missing or empty 'proxies' section (or a missing http/https key) is treated
# as "no proxy" instead of raising KeyError.
proxy_config = config.get('proxies') or {}
proxies = proxy_config if proxy_config.get('http') or proxy_config.get('https') else None

In [None]:
# Get the password: the configured one when present, otherwise the widget value.
pwd = config['Rafal.password'] if pwd_exists else wPwd.value
if not pwd:
    # Stop the cell here instead of attempting a connection without a password;
    # fill in the password widget, then re-run this cell.
    raise ValueError('Please write your password first !')

# Open a new rafal session and connect
session = rafal.Session(verbose=True)
session.connect(url=url, login=user, pwd=pwd, proxies=proxies)

# Python and pandas versions
print(f'Python version: {sys.version}')
print(f'Pandas version: {pd.__version__}')

# Markdown link to the Swagger UI of the connected server (displayed in a later cell)
swagger_link = Markdown(f"[Swagger on {session.url}]({session.url}/docs/swagger-ui/index.html?url=/assets/swagger.json#/)")

# List all modules
result = session.request(endPoint='/modules', http='GET', verbose=True)
print('Modules :\n', result)

In [None]:
# Display the value returned by the '/modules' request
result

In [None]:
# Render the Markdown link to the Swagger UI built in the connection cell
swagger_link

## Monitoring Module

### using monitoring_flat

In [None]:
# Module name interpolated into the '/module/{module}/query' endpoint of the requests below
module = "monitoring_flat"

## Rafal logs

Table description

* **dateTime**: a timestamp of when the logging event was performed
* **level**: the level of the logging event
* **message**: the log message
* **threadName**: the name of the thread that generated the logging event
* **className**: the name of the class that generated the logging event
* **context keys**: array of keys for the logging context (e.g. “server-id,user,id”)
* **context value**: array of values for the logging context (e.g. “69c0ace2-f5c7-4368-bd18-12e8fec588a1, guillaume, 2”)
* **stack trace**: location information for errors logged by the log system


### requesting using *rafal-logging* pivot

Use the following Rafal request content :

In [None]:
# Unique id sent as "query_id"; kept in my_uuid1 so that this request can be
# looked up in the endpoint records further down.
my_uuid1 = str(uuid4())

# Dimensions requested from the "rafal-logging" pivot
logging_dimensions = [
    "dateTime", "className", "message", "level",
    "stackTrace", "threadName", "context_key", "context_value",
]

# Request body for the rafal-logging pivot (no filter: the "@and" list is empty)
request_content = dict(
    metric=None,
    postaggregaggr=None,
    with_facet=True,
    filter_on_query=None,
    batch=False,
    batch_description=None,
    currency=None,
    pivot="rafal-logging",
    by=logging_dimensions,
    where={"@and": []},
    versioning=[],
    query_id=my_uuid1,
)

In [None]:
# POST the rafal-logging request to the module's query endpoint,
# asking for the 'json' output format, then show the response length.
logs_query_endpoint = f'/module/{module}/query'
resp = session.request(
    endPoint=logs_query_endpoint,
    http='POST',
    jsonText=request_content,
    output='json',
)
len(resp)

In [None]:
# Preview the first 2000 items of the response
# (characters, presumably: resp looks like JSON text — TODO confirm the type returned by session.request)
resp[:2000]

In [None]:
# Parse the line-delimited JSON response into a DataFrame.
# The text is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated in recent pandas releases.
# NOTE(review): assumes resp is a str — confirm against session.request(output='json').
df = pd.read_json(StringIO(resp), orient='records', lines=True)
df.head()

In [None]:
# Reshape the DataFrame: one row per log event (indexed by dateTime),
# one column per context key, holding the matching context value.
cols = ['className', 'level', 'message', 'stackTrace', 'threadName']
df2 = (
    pd.pivot_table(
        df,
        index=['dateTime'] + cols,
        columns=['context_key'],
        values='context_value',
        # String alias instead of the builtin `sum`: passing the builtin is
        # deprecated in recent pandas and would stop mapping to the groupby sum.
        aggfunc='sum',
        fill_value='',
    )
    .reset_index(cols)
    .sort_index()
)
print(f"{len(df2)} records")
df2

## Endpoint records

Table description

* **accessTime**: time and date at which the endpoint was called
* **accessDate**: date at which the endpoint was called
* **finishTime**: time and date at which the endpoint answered.
* **serverInstance**: server UUID.
* **serverHostname**: server IP and port.
* **ClientHostName**: client host name
* **requestId**: per-run request unique id.
* **requestBodyMimetype**: the type of the data (ie: application/json, text/csv, etc.)
* **requestMethod**: request method performed (GET, POST, DELETE,...)
* **requestPath**: the request endpoint used
* **requestBody**: request body provided (if sane to store)
* **userName**: name of the user that performed the call to the endpoint.
* **endpointDuration**: amount of time in milliseconds it took the endpoint to respond.
* **resultBodyMimetype**: the type of the result (ie: application/json)
* **resultCode**: code returned by the endpoint.
* **queryIds**: vector that contains the id of every clickhouse query executed by the endpoint. Ability to cross reference that with the query log table to get additional information on the queries the endpoint ran.
* **errorJson**: in case of error, the Json returned.
* **errorId**: if any, the error ID
* **errorType**: if any, the error type
* **errorMessage**: if any, the error message

### requesting using *endpoint_records* pivot

### Please wait 20 seconds before running this request, so that the system has time to log the first request sent by this notebook

In [None]:
import time

# Pause before querying the endpoint records, so that the first monitoring
# request of this notebook has time to be logged.
LOG_FLUSH_DELAY_SECONDS = 20
time.sleep(LOG_FLUSH_DELAY_SECONDS)

Use the following Rafal request content :

In [None]:
# Unique id sent as "query_id" of this request
my_uuid = str(uuid4())

# Dimensions requested from the "endpoint_records" pivot
endpoint_dimensions = [
    "access_time", "access_date",
    "server_id", "server_hostname", "client_hostname",
    "username",
    "request_id", "request_body_mimetype", "request_method", "request_path", "request_body",
    "result_body_mimetype", "result_code",
    "error_json", "error_id", "error_type", "error_message",
    "finish_time",
]

# Only keep the request paths matching this pattern
path_filter = {"request_path": {"match": "^/[^dlp]"}}

request_content = {
    "metrics": ["count"],
    "with_facet": False,
    "pivot": "endpoint_records",
    "by": endpoint_dimensions,
    "where": path_filter,
    "versioning": [],
    "query_id": my_uuid,
}

In [None]:
# Display the request body that is about to be sent
request_content

In [None]:
# Send the endpoint_records request; the response is requested in the 'Dataframe' output format
df0 = session.request(
    endPoint=f'/module/{module}/query',
    http='POST',
    jsonText=request_content,
    output='Dataframe',
)
df0.head()

In [None]:
# Index the endpoint records by access_time and sort them on that index
records_by_access_time = df0.set_index('access_time')
df = records_by_access_time.sort_index()
df

In [None]:
# Number of records per result_code (rows) and per user (columns)
counts_per_user_and_code = df.groupby(['username', 'result_code'])['request_id'].count()
counts_per_user_and_code.unstack('username')

In [None]:
# Number of records per endpoint path (rows) and per user (columns)
counts_per_user_and_path = df.groupby(['username', 'request_path'])['request_id'].count()
counts_per_user_and_path.unstack('username')

In [None]:
# Keep only the requests whose error_id is not the empty string
has_error_id = df['error_id'].ne('')
df.loc[has_error_id]

In [None]:
# Select all "pivot" requests.
# na=False: a missing request body counts as "no match" instead of producing a
# NaN mask that boolean indexing rejects; regex=False: '"pivot"' is a literal.
df[df.request_body.str.contains('"pivot"', regex=False, na=False)]

In [None]:
# Select this notebook's first monitoring request (its body contains my_uuid1).
# Please wait 20 sec before capturing your request, or else it may not be
# available in monitoring records.
# na=False: a missing request body counts as "no match" instead of producing a
# NaN mask that boolean indexing rejects; regex=False: the uuid is matched literally.
row = df[df.request_body.str.contains(my_uuid1, regex=False, na=False)]
row.T

In [None]:
# Raw request body value(s) of the selected row(s), as an array
row.request_body.values

In [None]:
# Remember the request_id of the first matching record (None when nothing
# matched) and print its request body decoded from JSON.
if row.empty:
    my_requestId = None
else:
    my_requestId = row['request_id'].iloc[0]
    print(f"print request_body for request_id = {my_requestId} :")
    raw_request_body = row['request_body'].iloc[0]
    print(json.loads(raw_request_body))

## endpoint_database_queries: Capturing all the queries created from a given request

Table *endpoint_query_inspection(_flat)* description

* **accessTime**: time and date at which the endpoint was called
* **accessDate**: date at which the endpoint was called
* **finishTime**: time and date at which the endpoint answered.
* **serverInstance**: server UUID.
* **serverHostname**: server IP and port.
* **ClientHostName**: client host name
* **requestId**: per-run request unique id.
* **requestBodyMimetype**: the type of the data (ie: application/json, text/csv, etc.)
* **requestMethod**: request method performed (GET, POST, DELETE,...)
* **requestPath**: the request endpoint used
* **requestBody**: request body provided (if sane to store)
* **userName**: name of the user that performed the call to the endpoint.
* **endpointDuration**: amount of time in milliseconds it took the endpoint to respond.
* **resultBodyMimetype**: the type of the result (ie: application/json)
* **resultCode**: code returned by the endpoint.
* **queryIds**: vector that contains the id of every clickhouse query executed by the endpoint. Ability to cross reference that with the query log table to get additional information on the queries the endpoint ran.
* **errorJson**: in case of error, the Json returned.
* **errorId**: if any, the error ID
* **errorType**: if any, the error type
* **errorMessage**: if any, the error message

## Capturing all the queries created by a specific request

Table description

The *endpoint_query_inspection* table registers two kinds of queries:
* *Initial* queries that were run directly by the client.
* *Child* queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns.

All these queries belong to a *record_endpoint* identified by a `request_id`

### Dimensions
* type (Enum8) — Type of event that occurred when executing the query. Values:
  * 'QueryStart' = 1 — Successful start of query execution.
  * 'QueryFinish' = 2 — Successful end of query execution.
  * 'ExceptionBeforeStart' = 3 — Exception before the start of query execution.
  * 'ExceptionWhileProcessing' = 4 — Exception during the query execution.
* event_date (Date) — Query starting date.
* event_time (DateTime) — Query starting time.
* query_start_time (DateTime) — Start time of query execution.
 
* query (String) — Query string.
* exception (String) — Exception message.
* stack_trace (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully.
* is_initial_query (UInt8) — Query type. Possible values:
  - 1 — Query was initiated by the client.
  - 0 — Query was initiated by another query for distributed query execution.
* user (String) — Name of the user who initiated the current query.
* query_id (String) — ID of the query.
* address (IPv6) — IP address that was used to make the query.
* port (UInt16) — The client port that was used to make the query.
* initial_user (String) — Name of the user who ran the initial query (for distributed query execution).
* initial_query_id (String) — ID of the initial query (for distributed query execution).
* initial_address (IPv6) — IP address that the parent query was launched from.
* initial_port (UInt16) — The client port that was used to make the parent query.
* interface (UInt8) — Interface that the query was initiated from. Possible values:
  * 1 — TCP.
  * 2 — HTTP.
* os_user (String) — OS’s username who runs clickhouse-client.
* client_hostname (String) — Hostname of the client machine where the clickhouse-client or another TCP client is run.
* client_name (String) — The clickhouse-client or another TCP client name.
* client_revision (UInt32) — Revision of the clickhouse-client or another TCP client.
* client_version_major (UInt32) — Major version of the clickhouse-client or another TCP client.
* client_version_minor (UInt32) — Minor version of the clickhouse-client or another TCP client.
* client_version_patch (UInt32) — Patch component of the clickhouse-client or another TCP client version.
* http_method (UInt8) — HTTP method that initiated the query. Possible values:
  * 0 — The query was launched from the TCP interface.
  * 1 — GET method was used.
  * 2 — POST method was used.
* http_user_agent (String) — The UserAgent header passed in the HTTP request.
* quota_key (String) — The “quota key”
* revision (UInt32) — ClickHouse revision.
* thread_numbers (Array(UInt32)) — Number of threads that are participating in query execution.
* ProfileEvents.Names (Array(String)) — Counters that measure different metrics. The description of them could be found in the table system.events
* ProfileEvents.Values (Array(UInt64)) — Values of metrics that are listed in the ProfileEvents.Names column.
* Settings.Names (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the log_query_settings parameter to 1.
* Settings.Values (Array(String)) — Values of settings that are listed in the Settings.Names column.
 
### Facts

* query_duration_ms (UInt64) — Duration of query execution.
* read_rows (UInt64) — Number of read rows.
* read_bytes (UInt64) — Number of read bytes.
* written_rows (UInt64) — For INSERT queries, the number of written rows. For other queries, the column value is 0.
* written_bytes (UInt64) — For INSERT queries, the number of written bytes. For other queries, the column value is 0.
* result_rows (UInt64) — Number of rows in the result.
* result_bytes (UInt64) — Number of bytes in the result.
* memory_usage (UInt64) — Memory consumption by the query.

### requesting using *endpoint_database_queries* pivot

Use the following Rafal request content :

In [None]:
# Unique id sent as "query_id" of this request
my_uuid3= str(uuid4())
# Request body for the "endpoint_database_queries" pivot
request_content= {
    # Metrics requested for each combination of the "by" dimensions
    "metrics": ["count", "sumQueryReadBytes", "sumQueryReadRows", "sumQueryWrittenBytes", "sumQueryWrittenRows", 
                "sumQueryDuration", "sumQueryMemoryUsage", "sumQueryResultBytes", "sumQueryResultRows"],
    "with_facet": False,
    "postaggregaggr": {},
    "pivot": "endpoint_database_queries",
    # Dimensions: endpoint request fields and query fields
    "by": [
        "access_time", 
        "request_id", 
        "server_id", 
        "is_complete", 
        "server_hostname", 
        "client_hostname", 
        "username", 
        "request_method", 
        "request_path", 
        "request_body", 
        "request_body_mimetype", 
        "result_body_mimetype", 
        "result_code", 
        "endpoint_duration_ms", 
        "error_json", 
        "query_type", 
        "query_context", 
        "query_start_time", 
        "query", 
        "query_exception", 
        "query_stack_trace", 
        "is_initial_query", 
        "query_id", 
        "query_address", 
        "query_port", 
        "initial_query_id", 
        "initial_query_address", 
        "access_date", 
        "finish_time", 
        "error_id", 
        "error_type", 
        "error_message", 
        "query_start_date", 
        "query_finish_time", 
        "initial_query_port", 
        "query_client_hostname", 
        "query_client_name", 
        "query_client_revision", 
        "query_client_version_major", 
        "query_client_version_minor", 
        "query_client_version_patch", 
        "query_http_method", 
        "query_http_user_agent", 
        "query_quota_key", 
        "query_revision"
    ],
    # Filters listed under "@and": a request_path pattern and result_code "200"
    # (the exact operator semantics are defined by the Rafal API — not visible here)
    "where": {
        "@and": [
            {
                "request_path": {"match": '^/[^dlp]'}
            },
            {
                "result_code": {"Values": ["200"]}
            }
        ]
    },
    "versioning": [],
    "query_id": my_uuid3
}

In [None]:
# Send the endpoint_database_queries request (non-verbose);
# the response is requested in the 'Dataframe' output format
queries_endpoint = f'/module/{module}/query'
df0 = session.request(
    endPoint=queries_endpoint,
    http='POST',
    jsonText=request_content,
    output='Dataframe',
    verbose=False,
)

In [None]:
# Display the query records returned by the previous request
df0

In [None]:
# Index every query record by (request_id, query_id) and sort on that index
query_index_levels = ['request_id', 'query_id']
df = df0.set_index(query_index_levels).sort_index()
df

In [None]:
# Suffixes of the 'sumQuery<suffix>' metric columns to total for each request
fields = [
    'ReadRows', 'ReadBytes',
    'WrittenRows', 'WrittenBytes',
    'ResultRows', 'ResultBytes',
    'MemoryUsage', 'Duration',
]

# One named aggregation per metric: output column <suffix> = sum of 'sumQuery<suffix>'
fieldsToSum = {suffix: pd.NamedAgg('sumQuery' + suffix, 'sum') for suffix in fields}

# Statistics from queries grouped by Rafal API request
named_aggregations = {
    'endpoint_duration_ms': pd.NamedAgg('endpoint_duration_ms', 'first'),
    'queries_count': pd.NamedAgg('query_address', 'count'),
    **fieldsToSum,
}
stats = df.groupby(['request_id', 'request_path']).agg(**named_aggregations)
stats

### From a specific request, print all related Rafal queries

In [None]:
# Request id captured from the endpoint records (None when no record matched)
my_requestId

In [None]:
# List the queries of the selected request (query order is random !!)
if my_requestId:
    print(f"request id = {my_requestId} : \n")
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    # The index label (the query_id level) is not needed for the listing.
    for i, (_query_id, query) in enumerate(df.loc[my_requestId, 'query'].items()):
        print(f"({i}) {query}\n")

In [None]:
# Show the stats of the selected request, transposed for readability.
# Nothing is shown when no request was selected (my_requestId is None/empty),
# mirroring the guard used when listing the queries.
stats.loc[my_requestId].T if my_requestId else None

### Look for a specific request

Here we want to search for all requests sent by user `guillaume` to the endpoint `'/tables/endpoint_query_inspection/schema'`
and print the latest one.

In [None]:
# Requests sent by user 'guillaume' to the schema endpoint, oldest first
is_schema_call = df['request_path'] == '/tables/endpoint_query_inspection/schema'
is_from_user = df['username'] == 'guillaume'
df1 = df[is_schema_call & is_from_user].sort_values('access_time')
df1

In [None]:
# Print the query of the latest matching request, then display the full record.
# An empty selection is reported instead of raising IndexError on iloc[-1].
if df1.empty:
    last_request = None
    print('No matching request found')
else:
    last_request = df1.iloc[-1]
    # Item access rather than attribute access: 'query' is also the name of a
    # DataFrame method, so `.query` is easy to misread and fragile.
    print(last_request['query'])
last_request