In [2]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

ES_PAGE_SIZE = 10000

START_DATE = "2022-06-06T10:00:00Z"
END_DATE = "2022-06-13T10:05:00Z"

name = '20220606_20220613.csv'


In [2]:
def recursive_items(dictionary):
    """Unnest dictionary key-value pairs.

    Args:
        dictionary (dict): Dictionary to be evaluated.

    Yields:
        key, value pair
    """    
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from recursive_items(value)
        else:
            yield (key, value)

def list_from_generator(generator):
    """Returns a list containing the generator keys 

    Args:
        generator (generator): Generator from recursive listing

    Returns:
        list: List containing all the nested keys
    """    
    list = []
    for key, value in recursive_items(generator):
        list.append(key)
    return list


es = Elasticsearch(
    cloud_id='netlex:dXMtZWFzdC0xLmF3cy5mb3VuZC5pbyRjZGM5MjVmZDU1MWY0YTY2YjgxNTYyYThkZGE3YTU5MiQzMWI3ZDM3MGUzY2U0OWU5OGVhOGRkNGUxZDJkNjYzOQ==',
    api_key=("wSeQRIEBi9CLWBWhXrYS", "GKhBFW2eQ66Fjd2ZMbmSGg"),
)

result = scan(
    es,
    index='filebeat-netlex-*',
    query={
        "query": {
            "bool": {
                "filter": [
                    {
                        "range": {
                            "@timestamp": {
                                "format": "strict_date_optional_time",
                                "gte": START_DATE,
                                "lte": END_DATE,
                            }
                        }
                    }
                ]
            }
        }
    },
    _source=[
        "@timestamp",
        "message",
        "netlex.company_subdomain",
        "netlex.user_login"
    ],
    size=ES_PAGE_SIZE
)

count = 0
reqs = []

for r in result:
    if 'user_login' not in list_from_generator(r):
        continue
    elif 'company_subdomain' not in list_from_generator(r):
        continue
    req = {
        '@timestamp': r['_source']['@timestamp'],
        'message': r['_source']['message'],
        'netlex.company_subdomain': r['_source']['netlex']['company_subdomain'],
        'netlex.user_login': r['_source']['netlex']['user_login']
    }
    if count % ES_PAGE_SIZE == 0:
        print(count)
    count = count+1
    reqs.append(req)

df = pd.DataFrame(reqs)
df.columns = ['date', 'message', 'subdomain', 'login']
df['message'] = df.message.str.split(']', expand=True).iloc[:, 2]
# df.to_csv(name, index=False)
df.head()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000


In [11]:
df = pd.read_csv(name)

REG_EMAIL = '([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
REG_ID = '[0-9]+'
REG_MESSAGES = '\{(.*?)\}'
REG_KEY = 'key=(.*?)\;context=(.*?)\;method=(.*?)\;runtime=(.*?)'
REG_SIG_ID = '\w+-\w+-\w+-\w+-\w+'
REG_SPREADSHEET = '.+(\.ods)'


df['message'] = df['message'].str.replace('production.INFO: ', '', regex=False)
df['message'] = df['message'].str.strip()
df['message'] = df['message'].str.replace(REG_EMAIL, 'EMAIL', regex=True)
df['message'] = df['message'].str.replace(REG_ID,'ID', regex=True)
df['message'] = df['message'].str.replace(REG_MESSAGES,'MESSAGE', regex=True)
df['message'] = df['message'].str.replace(REG_KEY, 'KEY', regex=True)
df['message'] = df['message'].str.replace(REG_SIG_ID, 'SIGNATURE_ID', regex=True)
df['message'] = df['message'].str.replace(REG_SPREADSHEET, 'SPREADSHEET', regex=True)

mask_rename_document = df['message'].str.contains('document rename ID')
df.loc[mask_rename_document, 'message'] = df.loc[mask_rename_document, 'message'].str.split('-', expand=True)[0] + 'RENAMED_OLD_NEW'
mask_change_owner_document = df['message'].str.contains('change owner ID')
df.loc[mask_change_owner_document, 'message'] = df.loc[mask_change_owner_document, 'message'].str.split('-', expand=True)[0] + 'OWNER_OLD_NEW'
mask_change_owners_document = df['message'].str.contains('Change owner multiple documents')
df.loc[mask_change_owners_document, 'message'] = df.loc[mask_change_owners_document, 'message'].str.split(':', expand=True)[0] + ' NEW_OWNERS'
mask_pdf_error = df['message'].str.contains('Error in PDFConverter in document_id')
df.loc[mask_pdf_error, 'message'] = df.loc[mask_pdf_error, 'message'].str.split(':', expand=True)[0] + ' ERROR_MESSAGE'
mask_signature_request = df['message'].str.contains('signature request sent')
df.loc[mask_signature_request, 'message'] = 'signature request sent ID'
mask_conversion_error = df['message'].str.contains('PDF converter - conversion error')
df.loc[mask_conversion_error, 'message'] = 'PDF converter - conversion error: ERROR_MESSAGE'
mask_ldap_login_attempt = df['message'].str.contains('LDAP login attempt - login')
df.loc[mask_ldap_login_attempt, 'message'] = 'LDAP login attempt - login: LDAP_USER_LOGIN'
mask_login_attempt = df['message'].str.contains('Login attempt with Credentials - login')
df.loc[mask_login_attempt, 'message'] = 'Login attempt with credentials - login: LDAP_USER_LOGIN'
mask_token_error = df['message'].str.contains('signature adobe sign refresh token error Client error')
df.loc[mask_token_error, 'message'] = 'signature adobe sign refresh token error Client error: CLIENT_TOKEN_ERROR_MESSAGE'

df = df.loc[~df['message'].str.contains('production.ERROR')]


In [13]:
sorted(df['message'].unique())

['Change owner multiple documents, ids NEW_OWNERS',
 'Disassociated workflowId "ID" from templateId "ID"',
 'Error in PDFConverter in document_id ERROR_MESSAGE',
 'Invalid credentials',
 'KEYID.ID',
 'LDAP Search failed: ldap_search(): Search: Operations error',
 'LDAP bind failed',
 'LDAP bind failed: ldap_bind(): Unable to bind to server: Invalid credentials',
 'LDAP bind successful',
 'LDAP binding for auth',
 'LDAP binding for search',
 'LDAP login attempt - login: LDAP_USER_LOGIN',
 'LDAP user not found',
 'LDAP user not found in directory',
 'LDAP using fallback bind',
 'LDAP using fallback bind with credentials',
 'Login attempt with SAML - requested',
 'Login attempt with credentials - login: LDAP_USER_LOGIN',
 'NULL returned',
 'PDF converter - already exists',
 'PDF converter - conversion error: ERROR_MESSAGE',
 'PDF converter - converted with cloudconvert',
 'PDF converter - converted with soffice',
 'PDF converter - pdfunite error: [',
 'Questionnaire Configs not mapped bet