In [1]:
import os
import pandas as pd

Loading data

In [2]:
# Read the CSV export into a DataFrame; the path is relative to the working directory.
PROBLEMATICS_CSV = "actives_and_problematics_regarding_time_last_30_days_from-27-07-2022.csv"
problematics_regarding_time = pd.read_csv(PROBLEMATICS_CSV)

In [3]:
# The export's own "ReportId" column is discarded and "RecordId" takes its place
# (record id = report id derived).
# Guarded on "RecordId" so that re-running this cell is a no-op: without the guard a
# second run would drop the freshly renamed "ReportId" column and then rename nothing,
# silently losing the report id.
if "RecordId" in problematics_regarding_time.columns:
    problematics_regarding_time = (
        problematics_regarding_time
        .drop(columns="ReportId")
        .rename(columns={"RecordId": "ReportId"})
    )

In [4]:
# Aggregation recipes shared by several columns. They are tuples so they cannot be
# edited by accident; every entry below receives its own list copy.
_NUMERIC_STATS = ("min", "max", "median", "mean", "std")
_FIRST_UNIQUE = ("first", "unique")
_FIRST_UNIQUE_COUNT = ("first", "unique", "count")

# Column name -> aggregation(s) to compute for it.
# Insertion order matters: it is the order in which the columns are selected
# and aggregated further down.
interest_cols = {
    "Name": "first",
    "ReportId": "first",
    "Description": "first",
    "DurationInMinutes": [*_NUMERIC_STATS, "count"],
    "RowsProcessed": ["sum", *_NUMERIC_STATS],
    "RowsReturned": list(_FIRST_UNIQUE_COUNT),
    "ConnectionType": list(_FIRST_UNIQUE_COUNT),
    "ColumnHeaders": list(_FIRST_UNIQUE_COUNT),
    "DashboardId": "first",
    "DashboardName": "first",
    "Format": "first",
    "GroupedColumnHeaders": list(_FIRST_UNIQUE),
    "OperationType": list(_FIRST_UNIQUE_COUNT),
    "OsName": list(_FIRST_UNIQUE_COUNT),
    "OsVersion": list(_FIRST_UNIQUE_COUNT),
    "PageStartTime": list(_FIRST_UNIQUE),
    "PageUrl": list(_FIRST_UNIQUE_COUNT),
    "PreviousPageAppName": list(_FIRST_UNIQUE_COUNT),
    "PreviousPageEntityType": list(_FIRST_UNIQUE_COUNT),
    "PreviousPageUrl": list(_FIRST_UNIQUE_COUNT),
    "DisplayedFieldEntities": list(_FIRST_UNIQUE_COUNT),
    "EvaluationTime": list(_FIRST_UNIQUE_COUNT),
    "DeviceModel": list(_FIRST_UNIQUE_COUNT),
    "DevicePlatform": list(_FIRST_UNIQUE_COUNT),
}

In [5]:
# list(interest_cols.keys())

In [6]:
# "Name" is the grouping key, so it is left out of the aggregation spec.
per_column_aggs = {col: how for col, how in interest_cols.items() if col != "Name"}

# Keep only the columns of interest, aggregate them per Name, and turn the
# group key back into a regular column.
summary = (
    problematics_regarding_time[list(interest_cols)]
    .groupby("Name")
    .agg(per_column_aggs)
    .reset_index()
)

In [7]:
# Flatten the two-level column labels of `summary` into single names:
# "<column>" when the second level is "first", otherwise "<column><Aggregation>"
# with the second-level label capitalised (e.g. "DurationInMinutesMax").
top_labels = summary.columns.get_level_values(0)
agg_labels = summary.columns.get_level_values(1)
new_column_names = [
    top if agg == "first" else top + agg.capitalize()
    for top, agg in zip(top_labels, agg_labels)
]

In [8]:
# Replace the column labels of `summary` with the names held in `new_column_names`.
summary.columns = new_column_names

In [9]:
# Write the summary to a CSV file; the path is relative to the working directory.
SUMMARY_CSV = "actives_and_problematics_summary.csv"
summary.to_csv(SUMMARY_CSV)

In [10]:
# Distinct report ids present in the summary, as a plain Python list.
unique_report_ids = summary["ReportId"].unique()
problematic_reports = list(unique_report_ids)

In [11]:
# Display the report ids that the log scan below filters on.
problematic_reports

['00O6P000000yX5NUAU',
 '00O0b000004AoOOEA0',
 '00O6P000000uM0SUAU',
 '00O2R000003JQyNUAW',
 '00O6P000000ZOcWUAW',
 '00O6P000000ZOcCUAW',
 '00O6P000000uM08UAE',
 '00O6P000000VF5IUAW',
 '00O6P0000016wkaUAA',
 '00O0b000004kTazEAE',
 '00O2R000004Am1WUAS',
 '00O6P000000uLtWUAU']

In [12]:
import re

# Matches text of the form /lightning/r/<report_type>/<report_id>, where report_type
# is four or more ASCII letters and report_id is exactly 18 alphanumeric characters.
# Both parts are exposed as named groups.
pattern = re.compile(r'\/lightning\/r\/(?P<report_type>[a-zA-Z]{4,})\/(?P<report_id>[0-9a-zA-Z]{18})')

def filter_run_report_endpoints(pattern, field, url):
    """Return the named group ``field`` captured by ``pattern`` at the start of ``url``.

    Parameters
    ----------
    pattern : str or compiled regular expression
        Expression that defines a named group called ``field``.
    field : str
        Name of the capture group to return (e.g. ``"report_id"``).
    url : str
        Text to test. Matching uses ``re.match``, so the pattern has to match
        from the first character of ``url``.

    Returns
    -------
    str or None
        The captured text, or ``None`` when ``url`` does not match or is not
        text at all (for example ``None`` or a float ``NaN`` standing in for a
        missing value).
    """
    # re.match raises TypeError on non-text input; treat such values as "no match"
    # so the function can be mapped over columns that contain missing values.
    if not isinstance(url, (str, bytes)):
        return None
    m = re.match(pattern, url)
    return m.group(field) if m else None

In [13]:
from collections import defaultdict

# Directories to scan (one per date) and the log types looked for inside them.
# A file belongs to a log type when its name contains "<log type>_chunk".
logdate = ['2022-06-28', "2022-07-05", "2022-07-11", "2022-07-22", "2022-07-27"]
logs = ["Dashboard", "LightningPageView", "LightningPerformance", "LightningError", "Report"]

# used to infer report id over performance logs
common_subset_features = ['USER_ID', 'USER_ID_DERIVED', 'SESSION_KEY', 'LOGIN_KEY', 'ORGANIZATION_ID', 'CLIENT_IP']

verbose = False
chunk_size = 10000  # rows per chunk handed out by pd.read_csv

# How each log type exposes its report id. The types are referred to by name rather
# than by position in `logs`, so reordering or extending `logs` cannot silently
# change which rule is applied.
url_derived_logs = ("LightningPageView", "LightningError")  # id parsed out of PAGE_URL
id_column_logs = ("Dashboard", "Report")                    # id stored in REPORT_ID_DERIVED
performance_log = "LightningPerformance"                    # id cannot be derived yet

# Filtered chunks are collected per log type and concatenated once at the end;
# concatenating inside the loop re-copies every previously collected row each time.
matched_chunks = defaultdict(list)
for d in logdate:
    # Listed once per directory and sorted, so the row order of the result does not
    # depend on the order in which the file system happens to return entries.
    filenames = sorted(os.listdir(d))
    for ltng_log in logs:
        for f in filenames:
            if f"{ltng_log}_chunk" not in f:
                continue
            df_iter = pd.read_csv(os.path.join(d, f), chunksize=chunk_size)
            for i, chunk in enumerate(df_iter):
                if verbose:
                    print("Loading chunk", i+1, "from", ltng_log)

                if ltng_log in url_derived_logs:
                    # Rows without a PAGE_URL, or whose URL does not match `pattern`,
                    # carry no report id and are discarded.
                    with_url = chunk.dropna(subset=["PAGE_URL"])
                    tmp = with_url.assign(
                        ReportId=with_url["PAGE_URL"].apply(
                            lambda url: filter_run_report_endpoints(pattern, "report_id", url)
                        ),
                        ReportType=with_url["PAGE_URL"].apply(
                            lambda url: filter_run_report_endpoints(pattern, "report_type", url)
                        ),
                    ).dropna(subset=["ReportId"])
                elif ltng_log in id_column_logs:  # Dashboards and Reports
                    tmp = chunk.rename(columns={"REPORT_ID_DERIVED": "ReportId"})
                elif ltng_log == performance_log:  # LightningPerformance logs
                    # TODO: needs to extract report id using common_subset_features.
                    # Until then every row gets an empty id, so none survives the filter.
                    tmp = chunk.assign(ReportId=None)
                else:
                    tmp = chunk

                # Chunks that end up without a ReportId column cannot be filtered; skip them.
                if "ReportId" not in tmp.columns:
                    continue

                # isin is a vectorised membership test that always yields a boolean mask.
                tmp = tmp[tmp["ReportId"].isin(problematic_reports)]
                if verbose:
                    print("found", tmp.shape[0], " logs related")

                matched_chunks[ltng_log].append(tmp)

# One DataFrame per log type for which at least one chunk was collected.
# Kept as defaultdict(dict): looking up a log type with no entry yields an empty dict.
logdf = defaultdict(dict)
for log_name, frames in matched_chunks.items():
    logdf[log_name] = pd.concat(frames, axis=0)

2022-06-28_Dashboard_chunk1.csv
Loading chunk 1 from Dashboard
found 1  logs related
2022-06-28_LightningPageView_chunk1.csv
Loading chunk 1 from LightningPageView
found 60  logs related
Loading chunk 2 from LightningPageView
found 28  logs related
2022-06-28_LightningPageView_chunk2.csv
Loading chunk 1 from LightningPageView
found 21  logs related
2022-06-28_LightningPageView_chunk3.csv
Loading chunk 1 from LightningPageView
found 60  logs related
Loading chunk 2 from LightningPageView
found 66  logs related
Loading chunk 3 from LightningPageView
found 73  logs related
Loading chunk 4 from LightningPageView
found 53  logs related
Loading chunk 5 from LightningPageView
found 56  logs related
Loading chunk 6 from LightningPageView
found 67  logs related
Loading chunk 7 from LightningPageView
found 67  logs related
Loading chunk 8 from LightningPageView
found 56  logs related
Loading chunk 9 from LightningPageView
found 64  logs related
Loading chunk 10 from LightningPageView
found 50  l

In [14]:
# Display the log types that ended up with an entry in `logdf`.
logdf.keys()

dict_keys(['Dashboard', 'LightningPageView', 'LightningPerformance', 'LightningError', 'Report'])

In [44]:
# For every log type, report the (rows, columns) obtained when the summary is
# joined with the collected log rows on ReportId.
for ltng_log in logs:
    merged = pd.merge(left=summary, right=logdf[ltng_log], on="ReportId")
    print(ltng_log, merged.shape)

Dashboard (1, 85)
LightningPageView (12224, 115)
LightningPerformance (0, 97)
LightningError (6, 104)
Report (53715, 94)
