# What region (or locality) do most of the executions come from?

In [1]:
import os
import pandas as pd

### Loading active reports

In [2]:
active_reports = pd.read_csv('./datasets/active_reports.csv', low_memory=False, keep_default_na=False)

Let's take a look on the active reports table

In [3]:
active_reports.head()

Unnamed: 0,CreatedById,CreatedDate,Description,DeveloperName,FolderName,Format,Id,IsDeleted,LastModifiedById,LastModifiedDate,...,DB_CPU_TIME,NUMBER_BUCKETS,TIMESTAMP_DERIVED,USER_ID_DERIVED,CLIENT_IP,URI_ID_DERIVED,REPORT_ID_DERIVED,ORIGIN,IsActiveSinceCreation,IsActiveSinceLastModification
0,0050b000004KNJdAAO,2021-10-11T06:18:06.000Z,Test Report with only some parameters,AnyQ,Pankaj_Pande,Tabular,00O2R000004F38wUAC,False,0050b000004KNJdAAO,2022-01-04T08:41:10.000,...,20.0,1,2022-06-04T12:35:47.938Z,0050b000004KNJdAAO,168.159.160.201,,00O2R000004F38wUAC,ReportRunFromLightning,True,False
1,0050b000004KNJdAAO,2021-10-11T06:18:06.000Z,Test Report with only some parameters,AnyQ,Pankaj_Pande,Tabular,00O2R000004F38wUAC,False,0050b000004KNJdAAO,2022-01-04T08:41:10.000,...,3540.0,1,2022-06-04T02:23:08.584Z,0050b000004KNJdAAO,168.159.160.201,,00O2R000004F38wUAC,ReportRunFromLightning,True,False
2,0050b000004KNJdAAO,2021-10-11T06:18:06.000Z,Test Report with only some parameters,AnyQ,Pankaj_Pande,Tabular,00O2R000004F38wUAC,False,0050b000004KNJdAAO,2022-01-04T08:41:10.000,...,4690.0,1,2022-06-04T01:04:48.613Z,0050b000004KNJdAAO,168.159.160.201,,00O2R000004F38wUAC,ReportRunFromLightning,True,False
3,0050b000004KNJdAAO,2021-05-26T09:23:29.000Z,Test Report with only some parameters,Main_1,Pankaj_Pande,Tabular,00O2R000003zUb9UAE,False,0050b000004KNJdAAO,2021-09-06T12:24:00.000,...,40.0,1,2022-06-04T22:46:04.525Z,0050b000004KNJdAAO,168.159.160.201,,00O2R000003zUb9UAE,ReportRunFromLightning,True,False
4,0050b000004MLeRAAW,2020-02-15T07:34:10.000Z,,My_Cases_and_Tasks_CARE_t7v,Dell Care Reports,Tabular,00O2R0000043lHKUAY,False,0050b000004MLUkAAO,2021-02-20T17:01:39.000,...,40.0,0,2022-06-04T05:37:05.980Z,0052R00000APEgvQAH,143.166.255.114,,00O2R0000043lHKUAY,ReportRunFromLightning,True,False


### Loading `Lightning event` logs

As you can see, all the logs are loaded because we may analysis by different ways

In [4]:
ltng_elf_path = 'data/Salesforce/ELF'
ltng_logs = {}

for event_log_file in os.listdir(ltng_elf_path):
    log_path = os.path.join(ltng_elf_path, event_log_file)
    log_samples = os.listdir(log_path)
    random_logs = sorted(log_samples)[0]
    print(random_logs)
    log_name = random_logs.split("_", 1)[-1][:-4]
    if log_name.startswith('Lightning'):
        log_name = log_name[len('Lightning'):]
    ltng_logs[log_name] = pd.read_csv(log_path +"/"+ random_logs, nrows=1000, low_memory=False)

2022-06-01_LightningError.csv
2022-06-04_LightningInteraction.csv
2022-06-04_LightningPageView.csv
2022-06-04_LightningPerformance.csv
2022-06-04_Report.csv


Let's take a look on the columns of active reports table 

In [5]:
active_reports.columns

Index(['CreatedById', 'CreatedDate', 'Description', 'DeveloperName',
       'FolderName', 'Format', 'Id', 'IsDeleted', 'LastModifiedById',
       'LastModifiedDate', 'LastReferencedDate', 'LastRunDate',
       'LastViewedDate', 'Name', 'NamespacePrefix', 'OwnerId',
       'ReportTypeApiName', 'SystemModstamp', 'EVENT_TYPE', 'TIMESTAMP',
       'REQUEST_ID', 'ORGANIZATION_ID', 'USER_ID', 'RUN_TIME', 'CPU_TIME',
       'URI', 'SESSION_KEY', 'LOGIN_KEY', 'USER_TYPE', 'REQUEST_STATUS',
       'DB_TOTAL_TIME', 'ENTITY_NAME', 'DISPLAY_TYPE', 'RENDERING_TYPE',
       'REPORT_ID', 'ROW_COUNT', 'NUMBER_EXCEPTION_FILTERS', 'NUMBER_COLUMNS',
       'AVERAGE_ROW_SIZE', 'SORT', 'DB_BLOCKS', 'DB_CPU_TIME',
       'NUMBER_BUCKETS', 'TIMESTAMP_DERIVED', 'USER_ID_DERIVED', 'CLIENT_IP',
       'URI_ID_DERIVED', 'REPORT_ID_DERIVED', 'ORIGIN',
       'IsActiveSinceCreation', 'IsActiveSinceLastModification'],
      dtype='object')

There are many columns on active reports table, but for this analysis, it is needed only `Id`.
About page view table, it is needed only `CLIENT_GEO` and `PAGE_URL`. So, let's use just this column.

In [6]:
pageview_cols = ['CLIENT_GEO', 'PAGE_URL']
report_obj_cols = ['Id']

In [7]:
ltng_pageview = ltng_logs['PageView'][pageview_cols]
report_ids = active_reports[report_obj_cols]

Now, let's take a look on the Lightning page view table

In [8]:
ltng_pageview

Unnamed: 0,CLIENT_GEO,PAGE_URL
0,United States/null,/lightning/page/home
1,United States/Texas,/lightning/page/home
2,United States/Texas,/lightning/page/home
3,United States/Texas,/lightning/page/home
4,United States/Texas,/lightning/settings/personal/LanguageAndTimeZo...
...,...,...
995,United States/null,/one/one.app#eyJjb21wb25lbnREZWYiOiJmb3JjZVNlY...
996,United States/null,/lightning/r/Case/5006P000005ESO1QAO/view
997,United States/null,/lightning/r/Case/5006P000005RdGeQAK/view?ws=%...
998,United States/null,/lightning/o/Case/list?filterName=00B2R000005g...


There may be blank values, so let's clean out the table removing the blank rows

In [9]:
ltng_pageview_cleaned = ltng_pageview.dropna(subset=pageview_cols)

The report ID must be rescued from the `PAGE_URL` column, it is done using a regular expression. Then, the ID is saved on a new column, `REPORT_ID_DERIVED`

In [10]:
import re

pattern = re.compile(r'\/lightning\/r\/(?P<report_type>[a-zA-Z]{4,})\/(?P<report_id>[0-9a-zA-Z]{18})')

def filter_run_report_endpoints(pattern, url):
    m = re.match(pattern, url)
    if m:
        return m.group('report_id')

In [11]:
ltng_pageview_cleaned['REPORT_ID_DERIVED'] =\
    ltng_pageview_cleaned.PAGE_URL.apply(lambda url: filter_run_report_endpoints(pattern, url))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltng_pageview_cleaned['REPORT_ID_DERIVED'] =\


In [12]:
ltng_pageview_filtered = ltng_pageview_cleaned.dropna(subset=['REPORT_ID_DERIVED'])

Let's take a look on the final table

In [13]:
ltng_pageview_filtered

Unnamed: 0,CLIENT_GEO,PAGE_URL,REPORT_ID_DERIVED
38,United States/null,/lightning/r/Case/5006P000005egdhQAA/clone?use...,5006P000005egdhQAA
43,United States/null,/lightning/r/Task/00T6P00000I1FUGUA3/view?ws=%...,00T6P00000I1FUGUA3
62,United States/null,/lightning/r/Asset/02i6P000001frFYQAY/view?ws=...,02i6P000001frFYQAY
63,United States/California,/lightning/r/Case/5006P000005LWHSQA4/view,5006P000005LWHSQA4
79,United States/null,/lightning/r/Case/5002R0000171R9bQAE/view?ws=%...,5002R0000171R9bQAE
...,...,...,...
993,United States/null,/lightning/r/Case/5006P000004EKLzQAO/view,5006P000004EKLzQAO
994,United States/Texas,/lightning/r/WorkOrder/5006P0000052UqbQAE/rela...,5006P0000052UqbQAE
996,United States/null,/lightning/r/Case/5006P000005ESO1QAO/view,5006P000005ESO1QAO
997,United States/null,/lightning/r/Case/5006P000005RdGeQAK/view?ws=%...,5006P000005RdGeQAK


### Back to Active Reports 

In [14]:
report_ids

Unnamed: 0,Id
0,00O2R000004F38wUAC
1,00O2R000004F38wUAC
2,00O2R000004F38wUAC
3,00O2R000003zUb9UAE
4,00O2R0000043lHKUAY
...,...
38280,00O6P000000VEsrUAG
38281,00O2R000004Ii1sUAC
38282,00O2R000004Ii1sUAC
38283,00O2R000003za0CUAQ


The execution location of each report must be rescued from the `Page View` table. In order to do that, a column `CLIENT_GEO_DERIVED` is created on the `Report Ids` table 

In [15]:
report_ids.assign(CLIENT_GEO_DERIVED="")

Unnamed: 0,Id,CLIENT_GEO_DERIVED
0,00O2R000004F38wUAC,
1,00O2R000004F38wUAC,
2,00O2R000004F38wUAC,
3,00O2R000003zUb9UAE,
4,00O2R0000043lHKUAY,
...,...,...
38280,00O6P000000VEsrUAG,
38281,00O2R000004Ii1sUAC,
38282,00O2R000004Ii1sUAC,
38283,00O2R000003za0CUAQ,


In [16]:
def find_report_geo(report_id):
    reports_with_location = ltng_pageview_filtered.loc[ltng_pageview_filtered['REPORT_ID_DERIVED'] == report_id]
    geo_location = reports_with_location['CLIENT_GEO'].tolist() 
    return geo_location[0] if geo_location else float('NaN')

Now, the location of each report can be found crossing the `Id` from `report ids` and `REPORT_ID_DERIVED` from `ltng_pageview_filtered`

In [17]:
report_ids['CLIENT_GEO_DERIVED'] =\
    report_ids.Id.apply(lambda report_id: find_report_geo(report_id))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_ids['CLIENT_GEO_DERIVED'] =\


And thats is the result

In [18]:
report_ids

Unnamed: 0,Id,CLIENT_GEO_DERIVED
0,00O2R000004F38wUAC,
1,00O2R000004F38wUAC,
2,00O2R000004F38wUAC,
3,00O2R000003zUb9UAE,
4,00O2R0000043lHKUAY,
...,...,...
38280,00O6P000000VEsrUAG,
38281,00O2R000004Ii1sUAC,
38282,00O2R000004Ii1sUAC,
38283,00O2R000003za0CUAQ,


Removing missing data

In [19]:
report_ids.loc[report_ids['CLIENT_GEO_DERIVED'].notnull()]

Unnamed: 0,Id,CLIENT_GEO_DERIVED
460,00O2R0000043l7WUAQ,United States/null
461,00O2R0000043l7WUAQ,United States/null
462,00O2R0000043l7WUAQ,United States/null
463,00O2R0000043l7WUAQ,United States/null
464,00O2R0000043l7WUAQ,United States/null
...,...,...
37278,00O2R0000043lCbUAI,United States/Texas
37279,00O2R0000043lCbUAI,United States/Texas
37280,00O2R0000043lCbUAI,United States/Texas
37281,00O2R0000043lCbUAI,United States/Texas


Now the reports can be grouped by region 

In [28]:
report_ids.groupby(['CLIENT_GEO_DERIVED'])\
    .agg('count')\
    .sort_values('Id', ascending=False)['Id']\
    .reset_index()\
    .rename(columns={'Id': 'ID count', 'CLIENT_GEO_DERIVED': 'Region'})

Unnamed: 0,Region,ID count
0,United States/Texas,10039
1,United States/null,9265
2,India/Maharashtra,3308
3,Ireland/null,2840
4,United States/California,64
5,United States/North Carolina,48
6,Japan/Saitama,41
7,China/Fujian,14
