In [1]:
import pandas as pd

In [2]:
api_logs = pd.read_csv("API_logs.csv", low_memory=False)

In [3]:
api_logs.shape

(5372177, 27)

In [4]:
pageview_logs = pd.read_csv("LightningPageView_newlogs.csv", low_memory=False)

In [5]:
pageview_logs.shape

(1600715, 52)

In [6]:
cols = set(list(api_logs.columns) + list(pageview_logs.columns))

In [7]:
df = pd.DataFrame({
    "ColumnName" : [c for c in cols], 
    "API": [c in api_logs for c in cols], 
    "PageView": [c in pageview_logs for c in cols]})

In [8]:
df[df.API & df.PageView].ColumnName

7            REQUEST_ID
12            CLIENT_IP
15              USER_ID
18      USER_ID_DERIVED
24            USER_TYPE
31    TIMESTAMP_DERIVED
34           EVENT_TYPE
44          SESSION_KEY
46            LOGIN_KEY
50            TIMESTAMP
66      ORGANIZATION_ID
Name: ColumnName, dtype: object

In [9]:
common_columns = ['TIMESTAMP_DERIVED', 'CLIENT_IP', 'ORGANIZATION_ID', 'USER_TYPE']

In [10]:
api_logs.dropna(subset=common_columns, inplace=True)

In [11]:
pageview_logs.dropna(subset=common_columns, inplace=True)

In [12]:
print(api_logs.shape, pageview_logs.shape)

(4450221, 27) (1600715, 52)


In [13]:
api_pageview_logs = pd.merge(left=api_logs, right=pageview_logs, on=common_columns)

In [16]:
api_pageview_logs[['TIMESTAMP_DERIVED', 'PAGE_URL', 'REQUEST_SIZE']]

Unnamed: 0,TIMESTAMP_DERIVED,PAGE_URL,REQUEST_SIZE
0,2022-06-28T01:00:37.501Z,/lightning/r/Case/5006P000005x5oVQAQ/view,631.0
1,2022-06-28T01:02:21.906Z,/lightning/r/Task/00T6P00000K6l9sUAB/view?ws=%...,654.0
2,2022-06-28T01:08:25.555Z,/lightning/r/Case/5006P000005y3QxQAI/clone?use...,657.0
3,2022-06-28T01:09:56.353Z,/lightning/r/Order/8012R000007zi2jQAA/view?ws=...,657.0
4,2022-06-28T01:12:11.858Z,/lightning/r/Case/5006P000005m81xQAA/related/p...,824.0
...,...,...,...
1161,2022-07-22T02:57:04.384Z,/lightning/r/Report/00O6P0000016uswUAA/view?qu...,662.0
1162,2022-07-22T02:57:11.021Z,/lightning/r/Case/5006P000006d7BKQAY/view?ws=%...,129594.0
1163,2022-07-22T02:57:42.257Z,/lightning/r/Task/00T6P00000MexvZUAR/view?ws=%...,651.0
1164,2022-07-22T02:57:50.203Z,/lightning/r/WorkOrder/0WO6P000001KSlLWAW/view,662.0


In [17]:
sorted(api_pageview_logs)

['API_TYPE',
 'API_VERSION',
 'APP_NAME',
 'BROWSER_NAME',
 'BROWSER_VERSION',
 'CLIENT_GEO',
 'CLIENT_ID',
 'CLIENT_IP',
 'CLIENT_NAME',
 'CONNECTION_TYPE',
 'CPU_TIME',
 'DB_BLOCKS',
 'DB_CPU_TIME',
 'DB_TOTAL_TIME',
 'DEVICE_ID',
 'DEVICE_MODEL',
 'DEVICE_PLATFORM',
 'DEVICE_SESSION_ID',
 'DURATION',
 'EFFECTIVE_PAGE_TIME',
 'EFFECTIVE_PAGE_TIME_DEVIATION',
 'EFFECTIVE_PAGE_TIME_DEVIATION_ERROR_TYPE',
 'EFFECTIVE_PAGE_TIME_DEVIATION_REASON',
 'ENTITY_NAME',
 'EVENT_TYPE_x',
 'EVENT_TYPE_y',
 'GRANDPARENT_UI_ELEMENT',
 'LOGIN_KEY_x',
 'LOGIN_KEY_y',
 'METHOD_NAME',
 'ORGANIZATION_ID',
 'OS_NAME',
 'OS_VERSION',
 'PAGE_APP_NAME',
 'PAGE_CONTEXT',
 'PAGE_ENTITY_ID',
 'PAGE_ENTITY_TYPE',
 'PAGE_START_TIME',
 'PAGE_URL',
 'PARENT_UI_ELEMENT',
 'PREVPAGE_APP_NAME',
 'PREVPAGE_CONTEXT',
 'PREVPAGE_ENTITY_ID',
 'PREVPAGE_ENTITY_TYPE',
 'PREVPAGE_URL',
 'REQUEST_ID_x',
 'REQUEST_ID_y',
 'REQUEST_SIZE',
 'REQUEST_STATUS',
 'RESPONSE_SIZE',
 'ROWS_PROCESSED',
 'RUN_TIME',
 'ReportId',
 'Report

In [15]:
# api_pageview_logs.to_csv("api_pageview_join.csv", index=False)