In [1]:
import pandas as pd

df_http = pd.read_csv('./data/input/polling.csv', index_col = 0)
print(df_http.shape[0])
print(df_http.groupby('status_code').size())
df_http.head()

350445
status_code
0        1281
200    349119
401        45
dtype: int64


Unnamed: 0,creation_time,device_id,error_code,status_code
0,2020-02-26 19:16:40.481,d0460656-95e0-4844-b515-36ef46c2d620,,200
1,2020-02-26 19:16:25.035,d0460656-95e0-4844-b515-36ef46c2d620,,200
2,2020-02-26 18:31:52.126,d0460656-95e0-4844-b515-36ef46c2d620,,200
3,2020-02-26 18:31:17.043,d0460656-95e0-4844-b515-36ef46c2d620,,200
4,2020-02-26 18:31:01.738,d0460656-95e0-4844-b515-36ef46c2d620,,200


In [2]:
df_status = pd.read_csv('./data/input/connectivity_status.csv', index_col = 0)
df_status.head()

Unnamed: 0,creation_time,status,device_id
0,2020-02-26 19:31:29.998,OFFLINE,00083c70-7f54-4324-94bd-b0cb0be78baf
1,2020-02-26 19:31:39.677,ONLINE,00083c70-7f54-4324-94bd-b0cb0be78baf
2,2020-02-26 20:08:42.789,OFFLINE,000fec74-a5b4-40fc-b93c-eef3b2afb87e
3,2020-02-26 20:09:03.448,ONLINE,000fec74-a5b4-40fc-b93c-eef3b2afb87e
4,2020-02-26 00:30:03.439,OFFLINE,001d3c67-99fd-43b7-8207-f420fa303a49


In [3]:
df_orders = pd.read_csv('./data/input/orders.csv', index_col = 0)

print(df_orders.shape[0]) # rows count pre-cleaning
print(df_orders['device_id'].isna().sum()) # 26 orders have not been dispatched to a device: we can get rid of those rows since we're focusing on metrics related to dispatched orders only

df_orders = df_orders.loc[df_orders.device_id.notna()]
print(df_orders.shape[0]) # rows count post-cleaning
df_orders.head()

2357
26
2331


Unnamed: 0,order_creation_time,order_id,device_id
5,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862
6,2020-02-26 10:34:36,102492697,2bb11f99-ab21-4628-abe6-b919da8fbf34
7,2020-02-26 11:32:47,102500373,2aec0e20-e1d8-4323-9b12-f066856488a7
8,2020-02-26 11:46:17,102503173,37638585-a181-4265-aeb2-d9e284bb30c3
9,2020-02-26 11:38:40,102501909,53d16c33-5980-4ad8-9032-f85d686d2855


In [193]:
df = pd.merge(df_orders, df_http, left_on = 'device_id', right_on = 'device_id', how = 'left', indicator = True)
# df_orders is our fact table

print(df.groupby('_merge').size()) # 71 distinct orders are associated with a device_id missing in the polling data: these orders won't have polling events (0)
inc_orders = list(df.loc[df._merge == 'left_only', 'order_id'].unique()) # here's the list of inconsistent for further insights

_merge
left_only          71
right_only          0
both          4710456
dtype: int64


In [79]:
# pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')

In [80]:
# pd.period_range(start=pd.Period('2017Q1', freq='Q'), # left anchor endpoint: Q1 end
#                 end=pd.Period('2017Q2', freq='Q'), freq='M') # right anchor endpoint: Q2 end

In [194]:
df[['order_creation_time', 'creation_time']] = df[['order_creation_time', 'creation_time']].apply(pd.to_datetime) # convert time columns to DateTime type
print(df.dtypes)
df.head()

order_creation_time    datetime64[ns]
order_id                        int64
device_id                      object
creation_time          datetime64[ns]
error_code                     object
status_code                   float64
_merge                       category
dtype: object


Unnamed: 0,order_creation_time,order_id,device_id,creation_time,error_code,status_code,_merge
0,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:29:34.363,,200.0,both
1,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:28:41.657,,200.0,both
2,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:27:51.879,,200.0,both
3,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:27:35.925,,200.0,both
4,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:26:34.578,,200.0,both


In [175]:
def cut_df(
    df: pd.DataFrame,
    m_os: int
) -> pd.DataFrame:
    """
    """
    if m_os < 0:
        df_cut = df.loc[df['creation_time'].between(df['order_creation_time'] + pd.DateOffset(minutes = m_os), df['order_creation_time'])]
        df_cut['timeslice'] = f"{abs(m_os)}m_before"

    elif m_os > 0:
        df_cut = df.loc[df['creation_time'].between(df['order_creation_time'], df['order_creation_time'] + pd.DateOffset(minutes = m_os))]
        df_cut['timeslice'] = f"{abs(m_os)}m_after"
        
    else:
        raise ValueError('`m_os` param cannot be 0')
    return df_cut

In [183]:
m_os_target = [-3, -60, 3]

df_list = [cut_df(df, i) for i in m_os_target]
df_global = pd.concat(df_list, axis = 0, ignore_index = True)

# print(df_global.groupby('timeslice').size())
df_global.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cut['timeslice'] = f"{abs(m_os)}m_before"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cut['timeslice'] = f"{abs(m_os)}m_after"


Unnamed: 0,order_creation_time,order_id,device_id,creation_time,error_code,status_code,_merge,timeslice
0,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 00:08:12.125,ECONNABORTED,0.0,both,3m_before
1,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 00:06:46.961,,200.0,both,3m_before
2,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 00:06:20.304,,200.0,both,3m_before
3,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 00:06:18.986,,200.0,both,3m_before
4,2020-02-26 11:32:47,102500373,2aec0e20-e1d8-4323-9b12-f066856488a7,2020-02-26 11:32:44.816,,200.0,both,3m_before


In [178]:
# my_df = cut_df(df, -3)
# my_df.head()

In [184]:
df_report = df_global.groupby(['order_id', 'timeslice', 'status_code', 'error_code'], dropna = False).size().reset_index(name = 'counts')
df_report.head()

Unnamed: 0,order_id,timeslice,status_code,error_code,counts
0,102452116,3m_after,200.0,,1
1,102452116,3m_before,0.0,ECONNABORTED,1
2,102452116,3m_before,200.0,,3
3,102452116,60m_before,0.0,ECONNABORTED,1
4,102452116,60m_before,200.0,,31


In [185]:
import numpy as np
df_pv_sc = df_report.pivot_table(index = ['order_id', 'timeslice'], columns = 'status_code', values = 'counts', aggfunc = np.sum)#, fill_value = 0)
df_pv_sc['polling_events'] = df_pv_sc.loc[:, list(df_pv_sc.columns)].sum(axis = 1) # creating the polling events count column
df_pv_sc.head()

Unnamed: 0_level_0,status_code,0.0,200.0,401.0,polling_events
order_id,timeslice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
102452116,3m_after,,1.0,,1.0
102452116,3m_before,1.0,3.0,,4.0
102452116,60m_before,1.0,31.0,,32.0
102452190,3m_after,,2.0,,2.0
102452190,3m_before,1.0,3.0,,4.0


In [186]:
df_pv_ec = df_report.pivot_table(index = ['order_id', 'timeslice'], columns = 'error_code', values = 'counts', aggfunc = np.sum)#, fill_value = 0)
# we don't need to compute count of responses without error codes. since it has to be equal to 200 responses count
df_pv_ec.head()

Unnamed: 0_level_0,error_code,ECONNABORTED,GENERIC_ERROR
order_id,timeslice,Unnamed: 2_level_1,Unnamed: 3_level_1
102452116,3m_before,1.0,
102452116,60m_before,1.0,
102452190,3m_before,1.0,
102452190,60m_before,1.0,
102453036,60m_before,1.0,


In [188]:
df_final = pd.concat([df_pv_sc, df_pv_ec], axis = 1)
df_final = df_final.fillna(0).astype(int)
df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0.0,200.0,401.0,polling_events,ECONNABORTED,GENERIC_ERROR
order_id,timeslice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
102452116,3m_after,0,1,0,1,0,0
102452116,3m_before,1,3,0,4,1,0
102452116,60m_before,1,31,0,32,1,0
102452190,3m_after,0,2,0,2,0,0
102452190,3m_before,1,3,0,4,1,0


In [7]:
df_report = df.groupby(['order_id', 'status_code', 'error_code'], dropna = False).size().reset_index(name = 'counts')
# groupby by default is excluding those groups whose grouping columns are NaN (in this specific case successful polling events, obviously not associated with an error code)
df_report.head()

# df_report.to_csv('./data/output/report.csv', index = False) # this csv is honestly good enough

Unnamed: 0,order_id,status_code,error_code,counts
0,102452116,0.0,ECONNABORTED,11
1,102452116,0.0,GENERIC_ERROR,1
2,102452116,200.0,,771
3,102452190,0.0,ECONNABORTED,11
4,102452190,0.0,GENERIC_ERROR,1


# HERE

In [205]:
testa = df[['order_id', 'order_creation_time', 'creation_time']].sort_values(['order_id', 'creation_time'], ascending = [True, False])
testa['order_pollingevent_diff'] = df['creation_time'] - df['order_creation_time']
# print(testa.head())
testa.groupby('order_id').min('order_pollingevent_diff')

Unnamed: 0_level_0,order_creation_time,creation_time,order_pollingevent_diff
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
102452116,2020-02-26 00:08:19,2020-02-26 00:00:02.639,-1 days +23:51:43.639000
102452190,2020-02-26 00:08:41,2020-02-26 00:00:02.639,-1 days +23:51:21.639000
102453036,2020-02-26 00:13:57,2020-02-26 00:00:02.639,-1 days +23:46:05.639000
102453649,2020-02-26 00:17:31,2020-02-26 00:00:02.639,-1 days +23:42:31.639000
102453774,2020-02-26 00:18:22,2020-02-26 00:00:02.639,-1 days +23:41:40.639000
...,...,...,...
102664036,2020-02-26 23:46:29,2020-02-26 00:00:01.902,-1 days +00:13:32.902000
102664046,2020-02-26 23:46:26,2020-02-26 19:23:51.323,-1 days +19:37:25.323000
102664423,2020-02-26 23:49:32,2020-02-26 11:16:14.826,-1 days +11:26:42.826000
102664444,2020-02-26 23:49:36,2020-02-26 11:16:14.826,-1 days +11:26:38.826000


In [19]:
# order_id status_code_<code>_count error_code_<code>_count polling_events_counts

# 102452116, 12, 771, 11, 1, 771, 783

Unnamed: 0,order_id,variable,value
0,102452116,status_code,0.0
1,102452116,status_code,0.0
2,102452116,status_code,200.0
3,102452190,status_code,0.0
4,102452190,status_code,0.0
...,...,...,...
9703,102664444,error_code,GENERIC_ERROR
9704,102664444,error_code,
9705,102665113,error_code,ECONNABORTED
9706,102665113,error_code,GENERIC_ERROR


In [115]:
df_report.groupby('status_code').size()

status_code
0.0      2333
401.0     190
dtype: int64

In [117]:
df.loc[df.order_id == 102452116].head() # 783

Unnamed: 0,order_creation_time,order_id,device_id,creation_time,error_code,status_code,_merge
0,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:29:34.363,,200.0,both
1,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:28:41.657,,200.0,both
2,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:27:51.879,,200.0,both
3,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:27:35.925,,200.0,both
4,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862,2020-02-26 05:26:34.578,,200.0,both


In [64]:
df_orders.loc[df_orders.order_id == 102639577]

Unnamed: 0,order_creation_time,order_id,device_id
898,2020-02-26 21:02:40,102639577,2170b1c4-7a49-450a-8569-0fb90cf0ff22


In [65]:
df_http.loc[df_http.device_id == '2170b1c4-7a49-450a-8569-0fb90cf0ff22']

Unnamed: 0,creation_time,device_id,error_code,status_code
