#  Problematic Reports Regarding User Run Reports

---

>Questions being answered in this notebook:
>- [x] What is a problematic report regarding user run reports?

In [45]:
import pandas as pd

In [46]:
active_reports = pd.read_csv("../../datasets/active_reports.csv", low_memory=False)

In [47]:
database_related_logs = pd.read_csv("datasets/Report_newlogs.csv")

In [48]:
active_reports.shape

(9479, 4)

In [49]:
database_related_logs.shape

(367182, 31)

In [50]:
full_logs = pd.merge(left=active_reports, right=database_related_logs, left_on='Id', right_on='ReportId', indicator=True)

In [51]:
full_logs['EstimatedDataConsumedBytes'] = full_logs.AVERAGE_ROW_SIZE * full_logs.ROW_COUNT 

In [52]:
avg_bytes = full_logs.EstimatedDataConsumedBytes.mean()
avg_bytes

2399376.952019355

In [53]:
# converting to MB
# 1 MB = 1048576 Bytes ~ 1e6
avg_bytes / 1048576

2.28822417451797

In [54]:
problematics_regarding_database = full_logs[full_logs.EstimatedDataConsumedBytes > avg_bytes]

In [55]:
problematics_regarding_database['EstimatedDataConsumedMegabytes'] =\
    problematics_regarding_database.EstimatedDataConsumedBytes / 1e6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  problematics_regarding_database['EstimatedDataConsumedMegabytes'] =\


In [61]:
problematics_regarding_database.columns

Index(['Id', 'Name', 'Format', 'LastRunDate', 'EVENT_TYPE', 'TIMESTAMP',
       'REQUEST_ID', 'ORGANIZATION_ID', 'USER_ID', 'RUN_TIME', 'CPU_TIME',
       'URI', 'SESSION_KEY', 'LOGIN_KEY', 'USER_TYPE', 'REQUEST_STATUS',
       'DB_TOTAL_TIME', 'ENTITY_NAME', 'DISPLAY_TYPE', 'RENDERING_TYPE',
       'REPORT_ID', 'ROW_COUNT', 'NUMBER_EXCEPTION_FILTERS', 'NUMBER_COLUMNS',
       'AVERAGE_ROW_SIZE', 'SORT', 'DB_BLOCKS', 'DB_CPU_TIME',
       'NUMBER_BUCKETS', 'TIMESTAMP_DERIVED', 'USER_ID_DERIVED', 'CLIENT_IP',
       'URI_ID_DERIVED', 'ReportId', 'ORIGIN', '_merge',
       'EstimatedDataConsumedBytes', 'EstimatedDataConsumedMegabytes'],
      dtype='object')

In [63]:
agg_map = {
    'EVENT_TYPE': ['count', 'nunique', 'unique'], 
    'USER_ID': ['count', 'nunique', 'unique'], 
    'RUN_TIME': ['sum', 'mean', 'min', 'max', 'std', 'count', 'nunique'], 
    'CPU_TIME': ['sum', 'mean', 'min', 'max', 'std', 'count', 'nunique'],
    'URI': ['count', 'nunique', 'unique'], 
    'SESSION_KEY': ['count', 'nunique', 'unique'], 
    'LOGIN_KEY': ['count', 'nunique', 'unique'], 
    'USER_TYPE': ['count', 'nunique', 'unique'], 
    'REQUEST_STATUS': ['count', 'nunique', 'unique'],
    'DB_TOTAL_TIME': ['sum', 'mean', 'min', 'max', 'std', 'count', 'nunique'],
    'ENTITY_NAME': ['count', 'nunique', 'unique'], 
    'DISPLAY_TYPE': ['count', 'nunique', 'unique'],
    'RENDERING_TYPE': ['count', 'nunique', 'unique'],
    'ROW_COUNT': ['sum', 'mean', 'min', 'max', 'std', 'count', 'nunique'], 
    'NUMBER_EXCEPTION_FILTERS': ['mean', 'min', 'max', 'std', 'nunique'], 
    'NUMBER_COLUMNS': ['mean', 'min', 'max', 'std', 'nunique'],
    'AVERAGE_ROW_SIZE': ['sum', 'mean', 'min', 'max', 'std'], 
    'DB_BLOCKS': ['sum', 'mean', 'min', 'max', 'std'], 
    'DB_CPU_TIME': ['sum', 'mean', 'min', 'max', 'std'],
    'NUMBER_BUCKETS': ['sum', 'mean', 'min', 'max', 'std'],
    'USER_ID_DERIVED': ['count', 'nunique', 'unique'],
    'CLIENT_IP': ['count', 'nunique', 'unique'],
    'URI_ID_DERIVED': ['count', 'nunique', 'unique'],
    'ORIGIN': ['count', 'nunique', 'unique'], 
    'EstimatedDataConsumedBytes': ['sum', 'mean', 'min', 'max', 'std'], 
    'EstimatedDataConsumedMegabytes': ['sum', 'mean', 'min', 'max', 'std']
}

In [74]:
summary = problematics_regarding_database.groupby(['ReportId', 'Name'])\
    .agg(agg_map)

In [83]:
col_names = []
level1 = list(summary.columns.get_level_values(0))
level2 = list(summary.columns.get_level_values(1))
for i in range(len(level1)):
    col_names.append(level1[i] + "_" + level2[i].capitalize())

In [90]:
summary.columns = col_names

In [93]:
summary.shape

(337, 110)

In [96]:
summary.reset_index(inplace=True)

In [97]:
summary.shape

(337, 112)

In [98]:
summary.head()

Unnamed: 0,ReportId,Name,EVENT_TYPE_Count,EVENT_TYPE_Nunique,EVENT_TYPE_Unique,USER_ID_Count,USER_ID_Nunique,USER_ID_Unique,RUN_TIME_Sum,RUN_TIME_Mean,...,EstimatedDataConsumedBytes_Sum,EstimatedDataConsumedBytes_Mean,EstimatedDataConsumedBytes_Min,EstimatedDataConsumedBytes_Max,EstimatedDataConsumedBytes_Std,EstimatedDataConsumedMegabytes_Sum,EstimatedDataConsumedMegabytes_Mean,EstimatedDataConsumedMegabytes_Min,EstimatedDataConsumedMegabytes_Max,EstimatedDataConsumedMegabytes_Std
0,00O0b000004AmRiEAK,DELL Agent Work Report,128,1,[Report],128,23,"[0051P000003jYHi, 0050b000004K7gf, 0056P000000...",417142,3258.921875,...,1145561000.0,8949694.0,3999639.0,14890375.0,3346542.0,1145.56084,8.949694,3.999639,14.890375,3.346542
1,00O0b000004AmRjEAK,Agent Work for Supervisor,68,1,[Report],68,14,"[0051P000003jYWf, 0051P000003gAoM, 0051P000003...",174599,2567.632353,...,350016500.0,5147301.0,2491110.0,9256521.0,2040283.0,350.016497,5.147301,2.49111,9.256521,2.040283
2,00O0b000004AmRmEAK,Completed Chat Sessions,18,1,[Report],18,6,"[0050b000004K3yo, 0052R000009Tm6f, 0050b000004...",1523271,84626.166667,...,52941200.0,2941178.0,2537920.0,3896154.0,447815.5,52.941198,2.941178,2.53792,3.896154,0.447815
3,00O0b000004AmRnEAK,Agent Chat Performance,25,1,[Report],25,9,"[0050b000004KWWz, 0050b000004KDLk, 0050b000004...",613660,24546.4,...,1356453000.0,54258130.0,3959280.0,265083858.0,51443640.0,1356.453205,54.258128,3.95928,265.083858,51.443644
4,00O0b000004AmRrEAK,Chat Average Handle Time,910,1,[Report],910,180,"[0052R00000A00Gq, 0050b000004KEAZ, 0050b000004...",2638869,2899.856044,...,3085156000.0,3390282.0,2400372.0,27538475.0,1963039.0,3085.156298,3.390282,2.400372,27.538475,1.963039


In [9]:
summary.to_csv("../../datasets/problematics_regarding_database_usage.csv")