## Results Analysis

In [2]:
import os

import pandas as pd

In [147]:
# Column names applied to the header-less per-user loss files.
loss_file_cols = ['day', 'num_events', 'loss', 'red_event']

# Users with few events.
user_names_small = [
    'U8170', 'U3277', 'U8840', 'U7311', 'U1467',
    'U1789', 'U8168', 'U1581', 'U7004', 'U9763',
]

# Users with a moderate number of events.
user_names_moderate = [
    'U5254', 'U9407', 'U1592', 'U1723', 'U1106', 'U3406', 'U342',
    'U1653', 'U20', 'U250', 'U1450', 'U1164', 'U86',
]

# Users with the most events.
user_names_most_active = [
    'U12', 'U13', 'U24', 'U78', 'U207',
    'U293', 'U453', 'U679', 'U1289', 'U1480',
]

In [148]:
def load_loss_data(user_name, loadmax = True):
    """
    Load the loss file of one user into a DataFrame.

    The file read is '../data/users_losses/<user_name>_losses_<type>.txt',
    where <type> is 'max' when `loadmax` is truthy and 'diff' otherwise. It is
    parsed as a header-less CSV whose columns are named after the module-level
    `loss_file_cols` list.

    Args:
        user_name: user identifier used to build the file name (e.g. 'U12').
        loadmax: pick the 'max' loss file when truthy, the 'diff' one otherwise.

    Returns:
        pandas.DataFrame with one row per line of the loss file.

    Raises:
        FileNotFoundError: if the loss file does not exist. This is a subclass
            of Exception, so callers catching Exception keep working.
    """
    loss_type = 'max' if loadmax else 'diff'
    loss_file = '../data/users_losses/{}_losses_{}.txt'.format(user_name, loss_type)

    # Fail early with a message naming the path, rather than deep inside read_csv.
    if not os.path.exists(loss_file):
        raise FileNotFoundError("File: '{}' doesn't exist".format(loss_file))

    return pd.read_csv(loss_file, header=None, names=loss_file_cols)



In [149]:
# Start with the most active users.
user_names = user_names_most_active

# Load a single user's loss file (the default 'max' variant) to inspect its layout.
losses = load_loss_data('U12')

In [150]:
# Preview the first rows of the loaded loss data.
losses.head()

Unnamed: 0,day,num_events,loss,red_event
0,1,8355,1.174328,0
1,1,8355,1.174328,0
2,1,8355,1.174328,0
3,1,8355,1.174328,0
4,1,8355,1.174328,0


In [151]:
# Group the loss rows by day. Grouping by the column label (not a one-element
# list) keeps the group keys plain scalars on every pandas version; since
# pandas 2.0 a length-1 list makes iteration yield 1-tuple keys instead.
loss_days = losses.groupby('day')

In [152]:
# Show the group labels, i.e. the distinct days present in this user's loss data.
loss_days.groups.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [209]:
def process_data(loss_days):
    """
    Collect, for each day containing red events, those events and their percentile.

    For every (day, group) pair the group's rows are renumbered 0..n-1 in
    their existing order, and row `i` receives
    `percentile = round(1 - i / num_events, 4)`, where `num_events` is the
    first value found in the group's 'num_events' column. Only rows whose
    'red_event' equals 1 are kept. The input groups are not modified.

    NOTE(review): the percentile is derived purely from row position, so it is
    only meaningful if each day's rows are already ordered by decreasing loss
    in the input -- confirm against the code that writes the loss files.

    Args:
        loss_days: iterable of (day, DataFrame) pairs, e.g. the result of
            `DataFrame.groupby('day')`. Each frame must have the columns
            'num_events' and 'red_event'.

    Returns:
        list of `(num_events, red_events)` tuples, one per day that has at
        least one red event. `red_events` is a DataFrame with the group's
        columns minus 'num_events' plus 'percentile', indexed by the row's
        position within its day.
    """
    results = []
    for _day, group in loss_days:
        # Position within the day becomes the index; reset_index returns a new
        # frame, so the caller's group is left untouched.
        day_rows = pd.DataFrame(group).reset_index(drop=True)

        num_events = day_rows['num_events'].unique()[0]

        # Vectorized equivalent of round(1 - i / num_events, 4) for each row i.
        ranks = day_rows.index.to_numpy()
        day_rows = (
            day_rows
            .drop(columns=['num_events'])
            .assign(percentile=(1 - ranks / num_events).round(4))
        )

        red_events = day_rows[day_rows['red_event'] == 1]
        if not red_events.empty:
            results.append((num_events, red_events))
    return results

In [210]:
# Smoke-test process_data() on the single-user day groups built above;
# the displayed value is the list of (num_events, red_events) tuples.
process_data(loss_days)

[(4364,     day      loss  red_event  percentile
  6     8  0.106151          1      0.9986
  7     8  0.106151          1      0.9984
  9     8  0.093876          1      0.9979
  10    8  0.092905          1      0.9977
  11    8  0.092905          1      0.9975
  13    8  0.073787          1      0.9970
  17    8  0.065467          1      0.9961),
 (3872,     day      loss  red_event  percentile
  22   12  0.079261          1      0.9943
  23   12  0.079261          1      0.9941
  24   12  0.079261          1      0.9938
  25   12  0.079261          1      0.9935),
 (1427,    day      loss  red_event  percentile
  3   26  0.044917          1      0.9979)]

### Process users and display the red_events percentile in the days
We do not process `user_names_small`: these users have very few events, so a routine review of their activity should already detect any anomalies. Our testing shows that, even for these users, the red_events still fall in the highest percentiles of their day.

In [211]:
# Print, for every selected user, the red events of each day together with
# their percentile. To inspect the low-activity users instead, assign
# user_names_small to user_names.
user_names = user_names_most_active + user_names_moderate
for u in user_names:
    try:
        losses = load_loss_data(u)
        # Group by the column label rather than a one-element list so the
        # group keys stay scalars (pandas >= 2.0 yields 1-tuples for a list).
        loss_days = losses.groupby('day')
        red_events = process_data(loss_days)
        print('======================')
        print('User: {}'.format(u))
        for num_events, anomalies in red_events:
            print('num_events:', num_events)
            print('anomalies:', anomalies)
    except Exception as error:
        # Deliberate best-effort: report the failure for this user (for
        # example a missing loss file) and carry on with the next one.
        print('======================')
        print('Caught an error: ' + repr(error))

User: U12
num_events: 11422
anomalies:    day      loss  red_event  percentile
2   12  0.107368          1      0.9998
3   12  0.107368          1      0.9997
4   12  0.107368          1      0.9996
5   12  0.091715          1      0.9996
num_events: 5497
anomalies:    day      loss  red_event  percentile
8   26  0.033577          1      0.9985
9   26  0.024259          1      0.9984
User: U13
num_events: 23455
anomalies:     day      loss  red_event  percentile
4    12  0.162577          1      0.9998
32   12  0.047115          1      0.9986
User: U24
num_events: 18941
anomalies:    day      loss  red_event  percentile
0   13  0.033617          1      1.0000
2   13  0.027589          1      0.9999
3   13  0.005599          1      0.9998
7   13  0.002262          1      0.9996
num_events: 16675
anomalies:     day      loss  red_event  percentile
39   15  0.000072          1      0.9977
User: U78
num_events: 21831
anomalies:    day      loss  red_event  percentile
1   12  0.005118      