In [44]:
import time
import redis
from collections import defaultdict
import pandas as pd

from dec import subscriber, publisher, constants as C, statistics

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
rc = redis.StrictRedis(host='localhost', port=6379, db=0)
pubsub = rc.pubsub()
pubsub.subscribe(['events'])

In [4]:
# Empty message
message = pubsub.get_message()

message

{'type': 'subscribe', 'pattern': None, 'channel': b'events', 'data': 1}

In [5]:
message = pubsub.get_message()

message

{'type': 'message',
 'pattern': None,
 'channel': b'events',
 'data': b"[{'clip': '8256', 'country': 'FR', 'event_id': '4756d77a-66a8-4eae-aedb-48845c6a76bf', 'publisher_id': '6', 'viewable_time': 14.7, 'timestamp': 1531732501.492831}, {'clip': '0609', 'country': 'JP', 'event_id': '46d2fbd2-9d37-4918-a932-03603ac90d56', 'publisher_id': '7', 'viewable_time': 5.1, 'timestamp': 1531732501.492831}, {'clip': '3972', 'country': 'IT', 'event_id': 'a19a29f2-85ff-449f-801d-d1d9c824c705', 'publisher_id': '0', 'viewable_time': 21.7, 'timestamp': 1531732501.492831}, {'clip': '2712', 'country': 'RU', 'event_id': '9a673a09-bbbf-4f5f-8fd9-7064321863fb', 'publisher_id': '6', 'viewable_time': 15.2, 'timestamp': 1531732501.492831}, {'clip': '4314', 'country': 'EN', 'event_id': 'a000bdcd-f51c-4e43-94be-7610f8fe9dca', 'publisher_id': '7', 'viewable_time': 1.2, 'timestamp': 1531732501.492831}, {'clip': '6926', 'country': 'FR', 'event_id': 'e89aea42-abc2-4fe6-86d4-aa13de956c22', 'publisher_id': '9', 'viewab

In [6]:
events_to_process = eval(message['data'])

events_to_process

[{'clip': '8256',
  'country': 'FR',
  'event_id': '4756d77a-66a8-4eae-aedb-48845c6a76bf',
  'publisher_id': '6',
  'viewable_time': 14.7,
  'timestamp': 1531732501.492831},
 {'clip': '0609',
  'country': 'JP',
  'event_id': '46d2fbd2-9d37-4918-a932-03603ac90d56',
  'publisher_id': '7',
  'viewable_time': 5.1,
  'timestamp': 1531732501.492831},
 {'clip': '3972',
  'country': 'IT',
  'event_id': 'a19a29f2-85ff-449f-801d-d1d9c824c705',
  'publisher_id': '0',
  'viewable_time': 21.7,
  'timestamp': 1531732501.492831},
 {'clip': '2712',
  'country': 'RU',
  'event_id': '9a673a09-bbbf-4f5f-8fd9-7064321863fb',
  'publisher_id': '6',
  'viewable_time': 15.2,
  'timestamp': 1531732501.492831},
 {'clip': '4314',
  'country': 'EN',
  'event_id': 'a000bdcd-f51c-4e43-94be-7610f8fe9dca',
  'publisher_id': '7',
  'viewable_time': 1.2,
  'timestamp': 1531732501.492831},
 {'clip': '6926',
  'country': 'FR',
  'event_id': 'e89aea42-abc2-4fe6-86d4-aa13de956c22',
  'publisher_id': '9',
  'viewable_time':

## Statistics
We would like to see a bunch of statistics saved and updated in Redis
1. total sum of viewable_time per publisher (viewable_time_sum_per_publisher)
2. the top 10 publishers by events count (top_n_publisher_by_count)
3. the number of uniques clips per publisher (unique_clips_count_per_publisher)
4. total sum of clips per country viewed by day and by night (clips_count_per_country_day_night)

In [7]:
# 1
statistics.viewable_time_sum_per_publisher(events_to_process)

[{'publisher_id': '0', 'viewable_time': 1625.6999999999996},
 {'publisher_id': '1', 'viewable_time': 1671.0000000000007},
 {'publisher_id': '2', 'viewable_time': 1161.4999999999998},
 {'publisher_id': '3', 'viewable_time': 1534.1999999999996},
 {'publisher_id': '4', 'viewable_time': 1553.9999999999993},
 {'publisher_id': '5', 'viewable_time': 1655.0000000000002},
 {'publisher_id': '6', 'viewable_time': 1480.1999999999998},
 {'publisher_id': '7', 'viewable_time': 1323.4},
 {'publisher_id': '8', 'viewable_time': 1434.0999999999997},
 {'publisher_id': '9', 'viewable_time': 1362.7999999999997}]

In [9]:
# 2
statistics.top_n_publisher_by_count(events_to_process, n = 3)

[{'publisher_id': '5', 'count': 115},
 {'publisher_id': '6', 'count': 109},
 {'publisher_id': '0', 'count': 107}]

In [10]:
# 3
statistics.unique_clips_count_per_publisher(events_to_process)

[{'publisher_id': '0', 'unique_clips_count': 106},
 {'publisher_id': '1', 'unique_clips_count': 103},
 {'publisher_id': '2', 'unique_clips_count': 74},
 {'publisher_id': '3', 'unique_clips_count': 99},
 {'publisher_id': '4', 'unique_clips_count': 103},
 {'publisher_id': '5', 'unique_clips_count': 115},
 {'publisher_id': '6', 'unique_clips_count': 108},
 {'publisher_id': '7', 'unique_clips_count': 96},
 {'publisher_id': '8', 'unique_clips_count': 102},
 {'publisher_id': '9', 'unique_clips_count': 91}]

In [11]:
# 4
statistics.clips_count_per_country_day_night(events_to_process)

[{'country': 'CH', 'daynight': 'day', 'count': 112},
 {'country': 'DE', 'daynight': 'day', 'count': 129},
 {'country': 'EN', 'daynight': 'day', 'count': 104},
 {'country': 'FR', 'daynight': 'day', 'count': 111},
 {'country': 'IT', 'daynight': 'day', 'count': 109},
 {'country': 'JP', 'daynight': 'day', 'count': 122},
 {'country': 'NE', 'daynight': 'day', 'count': 99},
 {'country': 'RU', 'daynight': 'day', 'count': 96},
 {'country': 'US', 'daynight': 'day', 'count': 118}]

## Single step job

These will be the main steps of a single run:

1. Get a new message from the publisher
2. If the message is not empty, continue, else reloop
3. Compute the above statistics for the new events
4. Read the previous computed statistics
5. Update all the statistics
6. Write the updated statistics

This will be the format of the persisted statistics:




In [62]:
statistics_persisted = {
    'statistics': {
        'viewable_time_sum_per_publisher': {
            '<PUBID>': '<actual_total_sum>',
            # ...
        },
        'top_n_publisher_by_count': {
            'data': '<actual_data>',
            'publishers': '<PUBID_1>,...,<PUBID_i>,...,<PUBID_N>',
        },
        'unique_clips_count_per_publisher': {
            'data': {
                '<PUBID>': '<CLIP_1>,...,<CLIP_N>',
                # ...
            },
            'counts': {
                '<PUBID>': 'N',
                # ...
            },
        },
        'clips_count_per_country_day_night': {
            '<COUNTRY_1>': {
                'day': 'X',
                'night': 'Y'
            },
            # ...
        }
    },
    'last_update_timestamp': '<last_update_timestamp_value>'
}

In [46]:
last_stats_str = rc.get('statistics')

try:
    last_stats = eval(last_stats_str)
except TypeError:
    last_stats = {}

In [72]:
viewable_time = statistics.viewable_time_sum_per_publisher(events_to_process)
top_pub = statistics.top_n_publisher_by_count(events_to_process, n=10)
unique_clips_count = statistics.unique_clips_count_per_publisher(events_to_process)
clips_count = statistics.clips_count_per_country_day_night(events_to_process)

In [47]:
pd.DataFrame(viewable_time)

Unnamed: 0,publisher_id,viewable_time
0,0,1625.7
1,1,1671.0
2,2,1161.5
3,3,1534.2
4,4,1554.0
5,5,1655.0
6,6,1480.2
7,7,1323.4
8,8,1434.1
9,9,1362.8


In [48]:
last_stats

{}

In [49]:
# Update viewable_time
stats = last_stats.get('statistics', {})

last_viewable_time = stats.get('viewable_time_sum_per_publisher', pd.DataFrame(columns=[C.PUBLISHER_ID, C.VIEWABLE_TIME]))

In [50]:
last_viewable_time

Unnamed: 0,publisher_id,viewable_time


In [63]:
updated_df = (pd.merge(last_viewable_time, pd.DataFrame(viewable_time), how='outer', on=[C.PUBLISHER_ID])
              .set_index([C.PUBLISHER_ID])
              .sum(axis=1)
              .reset_index()
              .rename(columns={0: C.VIEWABLE_TIME})
             )

list(updated_df.T.to_dict().values())

[{'publisher_id': '0', 'viewable_time': 1625.6999999999996},
 {'publisher_id': '1', 'viewable_time': 1671.0000000000007},
 {'publisher_id': '2', 'viewable_time': 1161.4999999999998},
 {'publisher_id': '3', 'viewable_time': 1534.1999999999996},
 {'publisher_id': '4', 'viewable_time': 1553.9999999999993},
 {'publisher_id': '5', 'viewable_time': 1655.0000000000002},
 {'publisher_id': '6', 'viewable_time': 1480.1999999999998},
 {'publisher_id': '7', 'viewable_time': 1323.4},
 {'publisher_id': '8', 'viewable_time': 1434.0999999999997},
 {'publisher_id': '9', 'viewable_time': 1362.7999999999997}]

In [66]:
# Update top_pub
stats = last_stats.get('statistics', {})

last_top_pub_dict = stats.get('top_pub', {})
last_top_pub = last_top_pub_dict.get('data', pd.DataFrame(columns=[C.PUBLISHER_ID, 'count']))

In [67]:
updated_df = (pd.merge(last_top_pub, pd.DataFrame(top_pub), how='outer', on=[C.PUBLISHER_ID])
              .set_index([C.PUBLISHER_ID])
              .sum(axis=1)
              .reset_index()
              .rename(columns={0: 'count'})
             )

list(updated_df.T.to_dict().values())

[{'publisher_id': '5', 'count': 115.0},
 {'publisher_id': '6', 'count': 109.0},
 {'publisher_id': '0', 'count': 107.0},
 {'publisher_id': '1', 'count': 103.0},
 {'publisher_id': '4', 'count': 103.0},
 {'publisher_id': '8', 'count': 102.0},
 {'publisher_id': '3', 'count': 100.0},
 {'publisher_id': '7', 'count': 96.0},
 {'publisher_id': '9', 'count': 91.0},
 {'publisher_id': '2', 'count': 74.0}]

In [70]:
# Update top_pub
stats = last_stats.get('statistics', {})

last_unique_clips_count_dict = stats.get('unique_clips_count_per_publisher', {})

last_unique_clips_count = last_unique_clips_count_dict.get('data', pd.DataFrame(columns=[C.PUBLISHER_ID, 'clips']))
last_unique_clips_count

Unnamed: 0,publisher_id,clips


In [73]:
unique_clips_count

[{'publisher_id': '0',
  'unique_clips': {'0041',
   '0221',
   '0531',
   '0790',
   '0961',
   '1000',
   '1019',
   '1139',
   '1155',
   '1286',
   '1310',
   '1450',
   '1526',
   '1750',
   '1789',
   '1816',
   '1852',
   '1881',
   '1907',
   '1992',
   '2062',
   '2114',
   '2360',
   '2685',
   '2725',
   '2803',
   '2949',
   '2950',
   '3167',
   '3244',
   '3449',
   '3616',
   '3812',
   '3836',
   '3923',
   '3972',
   '4091',
   '4147',
   '4240',
   '4436',
   '4504',
   '4526',
   '4542',
   '4569',
   '4581',
   '4806',
   '4895',
   '4927',
   '4984',
   '4988',
   '5037',
   '5086',
   '5092',
   '5236',
   '5315',
   '5323',
   '5342',
   '5624',
   '5703',
   '5743',
   '5801',
   '5831',
   '5926',
   '6007',
   '6121',
   '6148',
   '6159',
   '6590',
   '6835',
   '6908',
   '7042',
   '7062',
   '7106',
   '7242',
   '7256',
   '7293',
   '7342',
   '7404',
   '7443',
   '7574',
   '7754',
   '7761',
   '7811',
   '7937',
   '7955',
   '8036',
   '8057',
   '

In [122]:
def special_sum(lis):
    head = lis[0]
    if isinstance(head, set) is False:
        head = set()
    if len(lis) > 1:
        return head.union(special_sum(lis[1:]))
    else:
        return head

updated_df = (pd.merge(last_unique_clips_count, pd.DataFrame(unique_clips_count), how='outer', on=[C.PUBLISHER_ID])
              .set_index([C.PUBLISHER_ID])
              .aggregate(special_sum, axis=1)
              .reset_index()
              .rename(columns={0: 'unique_clips'})
             )

updated_df
# list(updated_df.T.to_dict().values())

Unnamed: 0,publisher_id,unique_clips
0,0,"{3836, 6148, 6908, 5236, 7955, 8515, 7342, 452..."
1,1,"{4993, 6439, 1694, 3946, 1516, 6404, 4174, 485..."
2,2,"{4215, 8176, 3292, 8135, 2191, 0250, 3584, 879..."
3,3,"{5144, 4242, 1878, 1278, 3741, 9204, 8614, 329..."
4,4,"{5953, 7716, 4716, 5267, 6908, 1533, 1925, 645..."
5,5,"{9651, 6998, 4181, 0440, 6475, 7818, 2113, 056..."
6,6,"{8875, 3800, 3292, 7591, 9519, 6712, 7331, 482..."
7,7,"{1965, 4860, 3876, 6858, 6579, 3629, 1774, 380..."
8,8,"{6064, 1853, 7010, 7509, 5862, 7571, 6156, 001..."
9,9,"{4852, 6924, 3479, 2191, 3272, 8755, 5704, 907..."
