In [1]:
import time
import redis
import pandas as pd
from datetime import datetime

from dec import subscriber, publisher, constants as C, statistics

In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `dec` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Main

In [8]:
# Start the subscriber. Per the captured output below, it subscribes to the
# 'events' channel and keeps pulling messages until it is interrupted manually.
subscriber.main()

Subscriber started. Establishing connection to REDIS...
Subscribed to channel 'events'.
Pulling a new message...
No message found in the queue.
Pulling a new message...
No message found in the queue.
Pulling a new message...
A new message has been pulled.
Computing statistics related to the new data...
Reading old statistics...
No statistics valid found.
Updating all the statistics...
Writing updated statistics...
Full step completed.
Pulling a new message...
A new message has been pulled.
Computing statistics related to the new data...
Reading old statistics...
Last statistics parsed.
Updating all the statistics...
Writing updated statistics...
Full step completed.
Pulling a new message...
A new message has been pulled.
Computing statistics related to the new data...
Reading old statistics...
Last statistics parsed.
Updating all the statistics...
Writing updated statistics...
Full step completed.
Pulling a new message...
A new message has been pulled.
Computing statistics related to t

KeyboardInterrupt: 

# Notebook test

In [3]:
# Connect to the Redis instance on localhost:6379 (database 0) and subscribe
# to the 'events' channel, from which the cells below pull messages.
rc = redis.StrictRedis(host='localhost', port=6379, db=0)
pubsub = rc.pubsub()
pubsub.subscribe(['events'])

In [None]:
# Empty message: the first poll is not expected to carry an event payload
# (NOTE(review): presumably None or the subscribe confirmation — confirm).
message = pubsub.get_message()

message

In [None]:
# Poll again; the next cell reads the event batch from `message['data']`, so
# this poll must return an actual data message for the notebook to continue.
message = pubsub.get_message()

message

In [None]:
# Deserialize the payload into Python objects.
# NOTE(review): eval() executes any code contained in the message. This is only
# acceptable while the channel is fully trusted; consider ast.literal_eval or a
# JSON payload if the publisher's format allows it.
events_to_process = eval(message['data'])

events_to_process

## Statistics
We would like to see the following statistics saved and updated in Redis:
1. total sum of viewable_time per publisher (viewable_time_sum_per_publisher)
2. the top 10 publishers by events count (top_n_publisher_by_count)
3. the number of unique clips per publisher (unique_clips_count_per_publisher)
4. total count of clips per country viewed by day and by night (clips_count_per_country_day_night)

In [None]:
# 1. Total sum of viewable_time per publisher, for the current batch only
statistics.viewable_time_sum_per_publisher(events_to_process)

In [None]:
# 2. Top publishers by events count (n=3 for this preview; the single-step
#    cells further down call the same function with n=10)
statistics.top_n_publisher_by_count(events_to_process, n = 3)

In [None]:
# 3. Number of unique clips per publisher, for the current batch only
statistics.unique_clips_count_per_publisher(events_to_process)

In [None]:
# 4. Clips count per country, split by day and by night, for the current batch only
statistics.clips_count_per_country_day_night(events_to_process)

## Single step job

These will be the main steps of a single run:

1. Get a new message from the publisher
2. If the message is not empty, continue; otherwise poll again
3. Compute the above statistics for the new events
4. Read the previously computed statistics
5. Update all the statistics
6. Write the updated statistics

This will be the format of the persisted statistics:




In [None]:
# Illustrative template of the persisted snapshot: every '<...>' string is a
# placeholder, not real data.
# NOTE(review): the `updated_stats` assembled later in this notebook stores
# lists of row dicts under these keys and has no 'counts' entry, so this
# template may be out of date — confirm which layout is authoritative.
statistics_persisted = {
    'statistics': {
        'viewable_time_sum_per_publisher': {
            '<PUBID>': '<actual_total_sum>',
            # ...
        },
        'top_n_publisher_by_count': {
            'data': '<actual_data>',
            'publishers': '<PUBID_1>,...,<PUBID_i>,...,<PUBID_N>',
        },
        'unique_clips_count_per_publisher': {
            'data': {
                '<PUBID>': '<CLIP_1>,...,<CLIP_N>',
                # ...
            },
            'counts': {
                '<PUBID>': 'N',
                # ...
            },
        },
        'clips_count_per_country_day_night': {
            '<COUNTRY_1>': {
                'day': 'X',
                'night': 'Y'
            },
            # ...
        }
    },
    'last_update_timestamp': '<last_update_timestamp_value>'
}

In [7]:
# Reset: drop the persisted 'statistics' key so the cells below start from an
# empty state (the output is the number of keys that were removed).
rc.delete('statistics')

1

In [None]:
last_stats_str = rc.get('statistics')

# `rc.get` returns None when the key does not exist. Test for that explicitly
# instead of catching TypeError around eval(): a TypeError raised while
# evaluating the stored text would otherwise be mistaken for "nothing stored"
# and silently reset the statistics to an empty dict.
if last_stats_str is None:
    last_stats = {}
else:
    # NOTE(review): eval() executes whatever is stored under the key. This is
    # only safe while every writer of 'statistics' is trusted; consider
    # ast.literal_eval or JSON if the stored format allows it.
    last_stats = eval(last_stats_str)

In [None]:
# Compute the four statistics for the current batch only; they are combined
# with the previously persisted values in the cells below.
viewable_time = statistics.viewable_time_sum_per_publisher(events_to_process)
top_pub = statistics.top_n_publisher_by_count(events_to_process, n=10)
unique_clips_count = statistics.unique_clips_count_per_publisher(events_to_process)
clips_count = statistics.clips_count_per_country_day_night(events_to_process)

In [None]:
# Previously persisted snapshot ({} when nothing could be read from Redis).
last_stats

In [None]:
# Statistics section of the last persisted snapshot (empty dict when absent).
stats = last_stats.get('statistics', {})

# Each statistic is passed, together with the persisted `stats`, to its
# `subscriber.update_*` helper; the returned frame is then flattened into a
# list holding one dict per row, which is the shape stored in the snapshot.

# 1. Viewable time
updated_viewable_time = subscriber.update_viewable_time(stats, viewable_time)
updated_viewable_time_list = [*updated_viewable_time.transpose().to_dict().values()]

# 2. Top pub
updated_top_pub = subscriber.update_top_pub(stats, top_pub)
updated_top_pub_list = [*updated_top_pub.transpose().to_dict().values()]

# 3. Unique clips count
updated_unique_clips_count = subscriber.update_unique_clips_count(stats, unique_clips_count)
updated_unique_clips_count_list = [*updated_unique_clips_count.transpose().to_dict().values()]

# 4. Clips count
updated_clips_count = subscriber.update_clips_count(stats, clips_count)
updated_clips_count_list = [*updated_clips_count.transpose().to_dict().values()]

In [None]:
# Result of subscriber.update_viewable_time, shown for inspection.
updated_viewable_time

In [None]:
# Result of subscriber.update_top_pub, shown for inspection.
updated_top_pub

In [None]:
# Preview of the comma-separated 'publishers' string stored in the snapshot:
# the publisher ids of the first 10 rows of `updated_top_pub`.
','.join(updated_top_pub[C.PUBLISHER_ID].values[:10])

In [None]:
# Result of subscriber.update_unique_clips_count, shown for inspection.
updated_unique_clips_count

In [None]:
# Result of subscriber.update_clips_count, shown for inspection.
updated_clips_count

In [None]:
# Snapshot to persist, assembled by hand: every statistic as a list of row
# dicts, plus the time of this update as a POSIX timestamp.
updated_stats = {
    'statistics': {
        'viewable_time_sum_per_publisher': updated_viewable_time_list,
        'top_n_publisher_by_count': {
            'data': updated_top_pub_list,
            # Comma-separated ids of the first 10 rows of `updated_top_pub`
            # (assumes the frame is already ordered by count — TODO confirm).
            'publishers': ','.join(updated_top_pub[C.PUBLISHER_ID].values[:10]),
        },
        'unique_clips_count_per_publisher': {
            'data': updated_unique_clips_count_list,
        },
        'clips_count_per_country_day_night': updated_clips_count_list
    },
    'last_update_timestamp': datetime.now().timestamp()
}

In [None]:
# Hand-built snapshot, shown for inspection.
updated_stats

In [None]:
# Build the snapshot through the packaged helper; this rebinds `updated_stats`,
# replacing the hand-built dict from the cells above.
# NOTE(review): presumably the helper performs the same assembly — confirm.
updated_stats = subscriber.update_stats(last_stats, viewable_time, top_pub, unique_clips_count, clips_count)

updated_stats

In [None]:
# Persist the snapshot as its Python-literal text, the format the reading cell
# parses back with eval(). redis-py >= 3.0 no longer stringifies arbitrary
# objects and raises DataError for a dict value, so the conversion is explicit.
# Values that are already text/bytes are stored unchanged.
rc.set(
    'statistics',
    updated_stats if isinstance(updated_stats, (bytes, str)) else str(updated_stats),
)

## Second run

In [None]:
# Pull the next message for the second run; the next cell handles the case in
# which it carries no usable payload.
message = pubsub.get_message()

message

In [None]:
# Only text/bytes payloads can be evaluated: a missing message (None) and a
# non-text `data` field are skipped, just as the former `except TypeError` did.
# NOTE(review): subscription confirmations are expected to carry a numeric
# `data` field — confirm against the redis-py pub/sub documentation.
payload = message['data'] if message else None
if isinstance(payload, (bytes, str)):
    # NOTE(review): eval() executes any code contained in the message. This is
    # only acceptable while the channel is fully trusted.
    events_to_process = eval(payload)
    # Deliberately not wrapped in try/except: a TypeError raised inside the
    # processing step is a real failure and must surface instead of being
    # silently treated as "no data read".
    subscriber.single_step_run(events_to_process)

# Test code

In [None]:
# Update viewable_time
stats = last_stats.get('statistics', {})

# The snapshot holds plain Python data (a list of row dicts in this notebook),
# whereas the merge in the next cells requires a DataFrame. Convert whatever was
# persisted, and fall back to an empty frame with the expected columns when
# nothing usable is stored.
last_viewable_time_data = stats.get('viewable_time_sum_per_publisher')
if last_viewable_time_data is not None and len(last_viewable_time_data) > 0:
    last_viewable_time = pd.DataFrame(last_viewable_time_data)
else:
    last_viewable_time = pd.DataFrame(columns=[C.PUBLISHER_ID, C.VIEWABLE_TIME])

In [None]:
# Previously persisted viewable-time data, shown for inspection.
last_viewable_time

In [None]:
# Outer-join the persisted and the current values on the publisher id so that
# publishers present on only one side are kept.
merged_viewable_time = pd.merge(
    last_viewable_time,
    pd.DataFrame(viewable_time),
    how='outer',
    on=[C.PUBLISHER_ID],
)

# Add up the remaining value columns row by row (missing values are skipped by
# pandas' default `skipna`), then restore the publisher id as a column and give
# the unnamed sum its statistic name.
updated_df = (
    merged_viewable_time
    .set_index([C.PUBLISHER_ID])
    .sum(axis=1)
    .reset_index()
    .rename(columns={0: C.VIEWABLE_TIME})
)

# One dict per row.
[*updated_df.transpose().to_dict().values()]

In [None]:
# Update top_pub
stats = last_stats.get('statistics', {})

# The snapshot stores this statistic under 'top_n_publisher_by_count'; looking
# up 'top_pub' never finds persisted data, so earlier counts were ignored.
last_top_pub_dict = stats.get('top_n_publisher_by_count', {})

# The persisted 'data' entry is plain Python data (a list of row dicts in this
# notebook), whereas the merge in the next cell requires a DataFrame. Fall back
# to an empty frame with the expected columns when nothing usable is stored.
last_top_pub_data = last_top_pub_dict.get('data')
if last_top_pub_data is not None and len(last_top_pub_data) > 0:
    last_top_pub = pd.DataFrame(last_top_pub_data)
else:
    last_top_pub = pd.DataFrame(columns=[C.PUBLISHER_ID, 'count'])

In [None]:
# Outer-join the persisted and the current counts on the publisher id so that
# publishers present on only one side are kept.
merged_top_pub = pd.merge(
    last_top_pub,
    pd.DataFrame(top_pub),
    how='outer',
    on=[C.PUBLISHER_ID],
)

# Add up the remaining count columns row by row (missing values are skipped by
# pandas' default `skipna`), then restore the publisher id as a column and name
# the unnamed sum 'count'.
updated_df = (
    merged_top_pub
    .set_index([C.PUBLISHER_ID])
    .sum(axis=1)
    .reset_index()
    .rename(columns={0: 'count'})
)

# One dict per row.
[*updated_df.transpose().to_dict().values()]

In [None]:
# Update unique_clips_count
stats = last_stats.get('statistics', {})

last_unique_clips_count_dict = stats.get('unique_clips_count_per_publisher', {})

# The persisted 'data' entry is plain Python data (a list of row dicts in this
# notebook), whereas the merge further down requires a DataFrame. Fall back to
# an empty frame with the expected columns when nothing usable is stored.
last_unique_clips_count_data = last_unique_clips_count_dict.get('data')
if last_unique_clips_count_data is not None and len(last_unique_clips_count_data) > 0:
    last_unique_clips_count = pd.DataFrame(last_unique_clips_count_data)
else:
    last_unique_clips_count = pd.DataFrame(columns=[C.PUBLISHER_ID, 'clips'])
last_unique_clips_count

In [None]:
# Unique clips per publisher computed for the current batch, shown for inspection.
unique_clips_count

In [None]:
def special_sum(lis):
    """Return the union of every ``set`` found in ``lis``.

    Items that are not sets (for instance the NaN placeholders an outer merge
    leaves for missing values) are ignored. The input is only iterated, so a
    list as well as a DataFrame row works, and none of the input sets is
    modified.

    Args:
        lis: Iterable of values, some of which may be sets.

    Returns:
        A new ``set`` with the elements of all sets in ``lis``; empty when
        ``lis`` is empty or contains no set.
    """
    merged = set()
    # Iterative on purpose: the recursive version failed on empty input,
    # re-sliced the input at every step and hit the recursion limit on long
    # inputs.
    for item in lis:
        if isinstance(item, set):
            merged |= item
    return merged

# Outer-join the persisted and the current clip data on the publisher id so
# that publishers present on only one side are kept.
merged_unique_clips = pd.merge(
    last_unique_clips_count,
    pd.DataFrame(unique_clips_count),
    how='outer',
    on=[C.PUBLISHER_ID],
)

# Collapse the remaining columns of every row into a single value with
# `special_sum`, then restore the publisher id as a column and name the result.
updated_df = (
    merged_unique_clips
    .set_index([C.PUBLISHER_ID])
    .aggregate(special_sum, axis=1)
    .reset_index()
    .rename(columns={0: 'unique_clips'})
)

# One dict per row.
[*updated_df.transpose().to_dict().values()]

In [None]:
# Update clips_count
stats = last_stats.get('statistics', {})

# NOTE(review): despite its name, `data_str` is handed straight to
# pd.DataFrame, so it is presumably already-parsed row data — confirm.
data_str = stats.get('clips_count_per_country_day_night')
if data_str:
    last_clips_count = pd.DataFrame(data_str)
else:
    # Nothing persisted yet: start from an empty frame with the expected columns.
    last_clips_count = pd.DataFrame(columns=[C.COUNTRY, 'daynight', 'count'])

last_clips_count

In [None]:
# Outer-join the persisted and the current counts on (country, day/night) so
# that combinations present on only one side are kept.
merged_clips_count = pd.merge(
    last_clips_count,
    pd.DataFrame(clips_count),
    how='outer',
    on=[C.COUNTRY, 'daynight'],
)

# Add up the remaining count columns row by row (missing values are skipped by
# pandas' default `skipna`), then restore the key columns and name the unnamed
# sum 'count'.
updated_df = (
    merged_clips_count
    .set_index([C.COUNTRY, 'daynight'])
    .sum(axis=1)
    .reset_index()
    .rename(columns={0: 'count'})
)

# One dict per row.
[*updated_df.transpose().to_dict().values()]