# Phase 1: Topics of Interest Generation

#### Imports

In [106]:
from collections import defaultdict
from math import log

import numpy as np
import datetime
import random


In [107]:
## Force the pseudo-random generators' initial state for reproducible results
seed = 5
random.seed(seed)
np.random.seed(seed)

#### Sample data

This sample data is extracted from the AnalyticsEvent model. For this use case the following fields are needed:
* userid - The user this record refers to 
* context.url - The url visited by the user
* context.keywords - The keywords assigned to the url(/page)
* events.eventDate - The event date and time

In [108]:
# These are all 13 pages of our target website, also known as the corpus.
# Each entry pairs a page "url" with the list of "keywords" assigned to that page.
# Note: keyword casing is not normalized here (e.g. 'dxp' on /page7 vs 'DXP' on /page10).
website_pages = [
    { "url": "/page1", "keywords": ['liferay','bloated','literay','plugin system','features'] },
    { "url": "/page2", "keywords": ['liferay','china','dalian'] },
    { "url": "/page3", "keywords": ['dalian','liferay','pictures'] },
    { "url": "/page4", "keywords": ['china','job fair','liferay dalian'] },
    { "url": "/page5", "keywords": ['symposium','liferay','mobile','mobile strategies'] },
    { "url": "/page6", "keywords": ['dalian'] },
    { "url": "/page7", "keywords": ['dxp', 'pokemon go', 'inflation'] },
    { "url": "/page8", "keywords": ['dxp', '7.2', 'alpha'] },
    { "url": "/page9", "keywords": ['gratis', 'churning'] },
    { "url": "/page10", "keywords": ['graveyard', '7.2', 'DXP'] },
    { "url": "/page11", "keywords": ['gotcha', 'encryption'] },
    { "url": "/page12", "keywords": ['globalization', 'economy'] },
    { "url": "/page13", "keywords": ['dalian', 'christmas'] }
]

In [109]:
def random_date(start, end):
    """
    Pick a pseudo-random datetime between `start` and `end` (both inclusive),
    with a resolution of one second.

    The offset is drawn with the stdlib `random` module, so the result is
    reproducible once `random.seed` has been called.
    courtesy of: https://stackoverflow.com/a/8170651/8356352
    """
    # Length of the window expressed in whole seconds
    span_in_seconds = int((end - start).total_seconds())
    offset = datetime.timedelta(seconds=random.randint(0, span_in_seconds))
    return start + offset

def generate_page_visits(user, number_pages, all_website_pages, start_date, end_date, with_replacement=True):
    """
    Build a chronologically sorted list of analytics events for one user.

    Pages are sampled with `np.random.choice` from `all_website_pages` and every sampled page
    receives a random event date between `start_date` and `end_date`.

    user: The userid to generate the page views for
    number_pages: number of page visits (or analytics events) to generate
    all_website_pages: the list of all pages to pick from. Also known as the corpus
    start_date: the start date
    end_date: the end date
    with_replacement: set to True to allow the same page to be picked more than once, False otherwise.
        Data flowing into the analytics events table contains duplicates, a user can visit a page
        multiple times

    Returns a list of dicts with the keys "userid", "url", "keywords" and "eventDate",
    ordered by "eventDate".
    """
    sampled_pages = np.random.choice(all_website_pages, number_pages, replace=with_replacement)

    events = [
        {
            "userid": user,
            "url": sampled["url"],
            "keywords": sampled["keywords"],
            "eventDate": random_date(start_date, end_date),
        }
        for sampled in sampled_pages
    ]

    events.sort(key=lambda event: event["eventDate"])
    return events

def filter_unique_page_visits(page_view_stream, include_user_filter=False):
    """
    Helper method to filter a stream of page views for duplicates.

    page_view_stream: a chunk of analytics event data. Can be a single user page visits up until the whole table
    include_user_filter: when False, events are unique per 'url'; when True, events are unique per
        ('userid', 'url') combination

    Returns a list with one event per key. When several events share a key, the last one in the
    stream is kept, at the position where that key was first seen.
    """
    if include_user_filter:
        # Use a tuple rather than `userid + url`: concatenation lets distinct pairs collide
        # (e.g. 'u' + '/x/y' and 'u/x' + '/y') and fails for non-string user ids.
        def dedup_key(event):
            return (event['userid'], event['url'])
    else:
        def dedup_key(event):
            return event['url']

    # Later events overwrite earlier ones that share the same key
    return list({dedup_key(event): event for event in page_view_stream}.values())

def select_field(page_view_stream, field):
    """
    Project a single field out of every record, much like a one-column SQL select. Used to go from
    the analytics events dictionaries to the format used by the algorithm.

    page_view_stream: a chunk of analytics events data (any iterable of dicts).
    field: the field to select. (ie. 'keywords')

    Returns the list of values of `field`, in stream order.
    """
    selected = []
    for record in page_view_stream:
        selected.append(record[field])
    return selected

def get_user_activity(analytics_events, userid):
    """
    Convert analytics events into the per-user format used by the algorithm.

    analytics_events: iterable of analytics event dicts
    userid: the user whose events are extracted

    Returns a single-entry dict {userid: [keywords of event 1, keywords of event 2, ...]}.
    """
    events_of_user = (event for event in analytics_events if event['userid'] == userid)

    return {userid: select_field(events_of_user, 'keywords')}

### Back to the original notebook - The Algorithm

The only change in this function is the addition of the `debug` flag to make it less verbose.

In [110]:
def calculate_inverse_document_frequency(entire_document_list,
                                         individual_document_list,
                                         entire_document_list_multiplier=1.0,
                                         individual_document_list_multiplier=2.0,
                                         debug=False):
    """
    Compute a weighted inverse document frequency (IDF) for every keyword.

    Input:
        entire_document_list
            List of Document's Keywords: [[keyword1, keyword2, ...], [keyword1, keyword2, keyword3], ...]
            This is for all the URLs
        individual_document_list
            This is just the documents keywords for the individual user. This should be a subset of the entire_document_list
        entire_document_list_multiplier
            Weight of every document of entire_document_list
        individual_document_list_multiplier
            Weight of every document of individual_document_list
        debug
            When True, print the intermediate counts and scores

    Output:
        Inverse Document Frequency Dict of each word: { keyword1 : 0.0000243, keyword2 : 0.003134, ... }
        Keys are lower-cased keywords. The score of a keyword is
        log(weighted number of documents / weighted number of documents containing the keyword).

    Raises:
        ZeroDivisionError if the weighted count of a keyword is zero (e.g. a multiplier of 0).
    """

    label_document_count = defaultdict(float)
    document_count = (len(entire_document_list) * entire_document_list_multiplier
                      + len(individual_document_list) * individual_document_list_multiplier)

    # TODO: May need to lump together stemmed/lemmatized versions of the terms.
    def _accumulate(document_list, multiplier):
        # Add `multiplier` to the count of every distinct label of every document.
        for document_labels_list in document_list:
            # Lower-case BEFORE de-duplicating, so that a document holding both 'dxp' and 'DXP'
            # is counted once for that label.
            for label_lower_case in {label.lower() for label in document_labels_list}:
                label_document_count[label_lower_case] += multiplier

                # Debugging (%g keeps fractional weighted counts visible, %d would truncate them)
                if debug:
                    print("Updating: %s => %g" % (label_lower_case, label_document_count[label_lower_case]))

    _accumulate(entire_document_list, entire_document_list_multiplier)

    if debug:
        print("\n")

    # Now to account for the individual's pages viewed
    _accumulate(individual_document_list, individual_document_list_multiplier)

    # Convert to Inverse-Log-Scores
    if debug:
        print("Calculating Inverse Log Scores")

    label_document_idf = dict()
    for label, count in label_document_count.items():
        inverse_log_score = log(document_count / count)
        if debug:
            print("%s : %f" % (label, inverse_log_score))
        label_document_idf[label] = inverse_log_score

    return label_document_idf

The core of the algorithm is converted to a function, `compute_score`, to make it reusable. An additional `display_items_count` parameter limits the number of keywords displayed per user.

In [111]:
def compute_score(user_page_views, corpus, display_items_count=None):
    """
    Score every keyword seen by every user with a TF-IDF style weight and print the ranking.

    user_page_views: dict {user: [[keyword, ...], ...]}, one keyword list per visited document
    corpus: list of keyword lists used as the document collection for the IDF computation
    display_items_count: maximum number of keywords printed per user; None prints all of them

    For each keyword the score is (number of occurrences in the user's documents) * IDF, where
    the IDF comes from `calculate_inverse_document_frequency`.
    Nothing is returned: the results are only printed.
    """
    scores_by_user = {}

    for user, visited_documents in user_page_views.items():
        # The individual's history weighs twice as much as the overall page counts.
        idf_by_label = calculate_inverse_document_frequency(corpus, visited_documents, 1.0, 2.0, debug=False)

        # Term frequency: occurrences of each lower-cased label across the user's documents
        occurrences = defaultdict(int)
        for keywords in visited_documents:
            for keyword in keywords:
                occurrences[keyword.lower()] += 1

        scores_by_user[user] = {
            keyword: times_seen * idf_by_label[keyword]
            for keyword, times_seen in occurrences.items()
        }

    # Print results
    print("Final Results")
    print("-------------")

    for user in sorted(scores_by_user):
        keyword_scores = scores_by_user[user]

        print("\n%s" % user)

        # Highest score first; ties keep the order in which the keywords were first counted
        ranked = sorted(keyword_scores, key=keyword_scores.get, reverse=True)
        if display_items_count is not None:
            ranked = ranked[:display_items_count]

        shown_scores = [keyword_scores[keyword] for keyword in ranked]
        for keyword, score in zip(ranked, shown_scores):
            print('\t{:20} : {:>5.4f}'.format(keyword, score))

        print("\n\tDistinct scores for user {}: {} out of {} keywords".format(
            user, len(set(shown_scores)), len(ranked)))

## Algorithm on Sample Data

In [112]:
# Description: Obtain list of entries.
# Input: SQL Server
# Output: [[user, url, labels, date, ...]]
# entry_list = get_entry_list()

# Description: Convert list of entries to map of user to labels
# Need to consider: Do I only do this for unique pages? Do I update the count per visit?
# Current Implementation: Just unique pages
# Input: [[user, url, labels, date, ...]]
# Output: { user: [[label1, label2, label3], [label1, label2, label3], ...], user2: [...] }
# user_to_document_label_map = get_user_to_document_label_list()

# For now, we're using hard-coded values

# These are the labels for all the webpages (under the same domain).
# Note: this baseline list has 11 entries, matching the keywords of /page1 ... /page11
# in `website_pages`; the keywords of /page12 and /page13 are not part of it.
all_webpage_labels_list = [['liferay','bloated','literay','plugin system','features'], \
                           ['liferay','china','dalian'], \
                           ['dalian','liferay','pictures'], \
                           ['china','job fair','liferay dalian'], \
                           ['symposium','liferay','mobile','mobile strategies'], \
                           ['dalian'], \
                           ['dxp', 'pokemon go', 'inflation'], \
                           ['dxp', '7.2', 'alpha'], \
                           ['gratis', 'churning'], \
                           ['graveyard', '7.2', 'DXP'],
                           ['gotcha', 'encryption']]

# These are each individual's labels: one keyword list per page the user viewed.
# Note: Adam's ['globalization', 'economy'] is not in all_webpage_labels_list.
adam_webpage_label_list = [['dalian','liferay','pictures'], \
                           ['symposium','liferay','mobile','mobile strategies'], \
                           ['globalization', 'economy']]

# Note: Betty's ['dalian', 'christmas'] is not in all_webpage_labels_list.
betty_webpage_label_list = [['dalian'], \
                            ['dalian', 'christmas'], \
                            ['liferay','bloated','literay','plugin system','features'], \
                            ['liferay','china','dalian'], \
                            ['dalian','liferay','pictures']]

# Map of user -> list of keyword lists, the input format expected by compute_score
user_to_document_label_list = dict()
user_to_document_label_list['Adam'] = adam_webpage_label_list
user_to_document_label_list['Betty'] = betty_webpage_label_list

### Test1: baseline
Verify the algorithm has the same behavior as the original one.

**NOTE:** This is a verification task. Further tests will be executed on data generated in the analytics events format.

In [113]:
# Baseline run on the hard-coded lists; display_items_count is not set, so every keyword is printed
compute_score(user_page_views=user_to_document_label_list, corpus=all_webpage_labels_list)

Final Results
-------------

Adam
	globalization        : 2.1401
	economy              : 2.1401
	pictures             : 1.7346
	symposium            : 1.7346
	mobile               : 1.7346
	mobile strategies    : 1.7346
	liferay              : 1.5075
	dalian               : 1.2238

	Distinct scores for user Adam: 4 out of 8 keywords

Betty
	dalian               : 2.5865
	christmas            : 2.3514
	liferay              : 2.2258
	bloated              : 1.9459
	literay              : 1.9459
	plugin system        : 1.9459
	features             : 1.9459
	pictures             : 1.9459
	china                : 1.6582

	Distinct scores for user Betty: 5 out of 9 keywords


Expected output: The same as before the modifications described above.

### Prepare data
We are going to generate some data in a format compatible with the data available in the AnalyticsEvent datasource. That datasource is fed by the `analytics-js` and `analytics-java` clients.

The generated data is the stream of page visits for a set of `users` in the date range defined by `sd` (start date) and `ed` (end date). Each user gets a randomly sized list of page visits, from 1 to 14 events (`np.random.randint(1, 15)` excludes its upper bound).

In [114]:
users = ['alice', 'bob', 'adam','betty','carol']

# Time window over which the generated events are spread (December 2017)
sd = datetime.datetime(2017, 12, 1, 0, 0, 0)
ed = datetime.datetime(2017, 12, 31, 23, 59, 0)

# Flat list holding the generated events of all users
analyticsEvent = []

for user in users:
    # np.random.randint excludes the upper bound: each user gets between 1 and 14 page visits
    number_pages = np.random.randint(1, 15)

    # with_replacement keeps its default (True), so a user may visit the same page several times
    analyticsEvent += generate_page_visits(
        user=user, 
        all_website_pages=website_pages, 
        number_pages=number_pages, 
        start_date=sd, 
        end_date=ed)

print ("Generated {} analytics events".format(len(analyticsEvent)))

Generated 32 analytics events


#### Explore the generated data

In [142]:
# One event per (userid, url) pair. Computed once, because it does not depend on the user
# currently being printed.
unique_user_page_visits = filter_unique_page_visits(page_view_stream=analyticsEvent, include_user_filter=True)

for user in users:
    # All the events of this user, duplicates included; reused for the count and the listing
    user_events = [p for p in analyticsEvent if p['userid'] == user]

    total_pages = len(user_events)
    distinct_page_views = sum(1 for p in unique_user_page_visits if p['userid'] == user)

    print ("User {:10} page views: {:5} total, {:5} distinct\n".format(user, total_pages, distinct_page_views))

    for page in user_events:
        print ("\t {:20}  {}".format(page['url'], page['keywords']))

    print("\n")

User alice      page views:     4 total,     3 distinct

	 /page7                ['dxp', 'pokemon go', 'inflation']
	 /page1                ['liferay', 'bloated', 'literay', 'plugin system', 'features']
	 /page10               ['graveyard', '7.2', 'DXP']
	 /page7                ['dxp', 'pokemon go', 'inflation']


User bob        page views:     9 total,     7 distinct

	 /page5                ['symposium', 'liferay', 'mobile', 'mobile strategies']
	 /page1                ['liferay', 'bloated', 'literay', 'plugin system', 'features']
	 /page8                ['dxp', '7.2', 'alpha']
	 /page1                ['liferay', 'bloated', 'literay', 'plugin system', 'features']
	 /page6                ['dalian']
	 /page12               ['globalization', 'economy']
	 /page13               ['dalian', 'christmas']
	 /page8                ['dxp', '7.2', 'alpha']
	 /page2                ['liferay', 'china', 'dalian']


User adam       page views:     8 total,     7 distinct

	 /page5                ['s

#### Distinct keywords

In [116]:
# Gather the keywords of every page, keeping the repetitions across pages
all_keywords = []
for page in website_pages:
    all_keywords.extend(page['keywords'])

# Collapse the repetitions; matching is exact, so 'dxp' and 'DXP' count as two keywords
all_distict_keywords = set(all_keywords)

print("There {} distinct keywords, used {} times".format(
    len(all_distict_keywords),
    len(all_keywords),
))

There 27 distinct keywords, used 36 times


### Fixed corpus vs dynamic one

#### 0. Fixed Corpus
The _fixed corpus_ comes from a well-known data source that is not the analytics event table. In Liferay terms, this could be the `AssetEntry` table for a given asset type.
This source matches 1:1 with the pages available on the target website

In [117]:
len(website_pages)

13

---
#### 1. Dynamic corpus: RAW

This _dynamic corpus_ is extracted from the user activity as found in the analytics event table 
If we take the raw table content without any filtering over the time window selected, the corpus size is now:

In [118]:
len(analyticsEvent)

32

and includes lots of duplicates

---

#### 2. Dynamic corpus: unique page visits

Nevertheless, if we filter for _unique_ page views, the resulting _dynamic corpus_ does not include all the pages available in the _fixed_ one.

In [119]:
len(filter_unique_page_visits(page_view_stream=analyticsEvent, include_user_filter=False))

11

---

#### 3. Dynamic corpus: unique page visits per (userid, url) pairs
Filtering the analytics event data for unique page views per user gives yet another corpus size:

In [120]:
len(filter_unique_page_visits(page_view_stream=analyticsEvent, include_user_filter=True))

25

## Tests #1

Prepare the user data from the `analyticsEvent` to the format required by the algorithm

In [121]:
# Map every user to the keyword lists of all of their events (repeated visits are kept)
user_to_document_label_list = {}

for user in users:
    user_to_document_label_list.update(
        get_user_activity(analyticsEvent, user)
    )

### Test1: Raw user analytics events data, Fixed corpus
This test runs the algorithm against the raw analytics data for the users and uses a fixed corpus for the IDF computation.
* raw analytics events per user: No filters applied. The analytics data of each user may contain duplicates
* fixed corpus: The _corpus_ does NOT come from the analytics event table. Rather, another system (asset engine?) synchronizes the assets (pages) available on the target website.

In [122]:
# Fixed corpus: the keyword list of every page of the website
compute_score(user_page_views=user_to_document_label_list, \
              corpus=select_field(website_pages, 'keywords'), \
              display_items_count=6)

Final Results
-------------

adam
	liferay              : 3.5296
	globalization        : 3.5157
	economy              : 3.5157
	dalian               : 3.1941
	symposium            : 2.2687
	mobile               : 2.2687

	Distinct scores for user adam: 4 out of 6 keywords

alice
	pokemon go           : 2.8702
	inflation            : 2.8702
	dxp                  : 2.5419
	bloated              : 1.9459
	literay              : 1.9459
	plugin system        : 1.9459

	Distinct scores for user alice: 3 out of 6 keywords

betty
	7.2                  : 4.7757
	graveyard            : 4.6518
	dxp                  : 4.3944
	gotcha               : 3.7741
	encryption           : 3.7741
	liferay              : 3.5818

	Distinct scores for user betty: 5 out of 6 keywords

bob
	liferay              : 3.7963
	bloated              : 3.6491
	literay              : 3.6491
	plugin system        : 3.6491
	features             : 3.6491
	alpha                : 3.6491

	Distinct scores for user bob: 2 out of 6

### Test2: Raw user analytics events data, Dynamic corpus version 1.
This test runs the algorithm against the raw analytics data for the users and uses a dynamic corpus for the IDF computation.
* raw analytics events per user: No filters applied. The analytics data of each user may contain duplicates
* dynamic corpus: The _corpus_ is defined as in the "1. Dynamic corpus: RAW" paragraph

In [123]:
# Raw dynamic corpus: one keyword list per analytics event, duplicates included
compute_score(user_page_views=user_to_document_label_list, \
              corpus=select_field(analyticsEvent, 'keywords'), \
              display_items_count=6)

Final Results
-------------

adam
	globalization        : 3.8506
	economy              : 3.8506
	dalian               : 3.4895
	liferay              : 3.3067
	symposium            : 2.4849
	mobile               : 2.4849

	Distinct scores for user adam: 4 out of 6 keywords

alice
	pokemon go           : 3.4859
	inflation            : 3.4859
	dxp                  : 2.7489
	graveyard            : 1.8971
	bloated              : 1.6094
	literay              : 1.6094

	Distinct scores for user alice: 4 out of 6 keywords

betty
	7.2                  : 4.9728
	graveyard            : 4.9460
	gotcha               : 4.3190
	encryption           : 4.3190
	dxp                  : 4.2435
	dalian               : 3.7296

	Distinct scores for user betty: 5 out of 6 keywords

bob
	alpha                : 3.9322
	dalian               : 3.6119
	liferay              : 3.4700
	bloated              : 3.2189
	literay              : 3.2189
	plugin system        : 3.2189

	Distinct scores for user bob: 4 out of 6

### Test3: Raw user analytics events data, Dynamic corpus version 2.
This test runs the algorithm against the raw analytics data for the users and uses a dynamic corpus for the IDF computation.
* raw analytics events per user: No filters applied. The analytics data of each user may contain duplicates
* dynamic corpus: The _corpus_ is defined as in the "2. Dynamic corpus: unique page visits" paragraph

In [124]:
# Dynamic corpus: one event per distinct url found in the event stream, whoever the user is
dynamic_corpus = filter_unique_page_visits(page_view_stream=analyticsEvent, include_user_filter=False)
compute_score(user_page_views=user_to_document_label_list, \
              corpus=select_field(dynamic_corpus, 'keywords'), \
              display_items_count=6)

Final Results
-------------

adam
	globalization        : 3.3728
	economy              : 3.3728
	liferay              : 3.2437
	dalian               : 2.9798
	symposium            : 2.1972
	mobile               : 2.1972

	Distinct scores for user adam: 4 out of 6 keywords

alice
	pokemon go           : 2.6700
	inflation            : 2.6700
	dxp                  : 2.2416
	bloated              : 1.8458
	literay              : 1.8458
	plugin system        : 1.8458

	Distinct scores for user alice: 3 out of 6 keywords

betty
	7.2                  : 4.5256
	graveyard            : 4.4642
	dxp                  : 4.1444
	gotcha               : 3.6491
	encryption           : 3.6491
	liferay              : 3.3942

	Distinct scores for user betty: 5 out of 6 keywords

bob
	liferay              : 3.5296
	bloated              : 3.5157
	literay              : 3.5157
	plugin system        : 3.5157
	features             : 3.5157
	alpha                : 3.5157

	Distinct scores for user bob: 2 out of 6

### Test4: Raw user analytics events data, Dynamic corpus version 3.
This test runs the algorithm against the raw analytics data for the users and uses a dynamic corpus for the IDF computation.
* raw analytics events per user: No filters applied. The analytics data of each user may contain duplicates
* dynamic corpus: The _corpus_ is defined as in the "3. Dynamic corpus: unique page visits per (userid, url) pairs" paragraph

In [125]:
# Dynamic corpus: one event per distinct (userid, url) pair found in the event stream
dynamic_corpus = filter_unique_page_visits(page_view_stream=analyticsEvent, include_user_filter=True)
compute_score(user_page_views=user_to_document_label_list, \
              corpus=select_field(dynamic_corpus, 'keywords'), \
              display_items_count=6)

Final Results
-------------

adam
	globalization        : 3.8436
	economy              : 3.8436
	dalian               : 3.0166
	liferay              : 2.8714
	symposium            : 2.3273
	mobile               : 2.3273

	Distinct scores for user adam: 4 out of 6 keywords

alice
	pokemon go           : 3.4095
	inflation            : 3.4095
	dxp                  : 3.0348
	graveyard            : 2.1102
	7.2                  : 1.7047
	bloated              : 1.5506

	Distinct scores for user alice: 5 out of 6 keywords

betty
	7.2                  : 5.2870
	graveyard            : 5.1817
	dxp                  : 4.6704
	gotcha               : 4.3944
	encryption           : 4.3944
	dalian               : 3.2958

	Distinct scores for user betty: 5 out of 6 keywords

bob
	alpha                : 3.9389
	7.2                  : 3.3635
	dalian               : 3.1594
	bloated              : 3.1280
	literay              : 3.1280
	plugin system        : 3.1280

	Distinct scores for user bob: 4 out of 6

## Test #2

For this round of tests, we make sure the user data is filtered to be unique.

In [126]:
# Keep a single event per (userid, url) pair, so a page counts once per user however often it was visited
unique_userid_page_analytics_event = filter_unique_page_visits(analyticsEvent, include_user_filter=True)

user_to_document_label_filtered_list = {}

for user in users:
    user_to_document_label_filtered_list.update(
        get_user_activity(unique_userid_page_analytics_event, user)
    )

In [127]:
# The four corpus variants, in the order in which they were introduced
test_corpus_list = [
    # 0. Fixed corpus
    website_pages,
    # 1. Dynamic corpus: RAW
    analyticsEvent,
    # 2. Dynamic corpus: unique page visits
    filter_unique_page_visits(analyticsEvent, include_user_filter=False),
    # 3. Dynamic corpus: unique page visits per (userid, url) pair
    filter_unique_page_visits(analyticsEvent, include_user_filter=True),
]

for test_corpus in test_corpus_list:
    compute_score(
        user_page_views=user_to_document_label_filtered_list,
        corpus=select_field(test_corpus, 'keywords'),
        display_items_count=6,
    )
    print("\n")

Final Results
-------------

adam
	liferay              : 3.2437
	dalian               : 2.9798
	symposium            : 2.1972
	mobile               : 2.1972
	mobile strategies    : 2.1972
	christmas            : 2.1972

	Distinct scores for user adam: 3 out of 6 keywords

alice
	dxp                  : 1.9971
	pokemon go           : 1.8458
	inflation            : 1.8458
	bloated              : 1.8458
	literay              : 1.8458
	plugin system        : 1.8458

	Distinct scores for user alice: 2 out of 6 keywords

betty
	7.2                  : 3.0082
	liferay              : 2.9798
	dalian               : 2.9798
	dxp                  : 2.6999
	gotcha               : 2.1972
	encryption           : 2.1972

	Distinct scores for user betty: 4 out of 6 keywords

bob
	liferay              : 2.9798
	dalian               : 2.9798
	symposium            : 2.1972
	mobile               : 2.1972
	mobile strategies    : 2.1972
	bloated              : 2.1972

	Distinct scores for user bob: 2 out of 6

### Summary

The previous tests show how the different combinations of _corpus_ and filters applied to the analytics event data affect the result of a _single_ iteration of the algorithm.

In particular, it is worth noting:
* effects on keyword ordering
* score ranges and deltas
* how filtering on user activity (Test #2) affects the selected keywords
* all the numbers above refer to a _single_ run of the algorithm with different initial parameters/inputs

**Additional notes:**
* Generated user activity is random and might not reflect the behavior of a user browsing a website according to his/her interests.