# Phase 1: Topics of Interest Generation

## Helper Functions

In [5]:
def calculate_inverse_document_frequency(entire_document_list, individual_document_list, \
                                         entire_document_list_multiplier=1.0, individual_document_list_multiplier=2.0):
    """
    Compute a weighted inverse document frequency (IDF) for every label.

    Every document adds its list's multiplier once to each distinct label it
    contains. Labels are compared case-insensitively, so 'DXP' and 'dxp' are
    the same label. The score of a label is
    log(weighted document total / weighted document count of the label).

    Input:
        entire_document_list
            List of Document's Keywords: [[keyword1, keyword2, ...], [keyword1, keyword2, keyword3], ...]
            This is for all the URLs
        individual_document_list
            This is just the documents keywords for the individual user. This should be a subset of the entire_document_list
        entire_document_list_multiplier
            Weight contributed by each document of entire_document_list
        individual_document_list_multiplier
            Weight contributed by each document of individual_document_list

    Output:
        Inverse Document Frequency Dict keyed by lower-cased label: { keyword1 : 0.0000243, keyword2 : 0.003134, ... }
        Note that this will be weighted accordingly.
        Labels whose weighted count is not positive (e.g. they only occur in a list
        whose multiplier is 0) have no defined IDF and are left out of the dict.
    """
    from collections import defaultdict
    from math import log

    label_document_count = defaultdict(float)
    label_document_idf = dict()
    document_count = (len(entire_document_list) * entire_document_list_multiplier) \
                     + (len(individual_document_list) * individual_document_list_multiplier)

    # TODO: May need to lump together stemmed/lemmatized versions of the terms.

    def _add_document_counts(document_list, multiplier):
        # Add `multiplier` to the count of every distinct label of every document.
        for document_labels_list in document_list:
            # Lower-case BEFORE de-duplicating: otherwise 'DXP' and 'dxp' in the same
            # document would count that document twice for one label.
            for label_lower_case in {label.lower() for label in document_labels_list}:
                label_document_count[label_lower_case] += 1 * multiplier

                # Debugging
                print("Updating: %s => %d" % (label_lower_case, label_document_count[label_lower_case]))

    # Count the documents of all the pages first
    _add_document_counts(entire_document_list, entire_document_list_multiplier)

    print("\n")

    # Now to account for the individual's pages viewed
    _add_document_counts(individual_document_list, individual_document_list_multiplier)

    # Convert to Inverse-Log-Scores
    print("Calculating Inverse Log Scores")
    for label, count in label_document_count.items():
        if count <= 0:
            # A zero weight would divide by zero; such a label has no meaningful IDF.
            continue
        inverse_log_score = log(document_count / count)
        print("%s : %f" % (label, inverse_log_score))
        label_document_idf[label] = inverse_log_score

    return label_document_idf

## Algorithm on Sample Data

In [8]:
from collections import defaultdict

# Description: Obtain list of entries.
# Input: SQL Server
# Output: [[user, url, labels, date, ...]]
# entry_list = get_entry_list()

# Description: Convert list of entries to map of user to labels
# Need to consider: Do I only do this for unique pages? Do I update the count per visit?
# Current Implementation: Just unique pages
# Input: [[user, url, labels, date, ...]]
# Output: { user: [[label1, label2, label3], [label1, label2, label3], ...], user2: [...] }
# user_to_document_label_map = get_user_to_document_label_list()

# For now, we're using hard-coded values

# These are the labels for all the webpages (under the same domain)
all_webpage_labels_list = [['liferay','bloated','literay','plugin system','features'],
                           ['liferay','china','dalian'],
                           ['dalian','liferay','pictures'],
                           ['china','job fair','liferay dalian'],
                           ['symposium','liferay','mobile','mobile strategies'],
                           ['dalian'],
                           ['dxp', 'pokemon go', 'inflation'],
                           ['dxp', '7.2', 'alpha'],
                           ['gratis', 'churning'],
                           ['graveyard', '7.2', 'DXP'],
                           ['gotcha', 'encryption']]

# This is each individual's labels (one inner list per page the user viewed)
adam_webpage_label_list = [['dalian','liferay','pictures'],
                           ['symposium','liferay','mobile','mobile strategies'],
                           ['globalization', 'economy']]

betty_webpage_label_list = [['dalian'],
                            ['dalian', 'christmas'],
                            ['liferay','bloated','literay','plugin system','features'],
                            ['liferay','china','dalian'],
                            ['dalian','liferay','pictures']]

user_to_document_label_list = dict()
user_to_document_label_list['Adam'] = adam_webpage_label_list
user_to_document_label_list['Betty'] = betty_webpage_label_list

# Maps user -> {label: score}
user_to_individual_labels_score = dict()

# Calculate TF-IDF for every term
for user, document_label_list in user_to_document_label_list.items():

    # Term frequency here is simply how many times the label appears across the user's pages.
    label_to_count = defaultdict(int)
    label_to_score = dict()

    # We currently weigh the individual's history twice as much as the overall page counts.
    idf_map = calculate_inverse_document_frequency(all_webpage_labels_list,
                                                   document_label_list,
                                                   1.0, 2.0)

    # Calculate document label counts for each label.
    # idf_map is keyed by lower-cased labels, so the counts must use the same key;
    # otherwise a mixed-case label (e.g. 'DXP') raises KeyError in the scoring loop below.
    for label_list in document_label_list:
        for label in label_list:
            label_to_count[label.lower()] += 1

    # Calculate scores with summed labels: score = term count * inverse document frequency
    for label, count in label_to_count.items():
        label_to_score[label] = count * idf_map[label]

    user_to_individual_labels_score[user] = label_to_score


# Print results
print("\nFinal Results")
print("-------------")
for user, label_to_score in sorted(user_to_individual_labels_score.items()):

    print("\n%s" % user)
    # Highest score first
    sorted_labels = sorted(label_to_score, key=label_to_score.get, reverse=True)

    for label in sorted_labels:
        print('\t{:20} : {:>5.4f}'.format(label, label_to_score[label]))

Updating: features => 1
Updating: literay => 1
Updating: liferay => 1
Updating: plugin system => 1
Updating: bloated => 1
Updating: liferay => 2
Updating: china => 1
Updating: dalian => 1
Updating: liferay => 3
Updating: dalian => 2
Updating: pictures => 1
Updating: china => 2
Updating: job fair => 1
Updating: liferay dalian => 1
Updating: liferay => 4
Updating: symposium => 1
Updating: mobile => 1
Updating: mobile strategies => 1
Updating: dalian => 3
Updating: dxp => 1
Updating: pokemon go => 1
Updating: inflation => 1
Updating: 7.2 => 1
Updating: dxp => 2
Updating: alpha => 1
Updating: churning => 1
Updating: gratis => 1
Updating: 7.2 => 2
Updating: dxp => 3
Updating: graveyard => 1
Updating: encryption => 1
Updating: gotcha => 1


Updating: liferay => 6
Updating: dalian => 5
Updating: pictures => 3
Updating: liferay => 8
Updating: symposium => 3
Updating: mobile => 3
Updating: mobile strategies => 3
Updating: economy => 2
Updating: globalization => 2
Calculating Inverse Log Scores
