In [131]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'


In [132]:
# Toy catalogue: each entry is [id string, list of integer link ids].
# Later cells read entry[0] as the book key and entry[1] as that book's links.
books = [
    ['00000000-0000-0000-0000-000000000001', [50, 75, 150, 30, 40, 15]],
    ['00000000-0000-0000-0000-000000000002', [15, 20, 100]],
    ['00000000-0000-0000-0000-000000000003', [50, 60, 70]],
    ['00000000-0000-0000-0000-000000000004', [5, 150, 30, 45, 15]],
    ['00000000-0000-0000-0000-000000000005', [40, 15]]
]

In [133]:
# Forward lookup: book id string -> position of that book in `books`
book_index = dict((entry[0], position) for position, entry in enumerate(books))
# Reverse lookup: position -> book id string
index_book = dict((position, book_id) for book_id, position in book_index.items())

In [134]:
from collections import Counter, OrderedDict

def count_items(l):
    """Tally the objects in `l`, most frequent first.

    Returns an OrderedDict mapping each distinct object to its number of
    occurrences. Entries are ordered by descending count; objects with equal
    counts keep the order in which they were first seen in `l`.
    """
    # most_common() with no argument yields every (item, count) pair,
    # sorted by descending count with a stable sort
    return OrderedDict(Counter(l).most_common())

In [135]:
from itertools import chain 
# Flatten the per-book link lists, keeping each link at most once per book
unique_wikilinks = list(chain.from_iterable(set(book[1]) for book in books))

# How many books contain each link, most frequent first
wikilink_counts = count_items(unique_wikilinks)

# Same de-duplicated flattening, built as an independent list
unique_wikilinks_books = list(chain.from_iterable(set(book[1]) for book in books))
wikilink_book_counts = count_items(unique_wikilinks_books)

In [136]:
# `wikilinks` is a second name for the same list object as `unique_wikilinks`
wikilinks = unique_wikilinks
# Rebuild the ordered link counts from that list
wikilink_counts = count_items(wikilinks)

In [137]:
# Keep every counted link (no minimum-count filter is applied here);
# iterating the ordered counts preserves their most-frequent-first order
links = [t[0] for t in wikilink_counts.items()]

#### Most Linked-to Books

As a final bit of exploration, let's look at the books that are mentioned the most by other books on Wikipedia. We'll take the set of links for each book so that we don't have multiple counts for books that are linked to by another book more than once. 

In [138]:
# One occurrence of each link per book, flattened across all books
unique_wikilinks_books = list(chain.from_iterable(set(book[1]) for book in books))
# Per-link count of how many books contain it
wikilink_book_counts = count_items(unique_wikilinks_books)

## Wikilinks to Index

As with the books, we need to map the Wikilinks to integers. We'll also create the reverse mapping.

In [139]:
# link value -> integer position of that link in `links`
link_index = dict(zip(links, range(len(links))))
# integer position -> link value
index_link = {position: link_value for link_value, position in link_index.items()}

In [140]:
pairs = []

# Collect one (book position, link position) pair for every link of every book.
# Membership is tested against the `link_index` dict (constant time) instead of
# scanning the `links` list; `link_index` is keyed by exactly the values in `links`.
for book in books:
    book_position = book_index[book[0]]
    pairs.extend((book_position, link_index[link]) for link in book[1] if link in link_index)

In [141]:
# Set of all positive (book, link) index pairs, used for membership checks when sampling negatives
pairs_set = set(pairs)

In [142]:
import numpy as np
import random
random.seed(100)

def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0):
    """Endlessly generate training batches of positive and negative (book, link) samples.

    Parameters
    ----------
    pairs : sequence of (book_id, link_id) tuples
        Positive examples to draw from; must hold at least `n_positive` items.
    n_positive : int
        Number of positive examples per batch.
    negative_ratio : int or float
        Number of negative examples per positive example. The number of
        negatives, n_positive * negative_ratio, is rounded to an integer.

    Yields
    ------
    ({'book': ndarray, 'link': ndarray}, ndarray)
        Book ids, link ids and labels (1 for positive, -1 for negative), each of
        length n_positive + number of negatives, shuffled together.

    Raises
    ------
    ValueError
        From random.sample when `n_positive` exceeds len(pairs).

    Notes
    -----
    Reads the notebook-level `books`, `links` and `pairs_set` to draw and
    validate negative examples. Negative sampling retries until it finds a pair
    that is not in `pairs_set`, so it does not terminate if every possible
    (book, link) combination is a positive.
    """
    # np.zeros needs an integer shape; with a float ratio (including the default
    # 1.0) the product is a float, so convert explicitly
    n_negative = int(round(n_positive * negative_ratio))
    batch_size = n_positive + n_negative

    # This creates a generator
    while True:
        # Fresh array for every batch so arrays that were already yielded are
        # never overwritten by later iterations
        batch = np.zeros((batch_size, 3))

        # randomly choose positive examples
        for idx, (book_id, link_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (book_id, link_id, 1)

        # Negatives start right after the positives (also valid when n_positive is 0)
        idx = n_positive

        # Add negative examples until reach batch size
        while idx < batch_size:

            # random selection
            random_book = random.randrange(len(books))
            random_link = random.randrange(len(links))

            # Check to make sure this is not a positive example
            if (random_book, random_link) not in pairs_set:

                # Add to batch and increment index
                batch[idx, :] = (random_book, random_link, -1)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'book': batch[:, 0], 'link': batch[:, 1]}, batch[:, 2]

In [143]:
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model

In [144]:
def book_embedding_model(embedding_size = 50, classification = False):
    """Build and compile the book/link embedding network with the functional API.

    The network takes a book input and a link input, looks each one up in its
    own embedding table and outputs the normalized dot product of the two
    embedding vectors. With `classification` set, a single sigmoid unit is
    stacked on top and the model is compiled with binary cross-entropy and an
    accuracy metric; otherwise it is compiled with mean squared error.
    """

    # Each input carries a single value per sample
    book = Input(name = 'book', shape = [1])
    link = Input(name = 'link', shape = [1])

    # One embedding row per entry of book_index -> (None, 1, embedding_size)
    book_embedding = Embedding(name = 'book_embedding',
                               input_dim = len(book_index),
                               output_dim = embedding_size)(book)

    # One embedding row per entry of link_index -> (None, 1, embedding_size)
    link_embedding = Embedding(name = 'link_embedding',
                               input_dim = len(link_index),
                               output_dim = embedding_size)(link)

    # Normalized dot product along the embedding axis -> (None, 1, 1)
    similarity = Dot(name = 'dot_product', normalize = True, axes = 2)([book_embedding, link_embedding])

    # Collapse to a single number per sample -> (None, 1)
    output = Reshape(target_shape = [1])(similarity)

    # Choose the head and the compile settings, then build the model once
    if classification:
        output = Dense(1, activation = 'sigmoid')(output)
        compile_options = {'loss': 'binary_crossentropy', 'metrics': ['accuracy']}
    else:
        compile_options = {'loss': 'mse'}

    model = Model(inputs = [book, link], outputs = output)
    model.compile(optimizer = 'Adam', **compile_options)

    return model

# Instantiate the model with its defaults (embedding_size = 50, classification = False)
# and show the layer/parameter summary
model = book_embedding_model()
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
book (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
book_embedding (Embedding)      (None, 1, 50)        250         book[0][0]                       
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        600         link[0][0]                       
____________________________________________________________________________________________

In [145]:
# One positive pair per batch
n_positive = 1

# Two negatives per positive, so each batch holds 3 samples
gen = generate_batch(pairs, n_positive, negative_ratio = 2)

# Train; steps_per_epoch is sized so one epoch draws as many positives as there are pairs
# NOTE(review): fit_generator is deprecated in newer Keras releases in favor of fit — confirm against the installed version
h = model.fit_generator(gen, epochs = 6, 
                        steps_per_epoch = len(pairs) // n_positive,
                        verbose = 2)

Epoch 1/6
 - 0s - loss: 1.0053
Epoch 2/6
 - 0s - loss: 0.8345
Epoch 3/6
 - 0s - loss: 0.6695
Epoch 4/6
 - 0s - loss: 0.5650
Epoch 5/6
 - 0s - loss: 0.4440
Epoch 6/6
 - 0s - loss: 0.4050


In [146]:
# Pull the trained book embedding matrix out of the model
book_layer = model.get_layer('book_embedding')
book_weights = book_layer.get_weights()[0]
# Scale every row to unit L2 length (keepdims keeps the norms as a column for broadcasting)
book_weights = book_weights / np.linalg.norm(book_weights, axis = 1, keepdims = True)
# Sanity check: squared length of the first row
np.sum(np.square(book_weights[0]))

1.0

In [156]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

def find_similar(name, weights, n = 3):
    """Find the `n` items most similar to `name` based on embeddings.

    Parameters
    ----------
    name : key of `book_index`
        Identifier of the query item.
    weights : 2-D array-like
        Embedding matrix with one row per item, addressed by the positions
        stored in `book_index`. Similarity is the plain dot product between
        rows, so it equals cosine similarity only when rows have unit length.
    n : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of dict or None
        At most `n` dicts of the form {'courseId': id, 'similarity': str},
        most similar first, never including the query item itself.
        None (after printing a message) when `name` is not in `book_index`.
    """

    index = book_index
    rindex = index_book

    # Only the index lookup can raise KeyError, so keep the try body to that
    try:
        query_idx = index[name]
    except KeyError:
        print(f'{name} Not Found.')
        return

    # Calculate dot product between book and all others
    dists = np.dot(weights, weights[query_idx])

    # Positions ordered from most to least similar
    ranked = np.argsort(dists)[::-1]

    # Remove the query by position first, then cap at n. Slicing n + 1 entries
    # and filtering afterwards returns one item too many whenever the query
    # itself is not among the top n + 1 scores (non-unit rows or ties).
    closest = [c for c in ranked if c != query_idx][:max(n, 0)]

    res = [{'courseId': rindex[c], 'similarity': f'{dists[c]:.{2}}'} for c in closest]
    return res
        
    

In [158]:
import json
# Stand-in request with an empty courseId so the handler cell can be run inside the notebook
# NOTE(review): presumably the kernel gateway supplies the real REQUEST when serving — confirm
REQUEST = json.dumps({ 'path' : { 'courseId': '' } })

caa06c23-ddd5-4440-2e02-08d80eb2d339 Not Found.


In [149]:
# GET /recommendations/:courseId
# REQUEST is a JSON string; the course id is read from its 'path' section
req = json.loads(REQUEST)
courseId = req['path']['courseId']

# find_similar returns None for an unknown id, which is printed as JSON null
recommendations = find_similar(courseId, book_weights)
print(json.dumps(recommendations))

Not Found.


'null'

In [None]:
# jupyter kernelgateway --api='kernel_gateway.notebook_http' --seed_uri='main.ipynb' --port 9090