# Merge required data

In [1]:
import pandas as pd
import os

# Load the (user, movie, rating) triples, ordered by user and then by movie.
cols = ["userID", "movieID", "rating"]
ratings = (
    pd.read_table("movielens-2k/user_ratedmovies.dat", usecols=cols)
    .sort_values(by=["userID", "movieID"])
)

In [2]:
# Load the movie metadata and expose the raw "id" column as "movieID"
# (appended as the last column), ordered by that ID.
cols = ["id", "title", "year", "imdbPictureURL"]
movies = (
    pd.read_table("movielens-2k/movies.dat", usecols=cols, encoding="latin-1")
    .assign(movieID=lambda frame: frame.id)
    .drop("id", axis=1)
    .sort_values(by="movieID")
)

In [3]:
genres = pd.read_table("movielens-2k/movie_genres.dat")


def concat(a):
    """Join the grouped string values into one ", "-separated string."""
    return ", ".join(a)


# One row per movieID, with the remaining columns collapsed by `concat`.
genres = (
    genres.groupby(by="movieID", as_index=False)
    .agg(concat)
    .reset_index(drop=True)
    .sort_values(by="movieID")
)

In [4]:
# Attach movie metadata and genres to every rating and drop rows with any
# missing field. Nothing is written to disk here: data/movies.csv.bz2 is
# written by the later to_csv cell, once the IDs have been re-coded, so an
# earlier write of the same path would only be overwritten.
pd_movies = (
    ratings
    .merge(movies, on="movieID")
    .merge(genres, on="movieID")
    .dropna(axis=0)
    .sort_values(by=["userID", "movieID"])
)

In [5]:
# Replace the raw user/movie IDs with their integer category codes.
for id_col in ("userID", "movieID"):
    pd_movies[id_col] = pd_movies[id_col].astype("category").cat.codes

In [6]:
# Discard the index inherited from the merge/sort and renumber the rows.
pd_movies = pd_movies.reset_index(drop=True)

In [7]:
# Persist the merged, re-coded frame (bz2-compressed CSV, no index column).
pd_movies.to_csv(
    "data/movies.csv.bz2",
    index=False,
    encoding="UTF-8",
    compression="bz2",
)

# Preprocessing data for HPF model

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Keep a handle on the full merged frame and extract the rating triples.
movies_all = pd_movies
movies = movies_all.loc[:, ["userID", "movieID", "rating"]]

In [10]:
# cols = ["userID", "movieID", "rating"]
# movies_all = pd.read_csv("data/movies.csv.bz2")
# # movies = pd.read_csv("data/movies.csv.bz2", usecols=cols)
# movies = movies_all[cols]
# movies.head()

In [11]:
# User x movie table of ratings; pairs without a rating are filled with 0.
rating_table = pd.pivot_table(
    movies,
    values="rating",
    index="userID",
    columns="movieID",
    fill_value=0,
)
rating = rating_table.values

## Compare ratings with author's result

In [12]:
# Write the rating matrix as text with one decimal per value.
np.savetxt("data/ratings.txt.gz", rating, fmt="%.1f")

# Compare data used for modeling with the author's results

In [13]:
# Reference copies of the merged frame and the rating matrix, read from the
# sibling ../recsys-hpf-master checkout.
pd_movies_org = pd.read_csv("../recsys-hpf-master/data/movies.csv.bz2")
rating_org = np.loadtxt("../recsys-hpf-master/data/ratings.txt.gz", dtype=np.float32)

In [14]:
# Stack both frames and drop every row that occurs more than once; the result
# is True when no row is left over.
pd.concat([pd_movies, pd_movies_org]).drop_duplicates(keep=False).shape[0]==0

True

In [15]:
# True when both rating matrices have the same shape and elements.
np.array_equal(rating, rating_org)

True

# HPF modeling

In [16]:
%matplotlib inline

import numpy as np
import edward as ed
import tensorflow as tf
import matplotlib.pyplot as plt

In [17]:
seed = 42
ed.set_seed(seed)  # seed Edward with the value above
plt.style.use("ggplot")
# Same path the rating matrix was written to with np.savetxt earlier on.
fname = "data/ratings.txt.gz"

## Data

In [18]:
# data = np.loadtxt(fname, dtype=np.float32)
data = rating
idx = data.nonzero()
tidx = tf.constant(np.column_stack(idx))
y = data[idx]
n,m = data.shape
%xdel data
y = np.ceil(y)
print(n,m)

2113 9936


In [19]:
data_org = np.loadtxt("data/ratings.txt.gz", dtype=np.float32)
idx = data_org.nonzero()
tidx = tf.constant(np.column_stack(idx))
y_org = data_org[idx]
n,m = data_org.shape
%xdel data_org
y_org = np.ceil(y_org)
print(n,m)

2113 9936


In [20]:
# The in-memory and on-disk matrices must yield the same observed ratings.
np.array_equal(y, y_org)

True

## Model

In [21]:
from edward.models import Poisson, Gamma
from edward.models import PointMass, Empirical

In [22]:
k = 20  # size of the latent dimension (the k in θ_uk and β_ik below)
n_iter = 500  # iteration count for the MAP inference and the training loop
t = 500  # leading dimension of the tensors backing the Empirical posteriors

#### Priors ####

# NOTE(review): Gamma arguments are presumably (concentration, rate) — confirm
# against the installed Edward version.
act = Gamma(1.0, 1.0, sample_shape=n) # Users activity (ξ_u)
pref = Gamma(1.0, act, sample_shape=k) # Users preference (θ_uk)

pop = Gamma(0.3, 0.3, sample_shape=m) # Items popularity (η_i)
attr = Gamma(1.0, pop, sample_shape=k) # Items attribute (β_ik)

# Poisson rate of each observed rating: the entries of prefᵀ·attr picked out
# by the (row, column) pairs stored in tidx.
like = Poisson(tf.gather_nd(tf.matmul(pref, attr, transpose_a=True), tidx)) # y_ui


#### Posteriors ####

# Every posterior wraps a randomly initialised variable passed through
# softplus, which keeps the values positive. act/pop use Empirical with
# [t, ·] tensors; pref/attr use PointMass with [k, ·] tensors.
qact = Empirical(
    tf.nn.softplus(tf.Variable(tf.random_normal([t,n]))),
)
qpref = PointMass(
    tf.nn.softplus(tf.Variable(tf.random_normal([k,n]))),
)
qpop = Empirical(
    tf.nn.softplus(tf.Variable(tf.random_normal([t,m]))),
)
qattr = PointMass(
    tf.nn.softplus(tf.Variable(tf.random_normal([k,m]))),
)

## Inference

In [23]:
# Gibbs inference over act/pop, with the ratings observed and pref/attr bound
# to their point-mass posteriors through `data`.
inference_e = ed.Gibbs(
    {act:qact, pop:qpop}, 
    data={like:y, pref:qpref, attr:qattr},
)

# MAP inference over pref/attr, with the ratings observed and act/pop bound
# to their Empirical posteriors through `data`.
inference_m = ed.MAP(
    {pref:qpref, attr:qattr},
    data={like:y, act:qact, pop:qpop},
)

inference_e.initialize()
inference_m.initialize(n_iter=n_iter, optimizer="rmsprop")

# Initialise every TensorFlow variable created above.
tf.global_variables_initializer().run()

  not np.issubdtype(value.dtype, np.float) and \


In [None]:
# Alternate one Gibbs update and one MAP update per iteration, recording the
# MAP loss of every step.
loss = np.empty(n_iter, dtype=np.float32)

for step in range(n_iter):
    inference_e.update()
    map_info = inference_m.update()
    loss[step] = map_info["loss"]
    inference_m.print_progress(map_info)

405/500 [ 81%] ████████████████████████       ETA: 22s | Loss: 1575262.625

In [None]:
# Loss curve, scaled by its maximum value.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(loss / loss.max())
ax.set_title("Loss")
ax.set_xlabel("Iteration")
fig.savefig("images/loss.png", transparent=True)

## Save

In [None]:
# Session handle used by the cells below to evaluate the posteriors.
sess = ed.get_session()

In [None]:
# Evaluate the point-mass posteriors once; kept for ad-hoc inspection.
pref_0=sess.run(qpref)
attr_0=sess.run(qattr)

In [None]:
# Store the recorded loss values (np.save appends the .npy extension).
np.save("data/loss", loss)

In [None]:
# Store evaluated act/pop posteriors under the keys "act" and "pop".
np.savez("data/act-pop", act=sess.run(qact), pop=sess.run(qpop))

In [None]:
# Store evaluated pref/attr posteriors; the Results section reads them back
# from data/pref-attr.npz.
np.savez("data/pref-attr", pref=sess.run(qpref), attr=sess.run(qattr))

# Results

In [None]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG; the random picks in the plotting helpers below
# draw from it.
np.random.seed(42)

In [None]:
# Read back the rating matrix and the fitted factor matrices.
rating = np.loadtxt("data/ratings.txt.gz", dtype=np.float32) # Rating matrix
n, m = rating.shape

pref_attr = np.load("data/pref-attr.npz")
pref, attr = pref_attr["pref"], pref_attr["attr"]
k = pref.shape[0]

In [None]:
# Rows and columns of the rating matrix loaded above.
print(n,m)

In [None]:
# Reference results from the sibling ../recsys-hpf-master checkout. Their
# shapes get their own *_org names so this comparison cell does not overwrite
# the working k, n and m that later cells rely on.
pref_attr_org = np.load("../recsys-hpf-master/data/pref-attr.npz")
rating_org = np.loadtxt("../recsys-hpf-master/data/ratings.txt.gz", dtype=np.float32)
pref_org = pref_attr_org["pref"]
attr_org = pref_attr_org["attr"]
k_org = pref_org.shape[0]
n_org, m_org = rating_org.shape
print(k_org, n_org, m_org)

In [None]:
# Element-wise equality of our factor matrices with the reference ones.
for ours, reference in ((pref, pref_org), (attr, attr_org)):
    print(np.array_equal(ours, reference))

In [None]:
# One metadata row per movie, indexed by movieID. Built as a single chain so
# that no in-place operation is applied to a slice of `movies_all`.
cols = ["movieID", "title", "imdbPictureURL", "year", "genre"]
meta = (
    movies_all[cols]
    .drop_duplicates()
    .sort_values(by="movieID")
    .set_index("movieID")
)
print(meta.shape)
meta.head()

In [None]:
# Reference metadata table; its first CSV column becomes the index.
meta_org = pd.read_csv("../recsys-hpf-master/data/info.csv.bz2", index_col=0)
meta_org.head()

In [None]:
# True when no row is left after dropping every row that occurs more than
# once in the stacked tables.
pd.concat([meta, meta_org]).drop_duplicates(keep=False).shape[0] == 0

## Heatmap

In [None]:
def heatmap(attr, meta, n=20, lw=0.2, cmap="Reds"):
    """
    Plot the attribute vectors of randomly drawn movies as a heatmap.
    
    The figure is saved to "images/heatmap.png".
    
    Parameters
    ==========
    attr: Movies attributes matrix; column i is the vector of movie i.
    meta: Movies informations DataFrame (needs a "title" column); rows are
        selected by position.
    n: Number of random draws. Duplicate draws are merged, so fewer than n
        rows may be shown.
    lw: Width of the lines separating the heatmap cells.
    cmap: Colormap handed to seaborn.
    """
    n_movies = attr.shape[1]
    
    ids = np.random.randint(meta.shape[0], size=n)
    # Valid columns are 0 .. n_movies-1, so the bound has to be strict:
    # with "<=" an index equal to n_movies would pass and raise IndexError.
    ids = np.unique(ids[ids < n_movies])
    table = pd.DataFrame(attr.T[ids], index=meta.iloc[ids].title)
    
    fig = plt.figure(figsize=(15,6))
    ax = fig.add_subplot(111)
    ax = seaborn.heatmap(table, ax=ax, annot=True, fmt=".2f", linewidths=lw, cmap=cmap)
    ax.set_xlabel("Attributes")
    ax.set_ylabel("Title")
    fig.savefig("images/heatmap.png", transparent=True)

In [None]:
# Heatmap of the fitted attributes; also written to images/heatmap.png.
heatmap(attr, meta)

## MDS

In [None]:
def mds(attr, meta, n=20, cmap="Dark2"):
    """
    Scatter randomly drawn movies along two randomly chosen attribute vectors.
    
    The figure is saved to "images/mds.png".
    
    Parameters
    ==========
    attr: Movies attributes matrix; row j is attribute vector j, column i
        belongs to movie i.
    meta: Movies informations DataFrame (needs "title" and "year" columns);
        rows are selected by position.
    n: Number of random draws. Duplicate draws are merged.
    cmap: Name of the matplotlib colormap used for points and labels.
    
    Raises
    ======
    ValueError: If attr has fewer than two attribute vectors, since two
        distinct axes cannot be drawn then.
    """
    # Take the number of attribute vectors from the argument itself instead
    # of relying on a notebook-level variable that may describe other data.
    n_attr, n_movies = attr.shape
    if n_attr < 2:
        raise ValueError("mds needs at least two attribute vectors")
    
    tohex = plt.cm.colors.rgb2hex
    cmap = plt.cm.get_cmap(cmap)
    kwargs = dict(horizontalalignment="center", weight="bold", verticalalignment="center")
    line = dict(color="grey", linestyle="dashed")
    fig = plt.figure(figsize=(12,5))
    
    # Two distinct attribute vectors: redraw ix until it differs from iy.
    iy = np.random.randint(n_attr)
    ix = iy
    while ix == iy:
        ix = np.random.randint(n_attr)
    
    ids = np.unique(np.random.randint(meta.shape[0], size=n))
    # Strict bound: n_movies itself is not a valid column index.
    ids = ids[ids < n_movies]
    xs = attr[ix, ids]
    ys = attr[iy, ids]
    
    text = [meta.iloc[mid].title + " ({y})".format(y=meta.iloc[mid].year) for mid in ids]
    
    ax = fig.add_subplot(1, 1, 1)
    colors = [tohex(cmap(v)) for v in xs + ys]
    
    # Points sit slightly below their labels so the two do not overlap.
    ax.scatter(xs, ys-0.02, c=colors, edgecolors="white")
    for x, y, s in zip(xs, ys, text):
        ax.text(x, y, s, color=tohex(cmap(x+y)), **kwargs)
    
    # Dashed guides at the median of each axis.
    ax.axvline(np.median(xs), **line)
    ax.axhline(np.median(ys), **line)
    ax.set_ylabel("Attribute vector %d" % iy)
    ax.set_xlabel("Attribute vector %d" % ix)
    
    fig.tight_layout()
    fig.savefig("images/mds.png", transparent=True)

In [None]:
# Scatter plot along two random attribute vectors; also written to images/mds.png.
mds(attr, meta)

In [None]:
# Helper functions

def plot_on(fig, meta, title, given=None):
    """
    Plot movies on the axes of a figure, one movie per axes.
    
    The i-th axes of the figure shows the poster of the i-th row of meta,
    or the text "No Picture" when the poster URL cannot be opened.
    
    Parameters
    ==========
    fig: Matplotlib Figure whose axes are filled in order.
    meta: Movies informations DataFrame; needs the columns "title", "year",
        "genre" and "imdbPictureURL" and at least one row per axes.
    title: The figure title.
    given: The rating list (one value per axes); when present, the rating
        is appended to each label with one decimal.
    
    Returns
    =======
    The figure that was passed in.
    """
    
    from urllib.request import urlopen
    from urllib.error import URLError, HTTPError
    from PIL import Image
    
    for i, ax in enumerate(fig.axes):
        row = meta.iloc[i]
        try:
            # Close the HTTP response as soon as the picture is read. PIL
            # opens images lazily, so the pixel data is loaded explicitly
            # while the response is still open.
            with urlopen(row.imdbPictureURL) as response:
                img = Image.open(response)
                img.load()
            ax.imshow(img)
        except (URLError, HTTPError, ValueError):
            kwargs = dict(horizontalalignment="center", verticalalignment="center")
            fontdict = dict(size=15, weight="bold")
            # Fixed placeholder position, in data coordinates of the axes.
            ax.text(0.5, 150, "No Picture", fontdict=fontdict, **kwargs)
        
        # One genre per line below the title.
        genres = "\n".join(row.genre.split(", "))
        if given is None:
            label = '"{t}"\n{g}\n({y})'.format(t=row.title, y=row.year, g=genres)
        else:
            label = '"{t}"\n{g}\n({y})\n{r:.1f}'.format(t=row.title, y=row.year, g=genres, r=given[i])
        
        ax.set_xlabel(label)
        ax.grid(False)
    
    fig.suptitle(title, fontsize=15, y=1)
    fig.tight_layout()
    return fig

def rated_by(uid, meta, rating, n=5):
    """
    Show a random selection of the movies rated by a given user.
    
    The figure is saved to "images/rated_by.png".
    
    Parameters
    ==========
    uid: A user ID (int), used as row index into the rating matrix.
    meta: The DataFrame containing informations about movies.
    rating: The User-Movie rating matrix.
    n: The number of movies to display (capped at the number of movies
        the user has rated).
    """
    
    user_row = rating[uid]
    rated = user_row.nonzero()[0]
    n = min(n, len(rated))
    picked = np.random.choice(rated, size=n, replace=False)
    
    fig, _ = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(
        fig,
        meta.iloc[picked],
        'Movies rated by user "{}"'.format(uid),
        user_row[picked],
    )
    
    fig.savefig("images/rated_by.png", transparent=True)

def recommend_to(uid, pref, attr, meta, rating, n=5):
    """
    Show movies recommended to a particular user.
    
    Every movie the user has not rated gets a score drawn from a Poisson
    distribution whose rate is the dot product of the user's preference
    vector and the movie's attribute vector; the n highest scores are shown.
    The figure is saved to "images/recommend_to.png".
    
    Parameters
    ==========
    uid: User ID, used as row index into rating and column index into pref.
    pref: Users preferences matrix.
    attr: Movies attributes matrix.
    meta: Movies info dataframe; rows are selected by position.
    rating: Rating matrix.
    n: The number of movies to display.
    """
    title = 'Movies that user "{}" might like'.format(uid)
    
    # Discard movies already rated by the user.
    unrated = np.flatnonzero(rating[uid] == 0)
    
    scores = np.random.poisson(np.dot(pref[:, uid], attr[:, unrated]))
    # argsort returns positions inside `unrated`, not movie IDs; index back
    # into `unrated` so that each score stays attached to its own movie.
    ids = unrated[np.argsort(-scores)][:n]
    
    meta = meta.iloc[ids]
    
    fig,axs = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(fig, meta, title)
    
    fig.savefig("images/recommend_to.png", transparent=True)

def similar_to(mid, attr, meta, rating, n=5):
    """
    Show the movies closest to a given one in the attribute subspace.
    
    Each neighbour is labelled with the median of its non-zero ratings.
    The figure is saved to "images/similar_to.png".
    
    Parameters
    ==========
    mid: Movie ID (column index into attr and rating).
    attr: Movies attributes matrix.
    meta: Movies informations DataFrame; rows are selected by position.
    rating: The User-Movie rating matrix.
    n: The number of similar movies to display.
    """
    r = rating[:, mid]
    r = np.median(r[r.nonzero()])
    
    t,y,g = meta.iloc[mid].title, meta.iloc[mid].year, meta.iloc[mid].genre
    title = 'Movies similar to "{t}": {g} ({y}) - {r:.1f}*'.format(t=t, y=y, g=g, r=r)
    
    dist = np.linalg.norm(attr.T - attr[:, mid], axis=1) # Euclidean distance: lower is closer.
    order = np.argsort(dist)
    # Remove the movie itself by index rather than by assuming it sorts
    # first: another movie at distance 0 could otherwise take its place.
    ids = order[order != mid % attr.shape[1]][:n]
    meta = meta.iloc[ids]
    rating = rating[:, ids]
    #total = np.apply_along_axis(np.count_nonzero, 0, rating) # Uncomment this if you want the mean instead
    #rating = rating.sum(axis=0) / np.where(total == 0, 1, total) # and comment the two following lines.
    rating = np.where(rating == 0, np.nan, rating)
    rating = np.nanmedian(rating, axis=0)
    
    fig,axs = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(fig, meta, title, given=rating)
    
    fig.savefig("images/similar_to.png", transparent=True)

In [None]:
# The random pick is overridden by a fixed user right away, but the draw
# still consumes a value from NumPy's global RNG.
uid = np.random.randint(n)
uid=863

In [None]:
# Random sample of the movies the selected user has rated.
rated_by(uid, meta, rating)

In [None]:
# Recommendations for the same user.
recommend_to(uid, pref, attr, meta, rating)

In [None]:
similar_to(0, attr, meta, rating) # Similar to "Toy Story" (assumes it sits at index 0 — TODO confirm)

In [None]:
def closer_to(mid, n=5):
    """
    List the movies closest to a given one, with their distances.
    
    Reads the notebook-level `attr` and `meta`.
    
    Parameters
    ==========
    mid: Movie ID (column index into attr).
    n: The number of neighbours to return in addition to the movie itself.
        It is an explicit argument so that the notebook-level `n` (the row
        count of the rating matrix) no longer decides the result size.
    
    Returns
    =======
    A new DataFrame with the n+1 closest rows of meta and an extra
    "distance" column, closest first.
    """
    dist = np.linalg.norm(attr.T - attr[:, mid], axis=1) # Euclidean distance: lower is closer.
    ids = np.argsort(dist)[:n+1] # Include the movie itself
    # assign() returns a copy, so no column is written onto a slice of meta.
    return meta.iloc[ids].assign(distance=dist[ids])

In [None]:
info = closer_to(0) # Toy Story's ID (assumed to be index 0 — TODO confirm)
info.head(6)  # first six rows of the result