In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG so the random draws made by the cells below are repeatable.
np.random.seed(42)

In [None]:
# Load the two factor matrices and the rating matrix from disk.
# The .npz archive is opened in a `with` block so its file handle is released
# as soon as both arrays have been read into memory.
with np.load("data/pref-attr.npz") as pref_attr:
    pref = pref_attr["pref"]
    attr = pref_attr["attr"]
rating = np.loadtxt("data/ratings.txt.gz", dtype=np.float32) # Rating matrix

k = pref.shape[0]    # Size of the first axis of `pref`.
n, m = rating.shape  # Rows are indexed by user ID, columns by movie ID.

In [None]:
# Movie information table; the first CSV column is used as the index.
meta = pd.read_csv("data/info.csv.bz2", index_col=0)
meta.head()

### Heatmap

In [None]:
def heatmap(attr, meta, n=20, lw=0.2, cmap="Reds"):
    """
    Draw an annotated heatmap of the attribute vectors of randomly chosen movies.

    Parameters
    ==========
    attr: Movies attributes matrix (one column per movie).
    meta: Movies informations DataFrame; its ``title`` column labels the rows.
    n: The number of movies to display (capped at the number of rows of ``meta``).
    lw: Width of the lines separating the heatmap cells.
    cmap: Name of the colormap handed to seaborn.

    The figure is saved to "images/heatmap.png".
    """
    m = meta.shape[0]

    # Sample without replacement so exactly `n` distinct movies are shown;
    # drawing with replacement and de-duplicating could yield fewer rows.
    ids = np.sort(np.random.choice(m, size=min(n, m), replace=False))
    frame = pd.DataFrame(attr.T[ids], index=meta.iloc[ids].title)

    fig = plt.figure(figsize=(15,6))
    ax = fig.add_subplot(111)
    ax = seaborn.heatmap(frame, ax=ax, annot=True, fmt=".2f", linewidths=lw, cmap=cmap)
    ax.set_xlabel("Attributes")
    ax.set_ylabel("Title")
    fig.savefig("images/heatmap.png", transparent=True)

In [None]:
# Heatmap of the attribute vectors of a random sample of movies (saved to images/heatmap.png).
heatmap(attr, meta)

### MDS

In [None]:
def mds(attr, meta, n=20, cmap="Dark2"):
    """
    Scatter randomly chosen movies along two randomly chosen attribute dimensions.

    Parameters
    ==========
    attr: Movies attributes matrix (one row per attribute, one column per movie).
    meta: Movies informations DataFrame; ``title`` and ``year`` build the labels.
    n: The number of movies to display (capped at the number of rows of ``meta``).
    cmap: Name of the colormap used to colour the points and their labels.

    Raises
    ======
    ValueError: if ``attr`` has fewer than two rows (two distinct axes are needed).

    The figure is saved to "images/mds.png".
    """
    from matplotlib.colors import rgb2hex # Public location of rgb2hex.

    # Take the number of dimensions from the argument, not from a notebook global.
    k = attr.shape[0]
    if k < 2:
        raise ValueError("mds needs at least two attribute dimensions, got %d" % k)
    m = meta.shape[0]
    cmap = plt.get_cmap(cmap) # plt.cm.get_cmap is gone from recent Matplotlib releases.
    kwargs = dict(horizontalalignment="center", weight="bold", verticalalignment="center")
    line = dict(color="grey", linestyle="dashed")
    fig = plt.figure(figsize=(12,5))

    # Pick two distinct attribute dimensions: one for each axis.
    iy = np.random.randint(k)
    ix = iy
    while ix == iy:
        ix = np.random.randint(k)

    # Sample without replacement so exactly `n` distinct movies are plotted.
    ids = np.sort(np.random.choice(m, size=min(n, m), replace=False))
    xs = attr[ix, ids]
    ys = attr[iy, ids]

    text = ["{t} ({y})".format(t=meta.iloc[i].title, y=meta.iloc[i].year) for i in ids]

    ax = fig.add_subplot(1, 1, 1)
    # One colour per movie, looked up from the sum of its two coordinates.
    colors = [rgb2hex(cmap(v)) for v in xs + ys]

    # Markers sit slightly below the labels so the text stays readable.
    ax.scatter(xs, ys-0.02, c=colors, edgecolors="white")
    for x, y, s, c in zip(xs, ys, text, colors):
        ax.text(x, y, s, color=c, **kwargs)

    # Dashed guides at the median of each coordinate.
    ax.axvline(np.median(xs), **line)
    ax.axhline(np.median(ys), **line)
    ax.set_ylabel("Attribute vector %d" % iy)
    ax.set_xlabel("Attribute vector %d" % ix)

    fig.tight_layout()
    fig.savefig("images/mds.png", transparent=True)

In [None]:
# Scatter a random sample of movies along two randomly chosen attribute dimensions (saved to images/mds.png).
mds(attr, meta)

In [None]:
# Helper functions

def plot_on(fig, meta, title, given=None):
    """
    Plot movies on the axes of a figure, one movie per axes.

    Parameters
    ==========
    fig: Matplotlib Figure; movie ``i`` is drawn on ``fig.axes[i]``.
    meta: Movies informations DataFrame (``imdbPictureURL``, ``title``,
          ``year`` and ``genre`` columns are read); needs at least as many
          rows as the figure has axes.
    title: The figure title.
    given: The rating list (optional); ``given[i]`` is appended to the label
           of movie ``i``.

    Returns
    =======
    The figure that was passed in.
    """

    from io import BytesIO
    from urllib.request import urlopen
    from PIL import Image

    for i,ax in enumerate(fig.axes):
        movie = meta.iloc[i]
        try:
            # Read the whole body inside the `with` block so the connection is
            # closed; the timeout keeps a dead host from hanging the notebook.
            with urlopen(movie.imdbPictureURL, timeout=10) as response:
                img = Image.open(BytesIO(response.read()))
            ax.imshow(img)
        except (OSError, ValueError):
            # OSError covers URLError/HTTPError, timeouts and undecodable images;
            # ValueError covers malformed or missing URLs.
            kwargs = dict(horizontalalignment="center", verticalalignment="center")
            fontdict = dict(size=15, weight="bold")
            # Axes coordinates keep the text centred whatever the data limits are.
            ax.text(0.5, 0.5, "No Picture", fontdict=fontdict, transform=ax.transAxes, **kwargs)

        t,y,g = movie.title, movie.year, movie.genre.split(", ")
        g = "\n".join(g) # One genre per line.
        if given is None:
            label = '"{t}"\n{g}\n({y})'.format(t=t,y=y,g=g)
        else:
            r =  given[i]
            label = '"{t}"\n{g}\n({y})\n{r:.1f}'.format(t=t,y=y,g=g,r=r)

        ax.set_xlabel(label)
        ax.grid(False)

    fig.suptitle(title, fontsize=15, y=1)
    fig.tight_layout()
    return fig

def rated_by(uid, meta, rating, n=5):
    """
    Display a random sample of the movies rated by a given user.

    Parameters
    ==========
    uid: A user ID (int), used as a row index into ``rating``.
    meta: The DataFrame containing informations about movies.
    rating: The User-Movie rating matrix; non-zero entries count as rated.
    n: The number of movies to display (at most the number the user rated).

    The figure is saved to "images/rated_by.png".
    """

    user_ratings = rating[uid]
    rated_ids = user_ratings.nonzero()[0]
    n = min(n, len(rated_ids)) # Never ask for more movies than were rated.
    picked = np.random.choice(rated_ids, size=n, replace=False)

    fig, _ = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(
        fig,
        meta.iloc[picked],
        'Movies rated by user "{}"'.format(uid),
        user_ratings[picked],
    )

    fig.savefig("images/rated_by.png", transparent=True)

def recommend_to(uid, pref, attr, meta, rating, n=5):
    """
    Show movies recommended to a particular user.

    Parameters
    ==========
    uid: User ID.
    pref: Users preferences matrix (one column per user).
    attr: Movies attributes matrix (one column per movie).
    meta: Movies info dataframe.
    rating: Rating matrix (users x movies); zero entries count as not rated.
    n: The number of movies to display (at most the number of unrated movies).

    The figure is saved to "images/recommend_to.png".
    """
    title = 'Movies that user "{}" might like'.format(uid)

    # Discard movies already rated by the user.
    unrated = np.flatnonzero(rating[uid] == 0) # Not rated movies IDs

    # One Poisson draw per unrated movie, with rate <user preference, movie attributes>.
    scores = np.random.poisson(np.dot(pref[:, uid], attr[:, unrated]))

    # argsort yields positions inside `unrated`; index back into it to get
    # real movie IDs, best score first.
    ids = unrated[np.argsort(-scores)] # Recommended movies IDs

    n = min(n, len(ids)) # Never create more axes than there are candidates.
    meta = meta.iloc[ids[:n]]

    fig,axs = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(fig, meta, title)

    fig.savefig("images/recommend_to.png", transparent=True)

def similar_to(mid, attr, meta, rating, n=5):
    """
    Show movies closer to a given one in the attribute subspace.

    Parameters
    ==========
    mid: Movie ID
    attr: Movies attributes matrix (one column per movie).
    meta: Movies informations DataFrame.
    rating: The User-Movie rating matrix; zero entries count as not rated.
    n: The number of similar movies to display.

    The figure is saved to "images/similar_to.png".
    """
    # Median of the ratings the reference movie actually received.
    r = rating[:, mid]
    r = np.median(r[r.nonzero()])

    movie = meta.iloc[mid]
    title = 'Movies similar to "{t}": {g} ({y}) - {r:.1f}*'.format(
        t=movie.title, y=movie.year, g=movie.genre, r=r)

    # Euclidean distance from every movie to the reference one: lower is closer.
    dist = np.linalg.norm(attr.T - attr[:, mid], axis=1)
    order = np.argsort(dist)
    # Drop the movie itself by ID rather than by position: another movie with
    # an identical attribute vector may sort ahead of it.
    own = range(attr.shape[1])[mid] # Normalises a negative `mid`.
    ids = order[order != own][:n]

    meta = meta.iloc[ids]
    # Median rating of each neighbour, ignoring the "not rated" zeros.
    # (For the mean instead: rating.sum(axis=0) divided by the non-zero count.)
    neighbours = rating[:, ids]
    neighbours = np.where(neighbours == 0, np.nan, neighbours)
    neighbours = np.nanmedian(neighbours, axis=0)

    fig,axs = plt.subplots(nrows=1, ncols=n, figsize=(3*n, 6), sharey=True)
    fig = plot_on(fig, meta, title, given=neighbours)

    fig.savefig("images/similar_to.png", transparent=True)

In [None]:
# Draw a random user ID in [0, n), where n is the number of rows of the rating matrix.
uid = np.random.randint(n)

In [None]:
# Show a sample of the movies this user has rated, with the ratings given.
rated_by(uid, meta, rating)

In [None]:
# Show movies suggested for the same user.
recommend_to(uid, pref, attr, meta, rating)

In [None]:
# Show the nearest neighbours of movie 0 in the attribute subspace.
similar_to(0, attr, meta, rating) # Similar to "Toy Story"

In [None]:
def closer_to(mid, n=5):
    """
    Return the movies closest to a given one in the attribute subspace.

    Reads the notebook-level ``attr`` and ``meta`` variables.

    Parameters
    ==========
    mid: Movie ID
    n: The number of neighbours to return in addition to the movie itself.

    Returns
    =======
    A new DataFrame holding the ``n + 1`` closest rows of ``meta`` (closest
    first) with an extra "distance" column; ``meta`` itself is not modified.
    """
    # Euclidean distance from every movie to the reference one: lower is closer.
    dist = np.linalg.norm(attr.T - attr[:, mid], axis=1)
    # `n` is a parameter here; the notebook-level `n` is the number of users.
    ids = np.argsort(dist)[:n+1] # Include the movie itself
    # assign() builds a new frame instead of writing into an .iloc slice.
    return meta.iloc[ids].assign(distance=dist[ids])

In [None]:
# Table of the movies nearest to movie 0, with their distances; display the first six rows.
info = closer_to(0) # Toy Story's ID
info.head(6)