# Matrix Factorizations

In [1]:
import numpy as np
import pandas as pd
import spacy
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pymc3 as pm
import theano
import theano.tensor as tt

In [2]:
def mask(token):
    # Helper function to mask out non-tokens
    if (not token.is_ascii
            or token.is_stop
            or token.like_num
            or token.pos_ in ['X', 'SYM']):
        return False
    return True


def tokenize(document):
    # Tokenize by lemmatizing
    doc = nlp(document)
    return [token.lemma_ for token in doc if mask(token)]

In [3]:
# Disable tagger, parser and named-entity recognition
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

# Read data
DATA_FILE = 'NeutralPolitics.csv'
data = pd.read_csv(DATA_FILE).squeeze()

In [4]:
data.head()

0     what points to no collusion which of these cl...
1    the unhrc replaced the un commission on human ...
2    when they replaced the terrible commission on ...
3    the flores decision flores v reno and subseque...
4    the protestors in question are being led bycom...
Name: text, dtype: object

In [5]:
# Vectorize data using tf-idfs
vectorizer = TfidfVectorizer(strip_accents='unicode',
                             tokenizer=tokenize,
                             max_df=0.90,
                             min_df=0.001,
                             norm='l2')

tfidf = vectorizer.fit_transform(data)
feature_names = vectorizer.get_feature_names()

## NMF (Non-Negative Matrix Factorization)

Some people in the collaborative filtering space refer to this method as SVD, despite it having very little to do with SVD.

In [6]:
# Factorize with NMF.
nmf = NMF(n_components=20,
          random_state=1618,
          alpha=0.2)  # L2 regularization

W = nmf.fit_transform(tfidf)
H = nmf.components_
err = nmf.reconstruction_err_

In [None]:
# Print clusters and exemplars.
for topic_idx, [scores, topic] in enumerate(zip(np.transpose(W), H)):
    print('Cluster #{}:'.format(topic_idx))
    print('Cluster importance: {}'.format(
        float((np.argmax(W, axis=1) == topic_idx).sum()) / W.shape[0]))

    for token, importance in zip(
            [feature_names[i] for i in np.argsort(topic)[:-10 - 1:-1]],
            np.sort(topic)[:-15 - 1:-1]):
        print('{}: {:2f}'.format(token, importance))

    print('')

    for exemplar_idx in np.argsort(scores)[-5:]:
        print(exemplar_idx)
        print(data[exemplar_idx])
        print('')

    print('----------')

## PMF (Probabilistic Matrix Factorization)

In [7]:
def sparse_std(tfidf, axis=None):
    """ Standard deviation of scipy.sparse matrix, via [E(X^2) - E(X)^2]^(1/2) """
    return np.sqrt(np.mean(tfidf.power(2), axis=axis) - np.square(np.mean(tfidf, axis=axis)))

In [8]:
rows, columns, entries = scipy.sparse.find(tfidf)

n, m = tfidf.shape
dim = 20

sigma = entries.std()
sigma_u = sparse_std(tfidf, axis=1).mean()
sigma_v = sparse_std(tfidf, axis=0).mean()

In [23]:
entries.max()

1.0

In [15]:
sigma

0.15254229531386043

In [16]:
sigma_u

0.020264763601418535

In [18]:
sigma_v

0.014763014069451984

In [9]:
'''
# Naive implementation, will not work.
with pm.Model() as pmf:
    U = pm.Normal('U', mu=0, sd=sigma_u, shape=[n, dim])
    V = pm.Normal('V', mu=0, sd=sigma_v, shape=[m, dim])
    R = pm.Normal('R', mu=tt.dot(U, V.T), sd=sigma, shape=[n, m], observed=tfidf)

    map_estimate = pm.find_MAP()
''';

In [14]:
# This doesn't seem to work either...?
with pm.Model() as pmf:
    U = pm.Normal('U', mu=0, sd=sigma_u, shape=[n, dim])
    V = pm.Normal('V', mu=0, sd=sigma_v, shape=[m, dim])
    R_nonzero = pm.Normal('R_nonzero',
                          mu=tt.sum(np.multiply(U[rows, :], V[columns, :]), axis=1),
                          sd=sigma,
                          observed=entries)
    
    map_estimate = pm.find_MAP()

logp = 5.5661e+05, ||grad|| = 0: 100%|██████████| 2/2 [00:00<00:00,  9.50it/s]   


## Bayesian Probabilistic Matrix Factorization (BPMF)

## References

[1] https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf

[2] https://www.cs.toronto.edu/~amnih/papers/bpmf.pdf