In [1]:
import numpy as np
import tensorflow as tf
import time

# Load data

In [2]:
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score


mnist = fetch_mldata('MNIST original', data_home='./tmp')

# only binary classification supported
mask = (mnist['target'] == 3) + (mnist['target'] == 5)

X_all = scale(mnist['data'][mask].astype(float))
y_all = (mnist['target'][mask]==3)*1

# make it more sparse
X_all = X_all * (np.random.uniform(0, 1, X_all.shape) > 0.8)



print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {}'.format(np.mean(X_all != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_all==0), np.mean(y_all==1)))


X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)

Dataset shape: (13454, 784)
Non-zeros rate: 0.163292799653
Classes balance: 0.469228482236 / 0.530771517764


# Baselines

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
for model in [
                LogisticRegression(), 
                RandomForestClassifier(n_jobs=-1, n_estimators=200)
            ]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('accuracy: {}'.format(acc))
    print()

model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
accuracy: 0.894723804806
()
model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
accuracy: 0.896457765668
()


# Dense input

In [4]:
from tffm import TFFMClassifier

for (order, rank) in [(2, 3), (3, 10)]:
    model = TFFMClassifier(
                order=order, 
                rank=rank, 
                optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
                n_epochs=100, 
                batch_size=-1,
                init_std=0.001,
                input_type='dense'
    )
    model.fit(X_tr, y_tr, show_progress=True)
    predictions = model.predict(X_te)
    print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, predictions)))
    model.destroy()  # this will close tf.Session and free resources

100%|██████████| 100/100 [00:18<00:00,  5.62epoch/s]


[order=2] accuracy: 0.873173148378


100%|██████████| 100/100 [00:33<00:00,  2.96epoch/s]


[order=3] accuracy: 0.918503839485


# Sparse part

In [5]:
import scipy.sparse as sp
# only CRS format supported
X_tr_sparse = sp.csr_matrix(X_tr)
X_te_sparse = sp.csr_matrix(X_te)

In [6]:
model = TFFMClassifier(
            order=3, 
            rank=10, 
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
            n_epochs=100, 
            batch_size=-1,
            init_std=0.001,
            input_type='sparse'
)
model.fit(X_tr_sparse, y_tr, show_progress=True)
predictions = model.predict(X_te_sparse)
print('[rank={}] accuracy: {}'.format(rank, accuracy_score(y_te, predictions)))
model.destroy()  # this will close tf.Session and free resources

100%|██████████| 100/100 [00:34<00:00,  2.98epoch/s]


[rank=10] accuracy: 0.918999256874
