In [1]:
import time 
import logging
import theano
import scipy as sp
from theano import tensor as tt

# Fetch the root logger (getLogger() with no name) and set its threshold to INFO
# so the logging.info(...) progress messages below are not filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
class PMF:
    """Probabilistic Matrix Factorization model built with PyMC3.

    The rating matrix is modelled as the product of two latent factor
    matrices, ``U`` (one row per row of the data) and ``V`` (one row per
    column of the data), with Gaussian priors on both factors and a Gaussian
    likelihood on the observed (non-NaN) entries.
    """

    def __init__(self, train, dim, alpha=2, std=0.01, bounds=(1, 5)):
        """Impute missing values and build the PyMC3 model.

        Parameters
        ----------
        train : array-like
            Data matrix with NaN marking missing entries. It is copied, so the
            caller's array is left untouched. Assumed to be a 2-D NumPy float
            array (its shape is unpacked into two values) -- TODO confirm.
        dim : int
            Number of latent dimensions.
        alpha : float
            Precision (``tau``) of the Gaussian likelihood.
        std : float
            Scale of the random initial test values given to ``U`` and ``V``.
            Note that ``self.std`` is NOT this argument: it is derived from
            ``alpha`` as ``sqrt(1 / alpha)``.
        bounds : tuple
            ``(low, high)`` pair stored on ``self.bounds``.
        """
        # Display name returned by __str__.
        self.name = 'PMF'
        self.dim = dim  # latent dimension
        self.alpha = alpha  # likelihood precision (passed as `tau` below)
        self.std = np.sqrt(1/alpha)  # likelihood standard deviation implied by alpha
        self.bounds = bounds
        self.data = train.copy()
        n, m = self.data.shape

        nan_mask = np.isnan(self.data)  # True where an entry is missing
        # Mean-value imputation: fill the missing entries with the mean of the observed ones.
        self.data[nan_mask] = self.data[~nan_mask].mean()

        # Prior precisions: inverse of the mean per-row / per-column variance
        # of the imputed data.
        self.alpha_u = 1/ self.data.var(axis=1).mean()
        self.alpha_v = 1/ self.data.var(axis=0).mean()

        logging.info('building the PMF model')
        with pm.Model() as pmf:
            U = pm.MvNormal('U', mu = 0, tau = self.alpha_u * np.eye(dim),
                           shape = (n,dim), testval = np.random.randn(n,dim) * std)
            V = pm.MvNormal('V', mu = 0, tau = self.alpha_v * np.eye(dim),
                            shape = (m,dim), testval=np.random.randn(m,dim)*std)
            # The likelihood is placed on the originally observed entries only.
            R = pm.Normal(
                'R', mu= tt.dot(U, V.T)[~nan_mask], tau=self.alpha,
                observed=self.data[~nan_mask])

        logging.info('done building the PMF model')
        self.model = pmf

    def __str__(self):
        """Return the model's display name.

        Falls back to the class name when ``name`` has not been set (for
        example on an instance created without running ``__init__``), instead
        of raising AttributeError.
        """
        return getattr(self, 'name', type(self).__name__)


In [None]:
def _find_map(self):
    tstart = time.time()
    with self.model:
        logging.info('finding PMF MAP using L-bfgs-b optimization...')
        self._map = pm.find_MAP(method='L-BFGS-B')
    elapsed = int(time.time()- tstart)
    logging.info('found PMF MAP in %d seconds'% elapsed)
    return self._map

def _map(self):
    try:
        return self._map
    except :
        return self.find_map()
    
# Attach the MAP helpers to the PMF class: `find_map` becomes a regular method,
# `map` a getter-only property backed by `_map`.
PMF.find_map = _find_map
PMF.map = property(_map)

In [None]:
def _draw_samples(self, **kwargs):
    kwargs.setdefault('chains', 1)
    with self.model:
        self.trace =pm.sample(**kwargs)
# Attach the sampling helper to the PMF class as the `draw_samples` method.
PMF.draw_samples = _draw_samples

In [None]:
def _predict(self, U, V):
    R = np.dot(U, V.T)
    n,m = R.shape
    sample_R = np.random.normal(R, self.std) # 
    
    low, high = self.bounds
    sample_R[sample_R< low] = low
    sample_R[sample_R> high] = high
    return sample_R
# Attach the prediction helper to the PMF class as the `predict` method.
PMF.predict = _predict

In [None]:
def rmse(test_data, predicted):
    """Root-mean-squared error over the non-NaN entries of ``test_data``.

    Entries that are NaN in ``test_data`` are excluded from both the sum of
    squared errors and the count it is divided by.
    """
    observed = ~np.isnan(test_data)
    residual = (test_data - predicted)[observed]
    total_sq_error = (abs(residual) ** 2).sum()
    return np.sqrt(total_sq_error / observed.sum())

In [None]:
def split_train_test(data, percent_test=0.1, seed=None):
    """Randomly move a fraction of the observed entries into a test matrix.

    Parameters
    ----------
    data : 2-D numpy float array
        Matrix with NaN marking missing entries. It is not modified.
    percent_test : float
        Fraction (between 0 and 1) of the observed entries to hold out.
    seed : int or None
        When given, the split is drawn from a dedicated
        ``np.random.RandomState(seed)`` and is therefore reproducible.
        When None (the default) the global NumPy random state is used.

    Returns
    -------
    (train, test) : two arrays with the shape of ``data``. Held-out entries
    are NaN in ``train`` and carry their value in ``test``; every other entry
    of ``test`` is NaN.

    Raises
    ------
    ValueError
        If ``percent_test`` is outside ``[0, 1]``.
    """
    if not 0 <= percent_test <= 1:
        raise ValueError(
            'percent_test must be between 0 and 1, got %r' % (percent_test,))

    train = data.copy()
    test = np.full(data.shape, np.nan)

    # Coordinates of every observed (non-NaN) entry.
    rows, cols = np.where(~np.isnan(train))
    n_observed = len(rows)

    test_size = int(n_observed * percent_test)
    train_size = n_observed - test_size

    rng = np.random if seed is None else np.random.RandomState(seed)
    picked = rng.choice(np.arange(n_observed), replace=False, size=test_size)

    # Transfer the selected entries to the test set in one vectorized step,
    # then blank them out of the training set.
    held_rows, held_cols = rows[picked], cols[picked]
    test[held_rows, held_cols] = train[held_rows, held_cols]
    train[held_rows, held_cols] = np.nan

    # Verify everything worked properly
    assert train_size == np.count_nonzero(~np.isnan(train))
    assert test_size == np.count_nonzero(~np.isnan(test))

    # Return train set and test set
    return train, test

# Hold out the default 10% of the observed entries as the test set.
# NOTE(review): `dense_data` is not defined in the cells visible here -- confirm
# it is created earlier in the notebook, otherwise this fails on a fresh kernel.
train, test = split_train_test(dense_data)

In [None]:
# Model hyperparameters: ALPHA is passed as PMF's `alpha` argument and DIM as
# its `dim` (latent dimension) argument.
ALPHA = 2
DIM = 10
# Build the model on the training split; std=0.05 overrides the constructor
# default of 0.01.
pmf = PMF(train, DIM, ALPHA, std=0.05)

In [None]:
# Compute the MAP estimate; progress and elapsed time are logged at INFO level.
pmf.find_map()

In [None]:
def eval_map(pmf_model, train, test):
    """Score the MAP estimate on the train and test sets and print a report.

    Predictions are produced by ``pmf_model.predict`` from the ``U`` and ``V``
    entries of ``pmf_model.map``. The train RMSE, test RMSE and their
    difference are printed; the test RMSE is returned.
    """
    U, V = (pmf_model.map[key] for key in ('U', 'V'))

    # One prediction matrix is scored against both data sets.
    predicted = pmf_model.predict(U, V)
    scores = {'train': rmse(train, predicted), 'test': rmse(test, predicted)}
    gap = scores['test'] - scores['train']

    report = (
        ('PMF MAP training RMSE: %.5f', scores['train']),
        ('PMF MAP testing RMSE:  %.5f', scores['test']),
        ('Train/test difference: %.5f', gap),
    )
    for template, value in report:
        print(template % value)

    return scores['test']


# Add eval function to PMF class.
# Attach the evaluation helper to the PMF class as the `eval_map` method.
PMF.eval_map = eval_map

In [None]:
# Evaluate the MAP estimate and compare its test RMSE against the 'mom' baseline.
# NOTE(review): `baselines` is not defined in the cells visible here; 'mom' is
# presumably the mean-of-means baseline RMSE -- confirm it is computed earlier.
pmf_map_rmse = pmf.eval_map(train, test)
pmf_improvement = baselines['mom'] - pmf_map_rmse
print('PMF MAP Improvement:   %.5f' % pmf_improvement)

In [None]:
# Sample from the model; draws/tune are forwarded to pm.sample, and
# draw_samples defaults `chains` to 1. The result is stored on pmf.trace.
pmf.draw_samples(draws=500, tune=500)

In [None]:
def _norms(pmf_model, monitor=('U', 'V'), ord='fro'):
    """Return norms of latent variables at each step in the
    sample trace. These can be used to monitor convergence
    of the sampler.
    """
    monitor = ('U', 'V')
    norms = {var: [] for var in monitor}
    for sample in pmf_model.trace:
        for var in monitor:
            norms[var].append(np.linalg.norm(sample[var], ord))
    return norms


def _traceplot(pmf_model):
    """Plot Frobenius norms of U and V as a function of sample #.

    The norms come from ``pmf_model.norms()`` and are drawn as two
    side-by-side line plots (U on the left, V on the right).
    """
    trace_norms = pmf_model.norms()
    u_series = pd.Series(trace_norms['U'])
    v_series = pd.Series(trace_norms['V'])
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
    # Raw strings keep the LaTeX backslashes from being read as (invalid)
    # escape sequences. The plotted values are unsquared norms, so the titles
    # carry no square.
    u_series.plot(kind='line', ax=ax1, grid=False,
                  title=r"$\|U\|_{Fro}$ at Each Sample")
    v_series.plot(kind='line', ax=ax2, grid=False,
                  title=r"$\|V\|_{Fro}$ at Each Sample")
    ax1.set_xlabel("Sample Number")
    ax2.set_xlabel("Sample Number")


# Attach the convergence-monitoring helpers to the PMF class as the `norms`
# and `traceplot` methods.
PMF.norms = _norms
PMF.traceplot = _traceplot

In [None]:
# Plot the norms of U and V across the sampled trace.
pmf.traceplot()

In [None]:
def _running_rmse(pmf_model, test_data, train_data, burn_in=0, plot=True):
    """Calculate RMSE for each step of the trace to monitor convergence.
    """
    burn_in = burn_in if len(pmf_model.trace) >= burn_in else 0
    results = {'per-step-train': [], 'running-train': [],
               'per-step-test': [], 'running-test': []}
    R = np.zeros(test_data.shape)
    for cnt, sample in enumerate(pmf_model.trace[burn_in:]):
        sample_R = pmf_model.predict(sample['U'], sample['V'])
        R += sample_R
        running_R = R / (cnt + 1)
        results['per-step-train'].append(rmse(train_data, sample_R))
        results['running-train'].append(rmse(train_data, running_R))
        results['per-step-test'].append(rmse(test_data, sample_R))
        results['running-test'].append(rmse(test_data, running_R))

    results = pd.DataFrame(results)

    if plot:
        results.plot(
            kind='line', grid=False, figsize=(15, 7),
            title='Per-step and Running RMSE From Posterior Predictive')

    # Return the final predictions, and the RMSE calculations
    return running_R, results


# Attach the running-RMSE helper to the PMF class as the `running_rmse` method.
PMF.running_rmse = _running_rmse

In [None]:
# Score every sample of the trace. Note the argument order: running_rmse takes
# the test set first, then the training set.
predicted, results = pmf.running_rmse(test, train)

In [None]:
# Final report: the last entry of each running RMSE series is the RMSE of the
# prediction averaged over all samples that were scored.
final_test_rmse = results['running-test'].values[-1]
final_train_rmse = results['running-train'].values[-1]
print('Posterior predictive train RMSE: %.5f' % final_train_rmse)
print('Posterior predictive test RMSE:  %.5f' % final_test_rmse)
print('Train/test difference:           %.5f' % (final_test_rmse - final_train_rmse))
# Positive values below mean the posterior predictive beats the MAP estimate /
# the 'mom' baseline on the test set.
# NOTE(review): `baselines` is not defined in the cells visible here -- confirm.
print('Improvement from MAP:            %.5f' % (pmf_map_rmse - final_test_rmse))
print('Improvement from Mean of Means:  %.5f' % (baselines['mom'] - final_test_rmse))