# Baseline interpolation

In [1]:
# libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever spelling this installation has.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

In [2]:
# Long-format ratings table: tab-separated, no header row in the file.
data = pd.read_csv(
    '../data/movie.data',
    sep='\t',
    names=["userid", "itemid", "rating", "timestamp"],
)

# Column names for the '|'-separated movie metadata file.
movie_columns = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv(
    '../data/u.item',
    sep='|',
    names=movie_columns,
    encoding='latin-1',
    index_col="movie id",
    parse_dates=['release date'],
)
ratings = data['rating']  # movie ratings

## Methods

추천 알고리즘의 성능을 평가하기 위한 베이스라인들을 정리. 



### 1.Uniform Random
모든 결측치를 주어진 데이터의 최소, 최대값 사이에서 random하게 뽑은 값으로 보간

$$
R_{ij}  \sim Uniform(1,5)
$$

### 2. Global Mean
관측된 데이터들의 전체 평균 평점으로 결측치를 보간

$$
Global \space mean = \frac {\sum^n_i \sum^m_j I_{ij}R_{ij}} {\sum^n_i \sum^m_j I_{ij}} \\
R_{ij} = Global \space mean
$$

여기서 $I_{ij}$는 유저 $i$가 영화 $j$를 평가했으면 1, 아니면 0인 지시 변수이다.

### 3. Mean of Means 
유저에 따라서 평점이 전반적으로 높을수도 낮을수도 있음 : 개인적인 특징   
영화에 따라서 평점이 다를 것. 좋은영화라면 전반적으로 높은 평점을 그렇지 않다면 낮은평점이 책정될 것 : 영화의 특징  
유저와 영화의 특징과 전체 평점을 평균하여 결측치를 보간

$$
user \space mean_i = \frac {\sum^m_j I_{ij}R_{ij}} {\sum^m_j I_{ij}} \\
movie \space mean_j = \frac {\sum^n_i I_{ij}R_{ij}} {\sum^n_i I_{ij}} \\
R_{ij} = \frac {1} {3} ( user \space mean_i + movie \space mean_j + Global \space mean)
$$

In [109]:
class Baseline():
    """Base class for heuristic baselines that fill in a rating matrix.

    Subclasses implement ``predict``, which must store the completed matrix
    in ``self.predicted``; ``rmse`` then scores that matrix against held-out
    data.
    """

    def __init__(self, train_data):
        """Fit the baseline immediately by calling ``predict``.

        ``predict`` receives a copy of ``train_data`` so that subclasses may
        fill it in place without touching the caller's array.
        """
        self.predict(train_data.copy())

    def predict(self, train_data):
        """Fill in ``train_data`` and store the result in ``self.predicted``.

        :raises NotImplementedError: always, in the base class.
        """
        raise NotImplementedError(
            'baseline prediction not implemented for base class')

    def rmse(self, test_data):
        """Calculate root mean squared error for predictions on test data.

        Only cells of ``test_data`` that are not NaN are scored.

        :param test_data: array with the same shape as ``self.predicted``;
            NaN marks cells that are not part of the test set.
        :returns: the RMSE, or NaN when ``test_data`` has no observed cell.
        """
        observed = ~np.isnan(test_data)   # indicator for NON-missing values
        n_observed = observed.sum()       # number of non-missing values
        if n_observed == 0:
            # Nothing to score: return NaN explicitly instead of dividing 0 by 0.
            return np.nan
        sqerror = (test_data - self.predicted) ** 2
        mse = sqerror[observed].sum() / n_observed
        return np.sqrt(mse)

    @staticmethod
    def _split_title(title):
        """Change "BaselineMethod" to "Baseline Method".

        A space is inserted before every uppercase character except the first.
        """
        return title[:1] + ''.join(
            ' ' + char if char.isupper() else char for char in title[1:])

    def __str__(self):
        # Self-contained so printing a baseline does not depend on a
        # module-level helper having been defined earlier in the session.
        return self._split_title(self.__class__.__name__)


# Implement the 3 baselines.

class UniformRandomBaseline(Baseline):
    """Fill missing values with uniform random draws.

    Draws come from ``np.random.uniform`` between the smallest and largest
    observed (non-NaN) value of the training matrix.
    """

    def predict(self, train_data):
        """Fill the NaN cells of ``train_data`` in place and keep it as ``self.predicted``."""
        missing = np.isnan(train_data)
        # Mask the NaNs so they do not take part in the min/max.
        observed = np.ma.array(train_data, mask=missing)
        low = observed.min()
        high = observed.max()
        draws = np.random.uniform(low, high, missing.sum())
        train_data[missing] = draws
        self.predicted = train_data


class GlobalMeanBaseline(Baseline):
    """Fill missing values with the mean of all observed values."""

    def predict(self, train_data):
        """Fill the NaN cells of ``train_data`` in place and keep it as ``self.predicted``."""
        missing = np.isnan(train_data)
        observed_values = train_data[np.logical_not(missing)]
        train_data[missing] = observed_values.mean()
        self.predicted = train_data


class MeanOfMeansBaseline(Baseline):
    """Fill missing values with the mean of the user, movie and global means."""

    def predict(self, train_data):
        """Store in ``self.predicted`` a copy of ``train_data`` with NaNs filled.

        Each missing cell ``(i, j)`` becomes the average of the global mean,
        the mean of row ``i`` and the mean of column ``j``, all computed over
        observed (non-NaN) cells only. A row or column without any observed
        value has no mean, so it is left out of the average for its cells.
        Observed cells keep their original value.

        :param train_data: 2-D array with NaN marking missing cells. It is
            not modified.
        """
        nan_mask = np.isnan(train_data)
        observed = ~nan_mask
        n_observed = observed.sum()

        # Zero out the missing cells so plain sums only see observed values.
        values = np.where(observed, train_data, 0.0)
        # With nothing observed there is no information: fill with NaN.
        global_mean = values.sum() / n_observed if n_observed else np.nan

        user_counts = observed.sum(axis=1)
        item_counts = observed.sum(axis=0)
        has_user_mean = user_counts > 0
        has_item_mean = item_counts > 0
        # Empty rows/columns have a zero sum; dividing by max(count, 1) makes
        # their term 0, and n_terms below leaves them out of the denominator.
        user_means = values.sum(axis=1) / np.maximum(user_counts, 1)
        item_means = values.sum(axis=0) / np.maximum(item_counts, 1)

        # Number of means actually available for each cell (1, 2 or 3).
        n_terms = (1
                   + has_user_mean[:, np.newaxis].astype(int)
                   + has_item_mean[np.newaxis, :].astype(int))
        estimate = (global_mean
                    + user_means[:, np.newaxis]
                    + item_means[np.newaxis, :]) / n_terms

        predicted = train_data.copy()
        predicted[nan_mask] = estimate[nan_mask]
        self.predicted = predicted


# Short key -> baseline class, in the order the baselines are evaluated.
baseline_methods = {
    'ur': UniformRandomBaseline,
    'gm': GlobalMeanBaseline,
    'mom': MeanOfMeansBaseline,
}

In [90]:
def split_train_test(data, percent_test=0.1, random_state=None):
    """Split a dense rating matrix into train/test matrices of the same shape.

    A random subset of the observed (non-NaN) cells is moved out of the
    training matrix and into the test matrix; every other test cell is NaN.

    :param data: 2-D array-like of ratings, NaN marking missing cells. It is
        copied, never modified.
    :param float percent_test: Fraction (between 0 and 1) of the observed
        cells to hold out for testing. Default 0.1.
    :param random_state: ``None`` (default) draws from the global
        ``np.random`` state, as before; an object with a ``choice`` method is
        used as-is; anything else is passed to ``np.random.RandomState`` as a
        seed, which makes the split reproducible.
    :returns: ``(train, test)`` float ndarrays with the shape of ``data``.
    :raises ValueError: if ``percent_test`` is outside [0, 1] or ``data`` is
        not two-dimensional.
    """
    if not 0 <= percent_test <= 1:
        raise ValueError(
            'percent_test must be a fraction between 0 and 1, got %r'
            % (percent_test,))

    # Always work on a fresh float copy: NaN cannot be stored in integer arrays.
    train = np.array(data, dtype=float)
    if train.ndim != 2:
        raise ValueError('data must be two-dimensional')
    test = np.full(train.shape, np.nan)

    # Only observed cells are candidates for the test set.
    rows, cols = np.nonzero(~np.isnan(train))
    n_observed = rows.size
    test_size = int(n_observed * percent_test)
    train_size = n_observed - test_size

    if n_observed == 0:
        return train, test

    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    sample = rng.choice(np.arange(n_observed), replace=False, size=test_size)

    # Move the sampled cells from train to test in one vectorized step.
    held_rows, held_cols = rows[sample], cols[sample]
    test[held_rows, held_cols] = train[held_rows, held_cols]
    train[held_rows, held_cols] = np.nan

    # Verify that the split has the expected sizes.
    assert train_size == np.count_nonzero(~np.isnan(train))
    assert test_size == np.count_nonzero(~np.isnan(test))

    return train, test

# Build the dense user x movie rating matrix (NaN = not rated) from the
# long-format ratings table, so this cell also runs on a fresh kernel.
train_data = data.pivot(index='userid', columns='itemid', values='rating').values.astype(float)
train, test = split_train_test(train_data)

In [110]:
# Fit every registered baseline on the training split and record its test RMSE.
baselines = {}
for name, Method in baseline_methods.items():
    method = Method(train)
    score = method.rmse(test)
    baselines[name] = score
    print('%s RMSE:\t%.5f' % (method, score))

Uniform Random Baseline RMSE:	1.70544
Global Mean Baseline RMSE:	1.12748
Mean Of Means Baseline RMSE:	1.01826


## Reference
https://docs.pymc.io/notebooks/probabilistic_matrix_factorization.html