In [2]:
import numpy as np 
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
import scipy.sparse as sparse

RANDOM_STATE = 17

### Utils

In [12]:
def submit(data, name):
    """Write predictions to a CSV submission file.

    Args:
        data: sequence of predicted scores, one per test row, in test order.
        name: path of the CSV file to create.

    The file has two columns: ``Id`` (1-based row number) and ``Score``
    (the corresponding value from ``data``); the pandas index is not written.
    """
    row_ids = range(1, len(data) + 1)
    submission = pd.DataFrame({'Id': row_ids, 'Score': data})
    submission.to_csv(name, index=False)
    
def shuffle_data(X, y, seed=None):
    """Shuffle the rows of ``X`` and ``y`` with one shared random permutation.

    Args:
        X: array indexable by an integer index array along its first axis.
        y: array with the same first-axis length as ``X``.
        seed: optional seed for NumPy's global RNG. Any value other than
            ``None`` is applied, including ``0``.

    Returns:
        ``(X_shuffled, y_shuffled)`` with rows kept aligned.
    """
    # `is not None` rather than truthiness: seed=0 is a legitimate seed and
    # must not be silently treated as "unseeded".
    if seed is not None:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

def set_scope(y_pred, low=1, high=5):
    """Clamp predictions to the valid rating range, in place.

    Args:
        y_pred: NumPy array of predictions; it is modified in place.
        low: smallest allowed value (default 1, the lowest rating).
        high: largest allowed value (default 5, the highest rating).

    Returns:
        The same ``y_pred`` object, with values above ``high`` set to ``high``
        and values below ``low`` set to ``low``.

    Raises:
        ValueError: if ``low`` is greater than ``high``.
    """
    if low > high:
        raise ValueError("low must not be greater than high")
    y_pred[y_pred > high] = high
    y_pred[y_pred < low] = low
    return y_pred

def train_split(user_item, test_size=0.3, random_state=None):
    """Split the known ratings of a user x item matrix into train and test matrices.

    Args:
        user_item: 2-D NumPy array of ratings; cells with a value > 0 are
            treated as known ratings.
        test_size: fraction of the known ratings that goes to the test matrix.
        random_state: optional seed for NumPy's global RNG.

    Returns:
        ``(X_train, X_test)``: two float arrays shaped like ``user_item``.
        Every known rating appears in exactly one of them; all other cells are 0.
    """
    if random_state is not None:
        np.random.seed(random_state)

    # Coordinates (row, col) of every known rating, shuffled and then cut.
    rated_cells = np.argwhere(user_item > 0)
    np.random.shuffle(rated_cells)
    n_test = int(len(rated_cells) * test_size)

    # First `n_test` shuffled cells form the test part, the rest the train part.
    parts = []
    for cells in (rated_cells[n_test:], rated_cells[:n_test]):
        part = np.zeros(user_item.shape)
        rows, cols = cells[:, 0], cells[:, 1]
        part[rows, cols] = user_item[rows, cols]
        parts.append(part)

    X_train, X_test = parts
    return X_train, X_test

def liner_regression(X_train, y_train, X_test):
    """Fit a Ridge regression on one-hot encoded features and predict for ``X_test``.

    Args:
        X_train: training feature rows (in this notebook: ``(user, item)`` id pairs).
        y_train: training targets, one per row of ``X_train``.
        X_test: feature rows to predict for, with the same columns as ``X_train``.

    Returns:
        Array of predictions, one per row of ``X_test``.
    """
    # The encoder is fitted on the training rows only; handle_unknown='ignore'
    # lets transform() accept values that did not occur during fitting.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(X_train)
    train_encoded, test_encoded = (
        encoder.transform(rows) for rows in (X_train, X_test)
    )

    model = Ridge(alpha=3, random_state=RANDOM_STATE)
    return model.fit(train_encoded, y_train).predict(test_encoded)

In [4]:
# Tab-separated rating files; column names are supplied here via `names`.
# train: one (user, item, score) triple per row.
train = pd.read_csv('./data/train.txt', names=['user', 'item', 'score'], sep='\t')
# test: (user, item) pairs only — the score is what gets predicted and submitted.
test = pd.read_csv('./data/test.txt', names=['user', 'item'], sep='\t')

In [5]:
# Quick look at the first rows to confirm the three columns were parsed as expected.
train.head()

Unnamed: 0,user,item,score
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [6]:
# Dense user x item rating matrix; 0 marks "no rating".
#
# pivot_table only creates rows/columns for ids that actually occur in `train`.
# The submission cell looks predictions up by position `id - 1`, so the frame is
# reindexed to the full 1..max id range (taken over train and test): an id that
# never occurs in train then becomes an all-zero row/column instead of shifting
# every later position or causing an IndexError.
# NOTE(review): assumes user and item ids are 1-based integers — confirm against the data files.
n_users = int(max(train['user'].max(), test['user'].max()))
n_items = int(max(train['item'].max(), test['item'].max()))
martix_train = (
    train.pivot_table(columns='item', index='user', values='score')
    .reindex(index=range(1, n_users + 1), columns=range(1, n_items + 1))
    .fillna(0)
    .values
)
martix_train

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

### IALS

public MSE 0.90865

In [7]:
# https://www.researchgate.net/publication/220788980_Large-Scale_Parallel_Collaborative_Filtering_for_the_Netflix_Prize
# https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf
# http://yifanhu.net/PUB/cf.pdf
# https://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf
# https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
# http://activisiongamescience.github.io/2016/01/11/Implicit-Recommender-Systems-Biased-Matrix-Factorization/
# https://arxiv.org/pdf/1809.00979.pdf
# https://www.ethanrosenthal.com/2016/01/09/explicit-matrix-factorization-sgd-als/

In [8]:
class MY_IALS:
    """Confidence-weighted ALS matrix factorisation with user and item biases.

    ``fit`` works on a dense user x item array in which 0 means "no rating":

    1. Missing cells are filled with Ridge predictions (``liner_regression``).
    2. Every cell gets a confidence ``1 + alpha * log(1 + r / 0.1)`` where ``r``
       is the observed rating, so filled-in cells (r = 0) have confidence 1.
    3. User and item factors are re-solved alternately. The regularisation of
       each user/item is ``lambda_reg`` times its number of observed ratings.

    Args:
        iterations: number of alternating user/item sweeps (must be >= 1).
        latent_features: number of latent factors (a bias column is added on top).
        alpha: scale of the confidence given to observed ratings.
        lambda_reg: base regularisation strength.
        verbose: when 1 and a test matrix is passed to ``fit``, the hold-out
            RMSE is printed after every sweep.
        random_state: seed applied to NumPy's global RNG at construction time.
    """

    def __init__(self, iterations=10, latent_features=5, alpha=25, lambda_reg=8, verbose=1,
                 random_state=17):
        self.iterations = iterations
        self.latent_features = latent_features
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.verbose = verbose
        self.random_state = random_state
        # The global RNG is seeded here, not in fit(): the random factor
        # initialisation is reproducible when fit() directly follows construction.
        np.random.seed(random_state)

    def _solve_factors(self, fixed, conf_excess, weighted_target, n_ratings):
        """Solve one half-step of ALS: new factors for every row entity.

        Args:
            fixed: factor matrix of the *other* side, shape (m, k).
            conf_excess: ``C - 1`` with one row per entity being solved, shape (n, m).
            weighted_target: ``C * target`` with the same layout, shape (n, m).
            n_ratings: number of observed ratings per entity, length n.

        Returns:
            Array of shape (n, k) with the solved factors.
        """
        n_factors = fixed.shape[1]
        gram = np.dot(fixed.T, fixed)
        # Dense identity keeps every system a plain ndarray (adding a scipy
        # sparse matrix to an ndarray yields np.matrix).
        reg = self.lambda_reg * np.eye(n_factors)

        solved = np.empty((conf_excess.shape[0], n_factors))
        for k in range(conf_excess.shape[0]):
            # (F^T F + F^T (C_k - I) F + lambda * n_k * I) x = F^T C_k p_k
            lhs = gram + np.dot(fixed.T * conf_excess[k], fixed) + reg * n_ratings[k]
            rhs = np.dot(fixed.T, weighted_target[k])
            # Solve the linear system directly instead of forming the inverse.
            solved[k] = np.linalg.solve(lhs, rhs)
        return solved

    def fit(self, train, test=None):
        """Factorise ``train`` and return the dense matrix of predicted ratings.

        Args:
            train: 2-D NumPy array (users x items); values > 0 are observed
                ratings, 0 means unknown.
            test: optional array of the same shape holding held-out ratings
                (> 0); used only for the per-iteration RMSE printout.

        Returns:
            Array shaped like ``train`` with predictions clipped by ``set_scope``.

        Raises:
            ValueError: if ``iterations`` is smaller than 1 (there would be no
                prediction to return).
        """
        if self.iterations < 1:
            raise ValueError("iterations must be >= 1")

        # Number of users and items.
        user_size, item_size = train.shape

        # 1) Initialise the unknown cells of train with Ridge predictions
        #    fitted on the (row, col) coordinates of the observed ratings.
        real_score = train > 0
        init_score = train == 0
        matrix_full = train.astype(float)
        has_score = np.argwhere(real_score)
        need_init = np.argwhere(init_score)
        if len(need_init):
            # argwhere and boolean-mask assignment both walk the array in
            # row-major order, so predictions land in their own cells.
            matrix_full[init_score] = liner_regression(has_score, train[real_score], need_init)

        # 2) Confidence C = 1 + alpha * log(1 + r / eps): observed ratings get a
        #    high confidence, initialised cells (r = 0) get confidence 1.
        eps = 0.1
        C = 1 + self.alpha * np.log(1 + train / eps)
        C_I = C - 1

        # 3) Random factor matrices; column 0 is fixed to 1 so that the matching
        #    coefficient on the other side acts as a bias term.
        # user: u x (1 + f)
        X = np.hstack([np.ones((user_size, 1)), np.random.normal(size=(user_size, self.latent_features))])
        # item: i x (1 + f)
        Y = np.hstack([np.ones((item_size, 1)), np.random.normal(size=(item_size, self.latent_features))])

        # 4) Item biases start at zero; user biases are produced by the first user step.
        Y_bias = np.zeros(item_size)

        # Number of observed ratings per user (rows) and per item (columns);
        # used to scale each entity's regularisation.
        n_user = real_score.sum(1)
        n_item = real_score.sum(0)

        for iteration in range(self.iterations):
            # User step (rows): item factors fixed, item bias removed from the target.
            X = self._solve_factors(Y, C_I, C * (matrix_full - Y_bias), n_user)
            X_bias = X[:, 0].copy().reshape(-1, 1)
            X[:, 0] = 1

            # Item step (columns): user factors fixed, user bias removed from the target.
            Y = self._solve_factors(X, C_I.T, (C * (matrix_full - X_bias)).T, n_item)
            Y_bias = Y[:, 0].copy()
            Y[:, 0] = 1

            result = np.dot(X[:, 1:], Y[:, 1:].T) + X_bias + Y_bias
            result = set_scope(result)

            if test is not None and self.verbose == 1:
                held_out = test > 0
                # Root of the mean squared error over the held-out ratings only.
                rmse = np.sqrt(((result * held_out - test) ** 2).sum() / held_out.sum())
                print(str(iteration) + " - " + str(rmse))

        return result

In [10]:
# Hold out 10% of the known ratings and watch the error per ALS iteration.
# The printed value is a root-mean-squared error (MY_IALS.fit takes np.sqrt of the mean squared error).
# NOTE: X_train here is a rating matrix; the same name is rebound to a feature
# array in the linear-regression section further down.
X_train, X_test = train_split(martix_train, test_size = 0.1, random_state=RANDOM_STATE)
result = MY_IALS().fit(X_train, X_test)

0 - 0.9483852905567737
1 - 0.9283910516008547
2 - 0.9168814917902718
3 - 0.9116646082299161
4 - 0.9083196920470347
5 - 0.9059969043931989
6 - 0.9043352800160034
7 - 0.9031268241132095
8 - 0.9022903926008039
9 - 0.9017922983483267


In [11]:
# Refit on all known ratings (no hold-out) to produce the matrix used for the submission.
result = MY_IALS().fit(martix_train)

In [12]:
# Look up the prediction for every (user, item) pair of the test set.
# Ids are shifted by one to get the row/column position in `result`.
y_test_pred = [
    result[user_id - 1][item_id - 1]
    for user_id, item_id in test.values
]

submit(y_test_pred, 'predict5.txt')

### Linear Regression

public MSE 0.94785

In [8]:
# Features are the first two columns of train (user, item); the target is the third (score).
train_df = train.copy()
X = train_df.values[:, :2]
y = train_df.values[:, 2]
# Shuffle rows once with the notebook-wide seed; features and targets stay aligned.
X_all, y_all = shuffle_data(X, y, RANDOM_STATE)

In [13]:
# 70/30 hold-out validation of the one-hot + Ridge model.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at the top of
# this notebook; that module was removed in scikit-learn 0.20 — on current versions the
# import has to come from sklearn.model_selection.
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.3, random_state=RANDOM_STATE)
y_valid_pred = liner_regression(X_train, y_train, X_valid)
# Predictions are scored as-is here (not clipped with set_scope). The arguments are passed
# as (predictions, targets); MSE is symmetric, so the value is unaffected.
mean_squared_error(y_valid_pred, y_valid)

0.8816631217520343

In [15]:
# Constant baseline: predict 2.5 for every validation row.
# NOTE(review): despite the name, 2.5 is a fixed constant, not a mean computed from the
# training scores — confirm this is the intended baseline.
y_pred_mean = [2.5] * y_valid.shape[0]
mean_squared_error(y_pred_mean, y_valid)

2.3253744801442715

In [16]:
# test
# Refit on all shuffled training rows and predict the (user, item) pairs of the test set,
# then clamp the predictions to the valid rating range.
y_test_pred = liner_regression(X_all, y_all, test.values)
y_test_pred = set_scope(y_test_pred)

In [15]:
# Write the Ridge predictions as a submission file.
submit(y_test_pred, 'predict1.txt')

### fastFM

public MSE 0.91155

In [15]:
from fastFM.mcmc import FMRegression

In [17]:
# One-hot encode the (user, item) features for the factorization machine.
# These matrices previously existed only as locals inside liner_regression, so this cell
# raised NameError on a fresh kernel. They are built here the same way liner_regression
# builds them: encoder fitted on the training rows only.
# NOTE(review): presumably this matches the encoding used for the recorded score — confirm.
encoder_fm = OneHotEncoder(handle_unknown='ignore').fit(X_train)
X_train_tr = encoder_fm.transform(X_train)
X_valid_tr = encoder_fm.transform(X_valid)

clf_fm = FMRegression(rank=3, n_iter=1000, random_state=RANDOM_STATE)
y_valid_pred_fm = clf_fm.fit_predict(X_train_tr, y_train, X_valid_tr)

In [18]:
# Validation MSE of the factorization machine on the same hold-out rows as the Ridge model.
mean_squared_error(y_valid_pred_fm, y_valid)

0.8212921715442131

In [27]:
# test
clf_fm = FMRegression(rank=3, n_iter=1000, random_state=RANDOM_STATE)
y_valid_pred_fm = clf_fm.fit_predict(X_all_tr, y_all, X_test_tr)
y_valid_pred_fm = set_scope(y_valid_pred_fm)

In [28]:
# Write the factorization-machine test predictions as a submission file.
submit(y_valid_pred_fm, 'predict2.txt')