# Практика по матричным разложениям

In [None]:
import os
import sys

In [None]:
# Add the directory two levels up to the import search path
# (presumably the project root, so the `src` package imported below resolves — confirm).
sys.path.append(os.path.join('..', '..'))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import warnings
import zipfile

from matplotlib import rcParams
from sklearn.decomposition import TruncatedSVD, NMF

from src.utils.common import get_data_folder, timeit

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plot defaults: font size and figure size.
rcParams['font.size'] = 14
rcParams['figure.figsize'] = 7, 6

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Print NumPy floats with two decimal places.
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

# Seed NumPy's global random generator with a fixed value; SEED is also
# passed as `random_state` to the scikit-learn models further down.
SEED = 3141952
np.random.seed(SEED)

### Handling the data

In [None]:
# Locate the MovieLens archive inside the folder returned by get_data_folder().
data_folder_path = get_data_folder()
path_to_ml_archive = os.path.join(data_folder_path, 'ml-1m.zip')

# Unpack only when the 'ml-1m' directory is not there yet, so re-running the
# cell does not extract the archive again.
if os.path.exists(os.path.join(data_folder_path, 'ml-1m')):
    pass
else:
    with zipfile.ZipFile(path_to_ml_archive, mode='r') as archive:
        archive.extractall(path=data_folder_path)

In [None]:
# The MovieLens .dat files separate fields with '::'. pandas treats a separator
# longer than one character as a regular expression, which only the Python
# parsing engine supports, so the engine is requested explicitly instead of
# relying on the automatic fallback (and its ParserWarning).
df_ratings = pd.read_csv(
    os.path.join(data_folder_path, 'ml-1m', 'ratings.dat'),
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
).drop(columns='timestamp')  # the timestamp column is dropped right after loading

# NOTE(review): movies.dat in MovieLens-1M contains accented titles that are
# not valid UTF-8 (the file is ISO-8859-1 / latin-1 as far as known); decoding
# with the default encoding can raise UnicodeDecodeError. Confirm against the
# dataset if a different dump is used.
df_movie = pd.read_csv(
    os.path.join(data_folder_path, 'ml-1m', 'movies.dat'),
    sep='::',
    engine='python',
    encoding='latin-1',
    header=None,
    names=['movie_id', 'name', 'category'],
)

In [None]:
# Preview the first 10 rows of the ratings table.
df_ratings.head(10)

In [None]:
# (number of rows, number of columns) of the ratings table.
df_ratings.shape

In [None]:
# Number of distinct users and distinct movies present in the ratings table.
tuple(len(df_ratings[column].unique()) for column in ('user_id', 'movie_id'))

In [None]:
# Preview the first 10 rows of the movies table.
df_movie.head(10)

### Preprocessing

In [None]:
# Work on a small corner of the data: users with id < 500 and movies with id < 100.
df_ratings_sample = df_ratings.loc[
    (df_ratings['user_id'] < 500) & (df_ratings['movie_id'] < 100)
]

users, movies = df_ratings_sample['user_id'], df_ratings_sample['movie_id']

# Build the user-item matrix from (value, (row, col)) triplets. The raw ids are
# used directly as row/column indices and no explicit shape is given, so the
# matrix shape is inferred from the indices.
user_item_csr = sp.csr_matrix(
    (df_ratings_sample['rating'], (users, movies))
)

In [None]:
# Show the sparse matrix summary.
user_item_csr

In [None]:
# Value stored at row 1, column 1; rows/columns are indexed by the raw
# user_id/movie_id used when the matrix was built.
user_item_csr[1, 1]

### Алгоритмы

#### SVD

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/decomposition/_truncated_svd.py#L25

In [None]:
# Rank-16 truncated SVD of the sparse user-item matrix.
svd = TruncatedSVD(n_components=16, random_state=SEED)

# fit_transform returns the data projected onto the components (named US here);
# components_ holds the matching right-hand factor (named VT here).
US = svd.fit_transform(user_item_csr)
VT = svd.components_

# Low-rank reconstruction of the full user-item matrix.
user_item_svd = US @ VT

In [None]:
# Top-left 10x10 corner of the original ratings, as a dense array.
user_item_csr[:10, :10].toarray()

In [None]:
# Same 10x10 corner of the SVD reconstruction, for side-by-side comparison.
user_item_svd[:10, :10]

In [None]:
# Element-wise difference between the original ratings and the SVD reconstruction,
# summarised by its smallest, average and largest value.
diff = user_item_csr - user_item_svd
diff.min(), diff.mean(), diff.max()

#### SGD

In [None]:
class SGDMF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]`` where ``b``
    is the mean of the observed entries, ``b_u`` / ``b_i`` are per-user and
    per-item biases and ``P`` / ``Q`` are the latent factor matrices.
    Entries of ``X`` equal to zero are treated as missing.

    Adapted from https://github.com/albertauyeung/matrix-factorization-in-python
    """

    def __init__(self, X, k, lr, lmbda, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - X (ndarray)      : user-item rating matrix
        - k (int)          : number of latent dimensions
        - lr (float)       : learning rate
        - lmbda (float)    : regularization parameter
        - iterations(int)  : number of sgd iterations
        """

        self.X = X
        self.num_users, self.num_items = X.shape
        self.k = k
        self.lr = lr
        self.lmbda = lmbda
        self.iterations = iterations

    def train(self):
        """
        Initialise the parameters and run `iterations` epochs of SGD.

        Returns
        - list of (epoch_index, error) tuples, where error is the value
          of `mse()` measured after that epoch
        """
        # Latent factors start as small Gaussian noise with std 1 / k.
        self.P = np.random.normal(scale=1. / self.k, size=(self.num_users, self.k))
        self.Q = np.random.normal(scale=1. / self.k, size=(self.num_items, self.k))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)

        # Global bias is the mean of the observed (non-zero) entries; fall back
        # to 0 when nothing is observed instead of producing NaN.
        observed_values = self.X[self.X != 0]
        self.b = np.mean(observed_values) if observed_values.size else 0.0

        # One training sample per positive entry, in row-major order.
        rows, cols = np.nonzero(self.X > 0)
        self.samples = [
            (i, j, self.X[i, j])
            for i, j in zip(rows.tolist(), cols.tolist())
        ]

        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i + 1) % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (i + 1, mse))

        return training_process

    def mse(self):
        """
        Mean squared error between X and the current prediction, computed
        over the non-zero entries of X only.

        Returns
        - float : the mean squared error, or 0.0 when X has no non-zero entry
        """
        observed = self.X != 0
        if not np.any(observed):
            return 0.0
        residuals = self.X[observed] - self.full_matrix()[observed]
        return float(np.mean(residuals ** 2))

    def sgd(self):
        """
        Run one pass over `self.samples`, updating biases and latent factors
        in place after every sample.
        """
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.lr * (e - self.lmbda * self.b_u[i])
            self.b_i[j] += self.lr * (e - self.lmbda * self.b_i[j])

            # The Q update must use the value of P[i] from before this step.
            # Slicing a NumPy array yields a view, so an explicit copy is needed.
            P_i = self.P[i, :].copy()

            self.P[i, :] += self.lr * (e * self.Q[j, :] - self.lmbda * self.P[i, :])
            self.Q[j, :] += self.lr * (e * P_i - self.lmbda * self.Q[j, :])

    def get_rating(self, i, j):
        """
        Predicted rating of user `i` for item `j`.
        """
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])

    def full_matrix(self):
        """
        Predicted ratings for every (user, item) pair.

        Returns
        - ndarray of shape (num_users, num_items)
        """
        # b_u is broadcast down the rows, b_i across the columns.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)

In [None]:
# Factorize the dense version of the user-item matrix with 16 latent
# dimensions, running 10 SGD epochs.
sgd = SGDMF(
    X=user_item_csr.toarray(),
    k=16,
    lr=0.1,
    lmbda=0.1,
    iterations=10,
)
# The returned per-epoch error history is shown as the cell output.
sgd.train()

In [None]:
# Full matrix of ratings predicted by the trained SGD model.
user_item_sgd = sgd.full_matrix()

In [None]:
# Top-left 10x10 corner of the original ratings, as a dense array.
user_item_csr[:10, :10].toarray()

In [None]:
# Same 10x10 corner of the SGD predictions, for side-by-side comparison.
user_item_sgd[:10, :10]

#### NMF

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html#sklearn.decomposition.NMF
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/decomposition/_nmf.py#L1096

In [None]:
# Non-negative matrix factorization of the user-item matrix with 16 components.
nmf = NMF(n_components=16, random_state=SEED)

# W is the transformed data returned by fit_transform; H is the components matrix.
W = nmf.fit_transform(user_item_csr)
H = nmf.components_

# Reconstruction of the full user-item matrix from the two factors.
user_item_nmf = W @ H

In [None]:
# Inspect the W factor and its shape.
W, W.shape

In [None]:
# Inspect the H factor and its shape.
H, H.shape

In [None]:
# Shape of the NMF reconstruction.
user_item_nmf.shape

In [None]:
# Top-left 10x10 corner of the original ratings, as a dense array.
user_item_csr[:10, :10].toarray()

In [None]:
# Same 10x10 corner of the NMF reconstruction, for side-by-side comparison.
user_item_nmf[:10, :10]