In [1]:
from scipy import sparse
from tqdm.notebook import tqdm
import numpy as np
import csv

In [2]:
class FactorizationMachine:
    def __init__(self, feature_dim: int, latent_size=3):
        self.w0 = np.zeros(1)
        self.w = np.zeros(feature_dim)
        self.V = np.zeros((feature_dim, latent_size))
    
    def train(self, data, epochs=5, batch_size=8, lr=1e-3, n_users=480189, n_movies=17770):
        data_len = data.shape[0]
        pbar = tqdm(total=data_len * epochs)
        for epoch in range(epochs):
            avg_loss = 0
            for batch in self.batchify(data, batch_size):
                x, y = self.collate_batch(batch, n_users, n_movies)
                loss = self.step(x, y, lr)
                avg_loss += loss
                pbar.update(len(y))
                pbar.set_description(f"Epoch {epoch}; Loss {loss:.4f}")
            avg_loss /= (len(data) // batch_size)
            print(f"Epoch {epoch}; Average Loss {avg_loss:.4f}")

    def step(self, x, y, lr):
        batch_size = x.shape[0]
        a = x @ self.V
        b = (x.power(2)) @ (self.V ** 2)
        c = x.dot(self.w)
        y_pred = self.w0 + c + 0.5 * (a ** 2 - b).sum(-1)
        diff = (y - y_pred)
        loss = np.mean(diff ** 2)
        
        dloss_dy_pred = - 2 * diff.reshape(-1, 1)
        
        dloss_dw0 = dloss_dy_pred.mean(axis=0)
        dloss_dw = np.ravel(x.multiply(dloss_dy_pred).mean(axis=0))
        
        dloss_dV = x.multiply(dloss_dy_pred).T @ a / batch_size
        dloss_dV = dloss_dV.T
        x2 = x.power(2)
        for i, row_ in enumerate(self.V.T):
            c_ = x2.multiply(row_).multiply(dloss_dy_pred).mean(0)
            dloss_dV[i] -= np.ravel(c_)
        
        self.w0 -= lr * dloss_dw0
        self.w -= lr * dloss_dw
        self.V -= lr * dloss_dV.T
        
        return loss
    
    def rmse_score(self, data, batch_size=8, n_users=480189, n_movies=17770):
        data_len = data.shape[0]
        cum_rmse = 0
        pbar = tqdm(desc="Calculating RMSE")
        for batch in self.batchify(data, batch_size):
            x, y = self.collate_batch(batch, n_users, n_movies)
            y_pred = self.predict(x)
            diff = (y - y_pred)
            cum_rmse += np.sqrt(np.mean(diff ** 2))
            pbar.update(len(y))
        
        return cum_rmse / (data_len // batch_size)
    
    def predict(self, x: sparse.csr_matrix):
        a = (x @ self.V) ** 2
        b = (x.power(2)) @ (self.V ** 2)
        c = x.dot(self.w)
        result = self.w0 + c + 0.5 * (a - b).sum(-1)
        return result
    
    @staticmethod
    def batchify(iterable, batch_size=32):
        for i in range(0, len(iterable), batch_size):
            span = slice(i, i + batch_size)
            yield iterable[span]
    
    @staticmethod
    def collate_batch(batch, n_users, n_movies):
        y, x_data, x_i, x_j = [], [], [], []
        users_and_movies = n_users + n_movies
        for i, (mid, uid, rate) in enumerate(batch):
            x_i.append(i)
            x_j.append(uid)
            x_data.append(1)
            x_i.append(i)
            x_j.append(n_users + mid)
            x_data.append(1)
            y.append(rate)
        x = sparse.coo_matrix((x_data, (x_i, x_j)), shape=(len(y), users_and_movies))
        return x.tocsr(), np.array(y)
        

In [3]:
def read_data(total_size=100480507):
    data = np.empty(shape=(total_size, 3), dtype=np.int)

    with open("data/full_data.csv") as file:
        reader = csv.reader(file)
        for i, sample in tqdm(enumerate(reader), total=total_size):
            data[i] = sample
    
    return data

In [4]:
total=100480507
n_users=480189
n_movies=17770

data = read_data()

HBox(children=(FloatProgress(value=0.0, max=100480507.0), HTML(value='')))




In [5]:
def kfold(x, k=5, shuffle=True):
    index = np.arange(len(x))
    if shuffle:
        np.random.shuffle(index)
    chunks = np.array_split(index, k)
    for i in range(k):
        train_index = np.concatenate(tuple(ch for j, ch in enumerate(chunks) if j != i))
        test_index = chunks[i]
        yield x[train_index], x[test_index]

In [6]:
batch_size = 100000
for i, (train_part, test_part) in enumerate(kfold(data), 1):
    print(f"Training {i} fold")
    fm = FactorizationMachine(n_users + n_movies, latent_size=5)
    fm.train(train_part, batch_size=batch_size, lr=1e-3)
    rmse = fm.rmse_score(test_part, batch_size=batch_size)
    print(f"RMSE on {i} fold: {rmse:.4f}\n")

Training 1 fold


HBox(children=(FloatProgress(value=0.0, max=401922025.0), HTML(value='')))

Epoch 0; Average Loss 5.0636
Epoch 1; Average Loss 1.3335
Epoch 2; Average Loss 1.1844
Epoch 3; Average Loss 1.1782
Epoch 4; Average Loss 1.1777


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Calculating RMSE', max=1.0, style=Progr…

RMSE on 1 fold: 1.0898

Training 2 fold


HBox(children=(FloatProgress(value=0.0, max=401922025.0), HTML(value='')))

Epoch 0; Average Loss 5.0635
Epoch 1; Average Loss 1.3334
Epoch 2; Average Loss 1.1844
Epoch 3; Average Loss 1.1782
Epoch 4; Average Loss 1.1777


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Calculating RMSE', max=1.0, style=Progr…

RMSE on 2 fold: 1.0898

Training 3 fold


HBox(children=(FloatProgress(value=0.0, max=401922030.0), HTML(value='')))

Epoch 0; Average Loss 5.0635
Epoch 1; Average Loss 1.3335
Epoch 2; Average Loss 1.1844
Epoch 3; Average Loss 1.1782
Epoch 4; Average Loss 1.1777


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Calculating RMSE', max=1.0, style=Progr…

RMSE on 3 fold: 1.0898

Training 4 fold


HBox(children=(FloatProgress(value=0.0, max=401922030.0), HTML(value='')))

Epoch 0; Average Loss 5.0632
Epoch 1; Average Loss 1.3334
Epoch 2; Average Loss 1.1843
Epoch 3; Average Loss 1.1781
Epoch 4; Average Loss 1.1776


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Calculating RMSE', max=1.0, style=Progr…

RMSE on 4 fold: 1.0900

Training 5 fold


HBox(children=(FloatProgress(value=0.0, max=401922030.0), HTML(value='')))

Epoch 0; Average Loss 5.0633
Epoch 1; Average Loss 1.3333
Epoch 2; Average Loss 1.1843
Epoch 3; Average Loss 1.1781
Epoch 4; Average Loss 1.1776


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Calculating RMSE', max=1.0, style=Progr…

RMSE on 5 fold: 1.0900



### Результаты

| Folds | MSE Train Loss | RMSE Test Loss |
|-------|----------------|----------------|
| 1     | 1.1777         | 1.0898         |
| 2     | 1.1777         | 1.0898         |
| 3     | 1.1777         | 1.0898         |
| 4     | 1.1776         | 1.0900         |
| 5     | 1.1776         | 1.0900         |