In [77]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import torch
from utils import *

In [78]:
def _read_df_in_format(root):
    def reformat_id(id):
    # split and reformat the df
        row, col = id.split('_')
        return int(row[1:]), int(col[1:])
    df = pd.read_csv(root)
    df['row'], df['col'] = zip(*df['Id'].map(reformat_id))
    df.drop('Id', axis=1, inplace=True)
    return df

def store_dense_matrix_to_submission(sub_sample_path, store_path, data_matrix,clip_min=1, clip_max=5):
    """Write the predictions requested by a sample-submission file to a CSV.

    The sample file lists the wanted cells as ``r<row>_c<col>`` ids (1-based).
    For each of them the matching entry of ``data_matrix`` is looked up,
    clipped to ``[clip_min, clip_max]`` and written next to its id.

    Args:
        sub_sample_path: CSV with an ``Id`` column naming the requested cells.
        store_path: Destination of the generated CSV.
        data_matrix: 2-D array-like of predictions, indexed ``[row - 1, col - 1]``.
        clip_min: Lower bound applied to every prediction.
        clip_max: Upper bound applied to every prediction.

    Returns:
        None. The file at ``store_path`` gets the columns ``Id`` and
        ``Prediction`` and no index column.
    """
    df = _read_df_in_format(sub_sample_path)
    # Ids in the file start at 1, array positions at 0.
    row_id = df['row'].to_numpy() - 1
    col_id = df['col'].to_numpy() - 1
    # np.asarray makes the paired fancy indexing below valid for any
    # array-like input, not only for ndarrays.
    data_matrix = np.clip(np.asarray(data_matrix), clip_min, clip_max)
    df['Prediction'] = data_matrix[row_id, col_id]

    # Rebuild the ids column-wise; this produces the same strings as formatting
    # each record individually but without a Python-level loop over all rows.
    df['Id'] = 'r' + df['row'].astype(str) + '_c' + df['col'].astype(str)
    df.to_csv(store_path, columns=['Id', 'Prediction'], index=False)

def _convert_df_to_matrix(df):
    n_row = df['row'].max()
    n_col = df['col'].max()
    # print(n_row)
    # print(n_col)
    row_id = df['row'].to_numpy() - 1 # id starts from 1
    col_id = df['col'].to_numpy() - 1

    data_matrix = np.zeros((n_row, n_col))
    data_matrix[:] = np.nan
    # Check! Data type could cause rounding errors!
    data_matrix[row_id, col_id] = df['Prediction']
    is_provided = data_matrix!=0
    return data_matrix, is_provided

In [89]:
class Baseline:
    """Matrix-completion baselines: truncated SVD, iterative soft-thresholded
    SVD and alternating least squares (ALS).

    The constructor only records hyper-parameters as attributes. ``IterSVD``
    and ``ALS`` are driven by their own arguments; ``ALS`` additionally reads
    ``self.num_movies`` and ``self.rank_svd`` for its SVD-based initialisation.
    """

    def __init__(self, rank_svd = 9, rank_als = 3, num_iterations = 20, lambda_als = 0.1):
        """Store the hyper-parameters on the instance.

        Args:
            rank_svd: Number of singular values ``ALS`` asks ``SVD`` to keep.
            rank_als: Stored as ``self.rank_als``.
            num_iterations: Stored as ``self.num_iterations``.
            lambda_als: Stored as ``self.lambda_als``.
        """
        self.rank_svd = rank_svd
        self.rank_als = rank_als
        self.num_iterations = num_iterations
        self.lambda_als = lambda_als
        # Side length of the square singular-value matrix ALS requests from SVD.
        self.num_movies = 1000

    def SVD(self, A, num_movies, k=9):
        """Decompose ``A`` and keep its ``k`` leading singular values.

        Args:
            A: 2-D array to decompose.
            num_movies: Side length of the returned square matrix ``S``.
            k: Number of leading singular values copied into ``S``. Values
                larger than the number of singular values of ``A`` (or than
                ``num_movies``) are reduced to that limit.

        Returns:
            ``(U, S, Vt)`` where ``U`` and ``Vt`` are the reduced factors from
            ``np.linalg.svd(A, full_matrices=False)`` and ``S`` is a
            ``(num_movies, num_movies)`` array that is zero except for the
            kept singular values on its leading diagonal.
        """
        U, s, Vt = np.linalg.svd(A, full_matrices=False)

        # Keep the top-k singular values. Without the clamp the assignment
        # below fails with a shape mismatch as soon as k exceeds len(s).
        k = min(k, s.shape[0], num_movies)
        S = np.zeros((num_movies, num_movies))
        S[:k, :k] = np.diag(s[:k])

        return U, S, Vt

    def IterSVD(self, A, mask_A, shrinkage=38, n_itr=15):
        """Impute the unmasked entries of ``A`` by repeated singular-value shrinkage.

        Each iteration decomposes the current estimate, subtracts ``shrinkage``
        from every singular value (flooring at 0), rebuilds the matrix and then
        resets the masked entries to their values from ``A``. A progress line
        is printed after every iteration.

        Args:
            A: 2-D array holding the known values.
            mask_A: Array of the same shape; truthy where ``A`` is known.
            shrinkage: Amount subtracted from each singular value.
            n_itr: Number of iterations.

        Returns:
            A new array of the same shape as ``A``; ``A`` itself is not modified.
        """
        # A 0/1 integer mask would otherwise be interpreted as row indices.
        mask_A = np.asarray(mask_A, dtype=bool)
        X = A.copy()
        for i in range(n_itr):
            U, s, Vt = np.linalg.svd(X, full_matrices=False)
            s_ = (s - shrinkage).clip(min=0)
            # Scaling the columns of U equals U @ diag(s_) without building
            # the diagonal matrix.
            X = (U * s_) @ Vt
            X[mask_A] = A[mask_A]
            print("%sth iteration is complete." % i)

        return X

    def ALS(self, A, mask_A, k=3, n_itr=20, lambda_=0.1):
        """Fit ``A ~ U @ V`` on the masked entries with ridge-regularised ALS.

        ``U`` and ``V`` start from the first ``k`` left/right singular vectors
        of ``A``. Every iteration solves one regularised least-squares system
        per row of ``A`` (updating ``U``) and then one per column (updating
        ``V``), printing the mean squared error over the masked entries after
        each half-step. The rank, iteration count and regularisation come from
        this method's arguments, not from the constructor's attributes.

        Args:
            A: 2-D array of shape ``(n, m)``.
            mask_A: Array of shape ``(n, m)``; entries act as per-cell weights
                (``True``/1 = use the cell, ``False``/0 = ignore it).
            k: Rank of the factorisation.
            n_itr: Number of full U/V sweeps.
            lambda_: Ridge coefficient added to the diagonal of each system.

        Returns:
            ``(U, V)`` with shapes ``(n, k)`` and ``(k, m)``.

        Raises:
            ValueError: If ``k`` is larger than ``min(n, m)``, i.e. more factors
                are requested than the SVD initialisation can supply.
        """
        print("Initializing ALS")
        n, m = A.shape
        if k > min(n, m):
            raise ValueError(
                "k=%d exceeds the smaller dimension of A (%d)" % (k, min(n, m))
            )
        U, S, Vt = self.SVD(A, self.num_movies, self.rank_svd)
        U = np.copy(U[:,:k])
        V = np.copy(Vt[:k,:])

        # Loop invariants: the ridge term and the number of fitted cells.
        ridge = lambda_ * np.eye(k)
        n_observed = np.sum(mask_A)

        print("Starting Iterations")

        for sweep in range(n_itr):
            for i, Ri in enumerate(mask_A):
                lhs = V@(Ri[:,None] * V.T) + ridge
                # Elementwise masking replaces the dense m x m np.diag(Ri)
                # product and mirrors the column update below.
                rhs = V@(Ri * A[i])
                U[i] = np.linalg.solve(lhs, rhs)
            print("Error after solving for U matrix:", np.sum((mask_A * (A - np.dot(U, V))) ** 2) / n_observed)

            for j, Rj in enumerate(mask_A.T):
                lhs = U.T@(Rj[:,None] * U) + ridge
                rhs = U.T@(Rj * A[:, j])
                V[:,j] = np.linalg.solve(lhs, rhs)
            print("Error after solving for V matrix:", np.sum((mask_A * (A - np.dot(U, V))) ** 2) / n_observed)
            print("%sth iteration is complete." % sweep)

        return U, V

    def predict(self, U, V, col_mean, col_std):
        """Return ``de_norm(U @ V, col_mean, col_std)``.

        ``de_norm`` is not defined in this notebook; it is presumably provided
        by the ``from utils import *`` import and undoes the normalisation
        described by ``col_mean`` / ``col_std`` - TODO confirm against utils.
        """
        return de_norm(U.dot(V), col_mean, col_std)


In [80]:
# One-off preprocessing, kept for reference: it parses the raw ratings file into a
# dense matrix and writes it to 'data_train_matrix.csv', the file loaded below.
# df = _read_df_in_format('data_train.csv')
# df = _convert_df_to_matrix(df)[0]
# df = pd.DataFrame(df)
# df.to_csv('data_train_matrix.csv', index = False)

# Load the cached dense matrix instead of rebuilding it from the raw records.
df = pd.read_csv('data_train_matrix.csv')

In [81]:
# Normalise the ratings. stand_norm is not defined in this notebook (presumably it
# comes from `utils` and leaves missing entries as NaN - TODO confirm).
A, mean, std = stand_norm(df)
A = A.to_numpy()
# Build the mask from the NaNs *before* zero-filling them: testing `A != 0` after
# the fill would also discard observed entries whose normalised value is exactly 0.
mask_A = ~np.isnan(A)
A[~mask_A] = 0

In [91]:
# Model with the constructor defaults (rank_svd=9, rank_als=3, num_iterations=20, lambda_als=0.1).
bsl = Baseline()


In [None]:
# Fill in the unobserved entries with iterative soft-thresholded SVD; the entries
# flagged in mask_A are reset to their values from A after every iteration.
X = bsl.IterSVD(A, mask_A, shrinkage=38, n_itr=15)

In [93]:
# Factorise the imputed matrix X into rank-3 factors with ALS; only the entries
# flagged in mask_A contribute to the least-squares systems and the printed error.
U, V = bsl.ALS(X, mask_A, k=3, n_itr=20, lambda_=0.1)

Initializing ALS
Starting Iterations
Error after solving for U matrix: 0.8888921198541139
Error after solving for V matrix: 0.8742462777284741
0th iteration is complete.
Error after solving for U matrix: 0.8743980928917027
Error after solving for V matrix: 0.8695334240925179
1th iteration is complete.
Error after solving for U matrix: 0.8705748990903635
Error after solving for V matrix: 0.8679736743098979
2th iteration is complete.
Error after solving for U matrix: 0.868867500645511
Error after solving for V matrix: 0.8672281836995291
3th iteration is complete.
Error after solving for U matrix: 0.8679377591174855
Error after solving for V matrix: 0.8668076901508284
4th iteration is complete.
Error after solving for U matrix: 0.8673710812670917
Error after solving for V matrix: 0.866545105959936
5th iteration is complete.
Error after solving for U matrix: 0.8669988332069211
Error after solving for V matrix: 0.8663692750273488
6th iteration is complete.
Error after solving for U matrix: 

In [94]:
# Rebuild the prediction matrix from the factors, round it, and export the
# requested cells (clipped to the 1..5 range) as the submission file.
predictions = np.round(bsl.predict(U, V, mean, std))
store_dense_matrix_to_submission(
    sub_sample_path='sampleSubmission.csv',
    store_path='submission.csv',
    data_matrix=predictions,
    clip_min=1,
    clip_max=5,
)

In [32]:
# NOTE(review): this cell rebinds `A` - the matrix the cells above feed to the
# model - to a random (1000, 10000, 3) array, and its execution count (32) is lower
# than that of the surrounding cells. It looks like leftover scratch work; confirm
# and remove it so a top-to-bottom run does not overwrite `A`.
A = np.random.random((1000,10000,3))
