In [48]:
import numpy as np
import scipy.sparse as sp
import csv
import time
import pandas as pd
import scipy.linalg as la

## Read Data

In [61]:
# class CR_Data():
#     def __init__(self, csv_file="small.csv"):
#         self.csv_file = csv_file
#         df = pd.read_csv(csv_file)
#         users = set(df["user_id"])
#         tracks = set(df["trackname"])
#         self.user2idx, self.idx2user, self.track2idx, self.idx2track = {}, {}, {}, {}
#         #print(users, tracks)
#         for idx, user in enumerate(users):
#             self.user2idx[user] = idx
#             self.idx2user[idx] = user
#         for idx, track in enumerate(tracks):
#             self.track2idx[track] = idx
#             self.idx2track[idx] = track

#     def split_train_test(self, train_portion=0.8):
#         datas = []
#         with open(self.csv_file, 'rt', newline="", encoding='utf-8') as f:
#             reader = csv.reader(f)
#             assert tuple(next(reader)) == ("user_id","artistname","trackname","playlistname")
#             datas = [row for row in reader if len(row) == 4] # Filter invalid data
        
#         train_num = int(len(datas) * train_portion)
#         train_datas, test_datas = datas[:train_num], datas[train_num:]
#         train_data_sparse, test_data_sparse = self.build_sparse_matrix(train_datas), self.build_sparse_matrix(test_datas)
        
#         return train_data_sparse, test_data_sparse
    
#     def build_sparse_matrix(self, inputs):
#         rows, cols, data = [], [], []
#         for item in inputs:
#             user, _, track, _ = item
#             rows.append(self.user2idx[user])
#             cols.append(self.track2idx[track])
#             data.append(1)
#         X = sp.coo_matrix((data, (rows, cols)), shape=(len(self.user2idx), len(self.track2idx)))
        
#         return X

class CR_Data():
    def __init__(self, csv_file="small.csv"):
        self.csv_file = csv_file
        self.df = pd.read_csv(csv_file)
        self.df = self.df.drop(columns=["artistname", "playlistname"])
        users = set(self.df["user_id"])
        tracks = set(self.df["trackname"])
        self.user2idx, self.idx2user, self.track2idx, self.idx2track = {}, {}, {}, {}
        #print(users, tracks)
        for idx, user in enumerate(users):
            self.user2idx[user] = idx
            self.idx2user[idx] = user
        for idx, track in enumerate(tracks):
            self.track2idx[track] = idx
            self.idx2track[idx] = track
        
        self.df.user_id = self.df.user_id.apply(lambda x: self.user2idx[x])
        self.df.trackname = self.df.trackname.apply(lambda x: self.track2idx[x])
        self.df['freq'] = self.df.groupby(['trackname','user_id'])['trackname'].transform("count")
        self.df.drop_duplicates(inplace=True)
        self.df = self.df.reset_index().drop(columns="index")
        print(self.df)
        self.df = self.df.sample(frac=1) # shuffle
        print(self.df)

    def split_train_test(self, train_portion=0.8):
        train_num = int(len(self.df) * train_portion)
        train_datas, test_datas = self.df.iloc[:train_num], self.df.iloc[train_num:]
        train_data_sparse, test_data_sparse = self.build_sparse_matrix(train_datas), self.build_sparse_matrix(test_datas)
        
        return train_data_sparse, test_data_sparse
    
    def build_sparse_matrix(self, inputs):
        rows, cols, data = [], [], []
        for _, item in inputs.iterrows():
            user, track, freq = item['user_id'], item['trackname'], item['freq']
            rows.append(user)
            cols.append(track)
            data.append(freq)
        X = sp.coo_matrix((data, (rows, cols)), shape=(len(self.user2idx), len(self.track2idx)))
        
        return X

cr_data = CR_Data()
train_data, test_data = cr_data.split_train_test()
#print(train_data.toarray(), test_data.toarray())

       user_id  trackname  freq
0            1       6690     1
1            1       7706     1
2            1       9078     1
3            1       4188     1
4            1       4167     1
...        ...        ...   ...
10291        2        566     1
10292        2       8563     1
10293        2       6117     1
10294        2       4321     1
10295        2       8397     1

[10296 rows x 3 columns]
       user_id  trackname  freq
9753         8       3985     1
7196         8       3332     2
6218         8       2721     2
348          3       7230     1
10208        7       4945     1
...        ...        ...   ...
2374         8       6479     2
1796         4          5     1
7891         8       4673     1
6215         8       8465     2
6364         8       6732     1

[10296 rows x 3 columns]


In [53]:
def recommend(X, U, V):
    """Recommend a new movie for every user.

        args: 
            X : np.array[num_users, num_movies] -- the ratings matrix
            U : np.array[num_users, num_features] -- a matrix of features for each user
            V : np.array[num_movies,num_features] -- a matrix of features for each movie

        return: List[int] -- a list of movie Ids for each user
    """
    
    res = []
    pred = U @ V.T
    pred[np.where(X != 0)] = np.NINF
    
    return list(np.argmax(pred, axis=1))

def error(X, U, V):
    """ Compute the mean error of the observed ratings in X and their estimated values. 

        args: 
            X : np.array[num_users, num_movies] -- the ratings matrix
            U : np.array[num_users, num_features] -- a matrix of features for each user
            V : np.array[num_movies,num_features] -- a matrix of features for each movie

        return: float -- the mean squared error between non-zero entries of X and the ratings
            predicted by U and V; as this is an error and not a loss function, you do not need to include the
            regularizing terms.
        """

    pred = U @ V.T
    indices = np.where(X != 0)
    error = np.square(pred[indices] - X[indices]).mean()
    
    return error

In [57]:
def train(X_train, X_test, k, niters=12, lam=10., verbose=True):
    """ Train a collaborative filtering model. 
        Args: 
            X_train : np.array[num_users, num_movies] -- the training ratings matrix, assumed dense
            X_test : np.array[num_users, num_movies] -- the test ratings matrix, assumed dense
            k : int -- the number of features in the CF model
            niters : int -- number of iterations to run
            lam : float -- regularization parameter, shown as lambda
            verbose : boolean -- if true, print the error on train and test sets every few iterations 

        return : Tuple[U, V]
            U : np.array[num_users,  num_features] -- the user-feature matrix
            V : np.array[num_movies, num_features] -- the movie-feature matrix
    """
    # MODIFY THIS FUNCTION
    
    m, n = X_train.shape
    W = np.zeros([m, n])
    W[np.where(X_train != 0)] = 1
#     i_indices, j_indices = np.where(X_train != 0)
#     print(i_indices, j_indices)
#     print(m, n, len(i_indices), len(j_indices))
    U = np.random.normal(scale=1.0, size=(m, k))
    V = np.random.normal(scale=1.0, size=(k, n))
    I = np.identity(k)

    if verbose:
        print("| Time    | Iter  | Train Err | Test Err |")
        print("| ------- | ----- | --------- | -------- |")

    start_time = time.perf_counter()
    for e in range(niters):
        
        for j in range(n):
            i_indices = np.where(W[:, j] == 1)[0]
            V[:, j] = la.solve(U[i_indices, :].T.dot(U[i_indices, :]) + lam * I, U[i_indices, :].T.dot(X_train[i_indices, j]))

        for i in range(m):
            j_indices = np.where(W[i] == 1)[0]
            U[i] = la.solve(V[:, j_indices].dot(V[:, j_indices].T) + lam * I, V[:, j_indices].dot(X_train[i, j_indices]))
        
        if verbose: 
            print(f"| {time.perf_counter() - start_time: 7.3f} |{e+1: 6d} |{error(X_train, U, V.T):10.4f} |{error(X_test, U, V.T):9.4f} |")
    
    if verbose: 
        print("")
    #return U, V.T

In [64]:
train(train_data.toarray(), test_data.toarray(), 1, 20)

| Time    | Iter  | Train Err | Test Err |
| ------- | ----- | --------- | -------- |
|   0.758 |     1 |    1.2030 |   2.3584 |
|   1.516 |     2 |    0.1636 |   2.4011 |
|   2.260 |     3 |    0.1537 |   2.3794 |
|   2.993 |     4 |    0.1482 |   2.3603 |
|   3.731 |     5 |    0.1443 |   2.3465 |
|   4.461 |     6 |    0.1414 |   2.3377 |
|   5.189 |     7 |    0.1396 |   2.3321 |
|   5.915 |     8 |    0.1386 |   2.3286 |
|   6.640 |     9 |    0.1381 |   2.3262 |
|   7.405 |    10 |    0.1377 |   2.3245 |
|   8.168 |    11 |    0.1375 |   2.3234 |
|   8.916 |    12 |    0.1374 |   2.3227 |
|   9.646 |    13 |    0.1373 |   2.3222 |
|  10.379 |    14 |    0.1372 |   2.3218 |
|  11.108 |    15 |    0.1372 |   2.3216 |
|  11.829 |    16 |    0.1371 |   2.3214 |
|  12.544 |    17 |    0.1371 |   2.3213 |
|  13.330 |    18 |    0.1371 |   2.3212 |
|  14.078 |    19 |    0.1371 |   2.3211 |
|  14.819 |    20 |    0.1371 |   2.3211 |

