<a href="https://colab.research.google.com/github/eunyul24/eunyul24.github.io/blob/master/B_DS2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import csv

In [3]:
header = []
userId = []
movieId = []
ratings = []
test = []
rownum = -1

with open('/content/drive/My Drive/Colab Notebooks/ml-20m/ratings.csv','r') as f:
    data = csv.reader(f)
    for row in data:
        rownum += 1
        if rownum == 0:
            header = row
            continue
        if int(row[3]) < 1388502017: 
            userId.append(int(row[0]))
            movieId.append(int(row[1]))
            ratings.append(float(row[2]))
        else: test.append([int(row[0]), int(row[1]), float(row[2]), int(row[3])])
            
print(len(userId))
print(len(test))

19152913
847350


In [8]:
userIdx = dict()
for i, uid in enumerate(np.unique(userId)):
    userIdx[uid] = i

movieIdx = dict()
for i, mid in enumerate(np.unique(movieId)):
    movieIdx[mid] = i

X = np.zeros((len(ratings),2), dtype=int)
for i in range(len(userId)):
    X[i] = [userIdx[userId[i]], movieIdx[movieId[i]]]

In [9]:
class MatrixFactorization():
    def __init__(self, ratings, X, k = 10, learning_rate = 0.01, reg_param = 0.1, epochs = 20):
        """
        param R: ratings
        param X: userId, movieId
        param k: latent parameter
        param learning_rate: alpha on weight update
        param reg_param: beta on weight update
        param epochs: training epochs
        """

        self.ratings = ratings
        self.X = X
        self.num_users = len(np.unique(X[:, 0]))
        self.num_movies = len(np.unique(X[:, 1]))
        self.k = k
        self.learning_rate = learning_rate
        self.reg_param = reg_param
        self.epochs = epochs

    def fit(self):
        """
        training Matrix Factorization : Update matrix latent weight and bias
        
        return: training_process
        """

        # init latent features
        self.P = np.random.normal(size=(self.num_users, self.k))
        self.Q = np.random.normal(size=(self.num_movies, self.k))

        # init biases
        self.b = np.mean(self.ratings)
        self.b_P = np.zeros(self.num_users)
        self.b_Q = np.zeros(self.num_movies)

        # train while epochs
        self.training_process = []
        for epoch in range(self.epochs):
            for i,rating in enumerate(self.ratings):
                self.gradient_descent(self.X[i, 0], self.X[i, 1], rating)
            rmse = self.rmse()
            self.training_process.append((epoch,rmse))
            
            # print status
            if (epoch + 1) % 10 == 0:
                print("Iteration: %d ; RMSE = %.4f" % (epoch + 1, rmse))
        
        return self.training_process
    

    def rmse(self):
        """
        compute root mean square error
        
        return: rmse cost
        """
        
        error = 0
        for i,rating in enumerate(ratings):
            error += pow(rating - self.get_prediction(self.X[i, 0], self.X[i, 1]), 2)
        return np.sqrt(error)


    def gradient_descent(self, i, j, rating):
        """
        graident descent function

        param i: user index of matrix
        param j: item index of matrix
        param rating: rating of (i,j)
        """

        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self.b_P[i] += self.learning_rate * (error - self.reg_param * self.b_P[i])
        self.b_Q[j] += self.learning_rate * (error - self.reg_param * self.b_Q[j])

        # update latent feature
        self.P[i, :] += self.learning_rate * (error * self.Q[j, :] - self.reg_param * self.P[i, :])
        self.Q[j, :] += self.learning_rate * (error * self.P[i, :] - self.reg_param * self.Q[j, :])


    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        
        return: prediction of r_ij
        """
        return self.b + self.b_P[i] + self.b_Q[j] + self.P[i, :].dot(self.Q[j, :].T)

In [10]:
MF = MatrixFactorization(ratings, X)
training_process = MF.fit()

print("train RMSE:", MF.rmse())

train RMSE: {} 3781.8531457378017


In [None]:
f = open('/content/drive/My Drive/Colab Notebooks/ml-20m/B_results_DS2.csv', 'w', encoding='utf-8')
header[2] = 'predected rating'
wr = csv.writer(f)
wr.writerow(header)

error = 0

for uId, mId, rating, time in test:
    if uId in userIdx.keys() and mId in movieIdx.keys():
        predicted = MF.get_prediction(userIdx[uId], movieIdx[mId])
    elif not uId in userIdx.keys() and mId in movieIdx.keys():
        predicted = np.mean([ratings[i] for i in np.where(X[:, 1] == movieIdx[mId])[0]])
    elif uId in userIdx.keys() and not mId in movieIdx.keys():
        predicted = np.mean([ratings[i] for i in np.where(X[:, 0] == userIdx[uId])[0]])
    else:
        predicted = np.mean(ratings)

    error += pow(rating - predicted, 2)
    
    wr.writerow([uId, mId, predicted,time])

f.close()
print("test RMSE:", np.sqrt(error))