In [16]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [17]:
# Location of the "ml-latest-small" ratings CSV. The default is the original
# absolute path; set the ML_RATINGS_PATH environment variable to point the
# notebook at a copy elsewhere without editing this cell.
ratings_path = os.environ.get(
    "ML_RATINGS_PATH",
    "/Users/amitnarang/Downloads/ml-latest-small/ratings.csv",
)

In [18]:
# Load the ratings table. read_csv assigns the default 0..n-1 row index, which
# the split below relies on.
df = pd.read_csv(ratings_path, sep = ',')

# Not used anywhere in this cell; kept so other cells that may read them still work.
user_vector_matrix = dict()
movie_vector_matrix = dict()

columns = ['userId', 'movieId', 'rating']

# Deterministic hold-out: rows whose index is a multiple of 3 form the test set,
# every other row goes to the training set. A boolean mask does this in one
# vectorised pass instead of a Python-level loop over every row.
is_test_row = df.index % 3 == 0
test_df = df.loc[is_test_row, columns].reset_index(drop=True)
train_df = df.loc[~is_test_row, columns].reset_index(drop=True)

# Plain list-of-lists views of the two splits ([userId, movieId, rating] per row).
test_data = [list(record) for record in test_df.itertuples(index=False, name=None)]
train_data = [list(record) for record in train_df.itertuples(index=False, name=None)]

# Largest movieId seen in training; ALSStreamingModel sizes its movie matrix with it.
max_train_movie = int(train_df['movieId'].max())
print(df)
print(test_df)
print(train_df)

        userId  movieId  rating   timestamp
0            1        1     4.0   964982703
1            1        3     4.0   964981247
2            1        6     4.0   964982224
3            1       47     5.0   964983815
4            1       50     5.0   964982931
...        ...      ...     ...         ...
100831     610   166534     4.0  1493848402
100832     610   168248     5.0  1493850091
100833     610   168250     5.0  1494273047
100834     610   168252     5.0  1493846352
100835     610   170875     3.0  1493846415

[100836 rows x 4 columns]
       userId  movieId  rating
0           1        1     4.0
1           1       47     5.0
2           1      101     5.0
3           1      157     5.0
4           1      223     3.0
...       ...      ...     ...
33607     610   160527     4.5
33608     610   161582     4.0
33609     610   163937     3.5
33610     610   166528     4.0
33611     610   168250     5.0

[33612 rows x 3 columns]
       userId  movieId  rating
0           1   

In [33]:
class ALSStreamingModel:
    """User-factor model solved by ridge regression against fixed movie factors.

    The movie-factor matrix is drawn at random in ``__init__`` and is never
    updated by any method of this class; only the per-user factor vectors are
    solved for. Row ``i`` of ``movie_features`` (and column ``i`` of every
    rating vector) belongs to ``movieId == i + 1``.

    Attributes:
        l: ridge regularisation strength added to the Gram-matrix diagonal.
        num_features: number of latent features per user / movie.
        alpha: stored on the instance; no method of this class reads it.
        user_features: dict userId -> (1, num_features) solved user vector.
        movie_features: (num_movies, num_features) float array of movie factors.
        ratings: dict userId -> (1, num_movies) dense rating vector, with 0
            in every slot that has not been assigned a rating.
    """

    def __init__(self, l, num_features, alpha, num_movies=None):
        """Create an untrained model.

        Args:
            l: ridge regularisation strength.
            num_features: number of latent features.
            alpha: kept for interface compatibility; unused by this class.
            num_movies: number of movie slots. When omitted, the notebook-level
                global ``max_train_movie`` is used, as before.
        """
        self.l = l
        self.num_features = num_features
        self.alpha = alpha
        if num_movies is None:
            num_movies = max_train_movie
        self.user_features = dict()
        # Same random integer values in [0, 100) as before, stored as float so
        # the matrix products below do not run as (slow) integer matmuls.
        self.movie_features = np.random.randint(
            100, size=(num_movies, num_features)
        ).astype(float)
        self.ratings = dict()
        # (movie_features object, l, regularised Gram matrix) or None.
        self._gram_cache = None

    def fit(self, train):
        """Record every rating in ``train`` and solve each affected user once.

        ``train`` must expose ``itertuples()`` rows with ``userId``,
        ``movieId`` and ``rating`` attributes. Because the movie factors are
        fixed, a user's solution depends only on that user's final rating
        vector, so solving once per user after all ratings are recorded gives
        the same ``user_features`` as re-solving after every row.

        Returns:
            self, to allow chaining.
        """
        pending_users = dict()  # dict used as an insertion-ordered set
        for row in train.itertuples():
            self._record_rating(row.userId, row.movieId, row.rating)
            pending_users[row.userId] = None
        for userId in pending_users:
            self._solve_user(userId)
        return self

    def _als_step(self, ratings, solve_vecs, fixed_vecs, gram=None):
        """Solve the ridge-regularised least-squares step for one side.

        Computes ``ratings @ fixed_vecs @ inv(fixed_vecs.T @ fixed_vecs + l*I)``.

        Args:
            ratings: (k, num_items) rating rows.
            solve_vecs: previous value of the vectors being solved for; it is
                not read (the solution is closed-form) and is accepted only to
                keep the original signature.
            fixed_vecs: (num_items, num_features) factors held fixed.
            gram: optional precomputed ``fixed_vecs.T @ fixed_vecs + l*I``;
                computed here when omitted.

        Returns:
            (k, num_features) array of solved vectors.
        """
        if gram is None:
            gram = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.num_features) * self.l
        rhs = ratings.dot(fixed_vecs)
        # gram is symmetric, so solving gram @ X.T = rhs.T yields
        # X = rhs @ inv(gram) without forming the inverse explicitly.
        return np.linalg.solve(gram, rhs.T).T

    def _movie_gram(self):
        """Return the regularised Gram matrix of ``movie_features``, cached.

        The cache is keyed on the identity of the ``movie_features`` array and
        on ``l``; mutating ``movie_features`` in place is not detected, so
        assign a new array (or reset ``_gram_cache`` to None) after editing it.
        """
        cached = getattr(self, "_gram_cache", None)
        if (
            cached is not None
            and cached[0] is self.movie_features
            and cached[1] == self.l
        ):
            return cached[2]
        factors = np.asarray(self.movie_features, dtype=float)
        gram = factors.T.dot(factors) + np.eye(self.num_features) * self.l
        self._gram_cache = (self.movie_features, self.l, gram)
        return gram

    def _record_rating(self, userId, movieId, rating):
        """Store ``rating`` in the user's dense rating vector (created on demand).

        Raises:
            IndexError: if ``movieId`` is outside ``1..num_movies``. Checked
                explicitly so that ``movieId == 0`` cannot silently wrap around
                to the last slot.
        """
        num_movies = self.movie_features.shape[0]
        movie_index = int(movieId) - 1
        if not 0 <= movie_index < num_movies:
            raise IndexError(
                "movieId %r is outside the valid range 1..%d" % (movieId, num_movies)
            )
        rating_vector = self.ratings.get(userId)
        if rating_vector is None:
            rating_vector = np.zeros((1, num_movies))
            self.ratings[userId] = rating_vector
        rating_vector[0, movie_index] = rating

    def _solve_user(self, userId):
        """Re-solve and store the factor vector of one user from their ratings."""
        self.user_features[userId] = self._als_step(
            self.ratings[userId],
            self.user_features.get(userId),
            self.movie_features,
            gram=self._movie_gram(),
        )

    def update_user_vector(self, row):
        """Apply a single rating and immediately re-solve that user's vector.

        ``row`` must have ``userId``, ``movieId`` and ``rating`` attributes.
        """
        userId = row.userId
        self._record_rating(userId, row.movieId, row.rating)
        self._solve_user(userId)

    def predict_set(self, data):
        """Predict every row of ``data`` and return the MSE against its ratings.

        ``data`` must expose ``itertuples()`` rows with ``userId``,
        ``movieId`` and ``rating`` attributes.
        """
        correct_results = []
        predicted_results = []
        for row in data.itertuples():
            predicted_results.append(self.predict_rating(row.userId, row.movieId))
            correct_results.append(row.rating)

        return self.compute_mse(correct_results, predicted_results)

    def predict_rating(self, userId, movieId):
        """Predict one rating as a float clamped to [0, 5].

        Returns 0 when the user has no solved vector or when ``movieId`` is
        outside ``1..num_movies``. A NaN score is reported as 5, as before.
        """
        if userId not in self.user_features:
            return 0.0
        # Validate the id as a row position; `in` on an ndarray would instead
        # test whether the value occurs among the matrix elements.
        movie_index = int(movieId) - 1
        if not 0 <= movie_index < self.movie_features.shape[0]:
            return 0.0
        user_vector = np.ravel(self.user_features[userId])
        # Row movieId - 1, matching the slot the rating was recorded in.
        score = float(np.dot(user_vector, self.movie_features[movie_index]))
        if np.isnan(score) or score > 5:
            return 5.0
        if score < 0:
            return 0.0
        return score

    def compute_mse(self, y_true, y_pred):
        """Return the mean squared error between ``y_true`` and ``y_pred``.

        Every entry is included; zero predictions returned for unknown users
        or movies are not filtered out.
        """
        mse = mean_squared_error(np.asarray(y_true), np.asarray(y_pred))
        return mse

In [35]:
# Build the model (regularisation 0.01, 40 latent features, alpha 0.1) and train
# it on the training split; fit() returns the model itself, so it is chained.
als = ALSStreamingModel(l=.01, num_features=40, alpha=.1).fit(train_df)
# Report the MSE on the held-out split first, then on the training split.
print(als.predict_set(test_df), als.predict_set(train_df), sep="\n")

(1, 40) (193609, 40) (1, 193609)
(40, 40)
(1, 40) (193609, 40) (1, 193609)


KeyboardInterrupt: 