# Recommendation class

In [1]:
import numpy as np
import pandas as pd
import sys # can use sys to take command line arguments
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [26]:
class Recommender():
    '''
    What is this class all about - write a really good doc string here
    '''
    def __init__(self):
        '''
        what do we need to start out our recommender system
        '''
        pass

    def read_dataset(self, items_path='./data/items_clean.csv', reviews_path='./data/train_data.csv'):
        '''
        INPUTS:
        ------------
            items_path - (string) file path to the items, default='./items_clean.csv'
            reviews_path - (string) file path to the reviews, default='./train_data.csv'

        OUTPUTS:
        ------------
            items - (dataframe) item dataframe
            reviews - (dataframe) review dataframe
        '''

        # Read in the datasets
        items = pd.read_csv(items_path)
        reviews = pd.read_csv(reviews_path)

        del items['Unnamed: 0']
        del reviews['Unnamed: 0']

        print('items')
        print(items.head())
        print(items.shape)
        print('------------------------')
        print(' ')
        print('reviews')
        print(reviews.head())
        print(reviews.shape)
        print('------------------------')
        print(' ')

        return items, reviews

    def create_train_test(self,reviews, order_by, train_size_prct=0.8):
        '''
        INPUTS:
        ------------
            reviews - (pandas df) dataframe to split into train and test
            order_by - (string) column name to sort by
            train_size_prct - (float) - percentage of data used for training, default=0.8

        OUTPUTS:
        ------------
            training_df -  (pandas df) dataframe of the training set
            validation_df - (pandas df) dataframe of the test set
        '''

        # Define the train and test data size via train_size_prct
        training_size = int(reviews.shape[0] * train_size_prct)
        testing_size = reviews.shape[0] - training_size

        # Sort the reviews by date before splitting
        # use old data for training, new data for validation
        reviews_new = reviews.sort_values(order_by)
        training_df = reviews_new.head(training_size)
        validation_df = reviews_new.iloc[training_size:training_size+testing_size]

        print('reviews_new')
        print(reviews_new.head())
        print(reviews_new.shape)
        print('------------------------')
        print(' ')
        print('training_df')
        print(training_df.head())
        print(training_df.shape)
        print('------------------------')
        print(' ')
        print('validation_df')
        print(validation_df.head())
        print(validation_df.shape)
        print('------------------------')
        print(' ')


        return training_df, validation_df

    def fit(self,
            items_path='./data/items_clean.csv',
            reviews_path='./data/train_data.csv',
            order_by='date',
            train_size_prct=0.8,
            latent_features=15,
            learning_rate=0.005,
            iters=10
           ):

        ''' Fit the recommender engine to the dataset and
            save the results to pull from when you need to make predictions

        INPUTS:
        ------------
            items_path - (string) file path to the items, default='./items_clean.csv'
            reviews_path - (string) file path to the reviews, default='./train_data.csv'
            order_by - (string) column name to sort by
            train_size_prct - (float) - percentage of data used for training, default=0.8
            latent_features - (int) the number of latent features used, default=15,
            learning_rate - (float) the learning rate, default=0.005
            iters - (int) the number of iterations, default=10

        OUTPUTS:
        -------------
            user_mat - (numpy array) a user by latent feature matrix
            item_mat - (numpy array) a latent feature by item matrix

        '''
        # Read in item and review DataFrames
        items, reviews = self.read_dataset(items_path,reviews_path)

        # Hyperparameters: Number of latent features, lr, epochs
        latent_features = latent_features
        learning_rate = learning_rate
        iters = iters

        training_df, validation_df = self.create_train_test(reviews, order_by, train_size_prct)

        # Create user-by-item matrix as np array
        train_user_item = training_df[['user_id', 'movie_id', 'rating', 'timestamp']]
        train_data_df = train_user_item.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
        ratings_mat = np.array(train_data_df)
        self.ratings_mat = ratings_mat

        print('user-by-item matrix')
        print(ratings_mat)
        print(ratings_mat.shape)
        print('------------------------')
        print(' ')

        # Number of users and items in the user-by-item matrix
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        self.num_ratings = np.count_nonzero(~np.isnan(ratings_mat))

        print('number of users: ', self.n_users)
        print('number of items: ', self.n_items)
        print('number of non nan ratings: ', self.num_ratings)

        # Initialize the user and item matrices with random values
        user_mat = np.random.rand(self.n_users, latent_features)
        item_mat = np.random.rand(latent_features, self.n_items)

        print('U matrix (users) before training')
        print(user_mat)
        print(user_mat.shape)
        print('------------------------')
        print(' ')

        print('Vt matrix (items) before training')
        print(item_mat)
        print(item_mat.shape)
        print('------------------------')
        print(' ')

        # Initialize sse at 0 for first iteration
        sse_accum = 0

        # keep track of iteration and MSE
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

        # for each iteration
        for iteration in range(iters):

            # update our sse
            old_sse = sse_accum
            sse_accum = 0

            # For each user-item pair
            for i in range(self.n_users):
                for j in range(self.n_items):

                    # if the rating exists
                    if ratings_mat[i, j] > 0:

                        # compute the error as the actual minus the dot product of the user and item latent features
                        diff = ratings_mat[i, j] - np.dot(user_mat[i, :], item_mat[:, j])

                        # Keep track of the sum of squared errors for the matrix
                        sse_accum += diff**2

                        # update the values in each matrix in the direction of the gradient
                        for k in range(latent_features):
                            user_mat[i, k] += learning_rate * (2*diff*item_mat[k, j])
                            item_mat[k, j] += learning_rate * (2*diff*user_mat[i, k])


            # print results
            print("%d \t\t %f" % (iteration+1, sse_accum / self.num_ratings))

        # Validation
        print('Start validation ...')
        rmse, perc_rated, actual_v_pred, preds, acts = self.validation_comparison(validation_df, ratings_mat, user_mat=user_mat, item_mat=item_mat)
        print('rmse: ', rmse)
        print('perc_rated: ', perc_rated)
        print('actual_v_pred: ', actual_v_pred)

        self.plot_validation_results(rmse, perc_rated, actual_v_pred, preds, acts)

        print(' ')
        print('Saving user-by-item matrix as pickle ...')
        with open('ratings_mat.pkl','wb') as f:
            pickle.dump(ratings_mat, f)
        print('...done')
        print('------------------------')
        print(' ')

        print(' ')
        print('Saving user_mat as pickle ...')
        with open('user_mat.pkl','wb') as f:
            pickle.dump(user_mat, f)
        print('...done')
        print('------------------------')
        print(' ')

        print(' ')
        print('Saving item_mat as pickle ...')
        with open('item_mat.pkl','wb') as f:
            pickle.dump(item_mat, f)
        print('...done')
        print('------------------------')
        print(' ')

        return user_mat, item_mat, ratings_mat, training_df, validation_df

    def predict_rating(self, ratings_mat, user_matrix, item_matrix, user_id, item_id, load_mat=False):
        ''' makes predictions of a rating for a user on a item-user combo

        INPUTS:
        ------------
            user_matrix - user by latent factor matrix
            item_matrix - latent factor by item matrix
            user_id - the user_id from the reviews df
            item_id - the item_id according the items df

        OUTPUTS:
        ------------
            pred - the predicted rating for user_id-item_id according to FunkSVD
        '''
        if load_mat==True:
            ratings_mat, user_mat, item_mat, ratings_mat = self.load_matrices()

        # Create series of users and items in the right order
        user_ids_series = np.array(ratings_mat.index)
        item_ids_series = np.array(ratings_mat.columns)

        # User row and item Column
        user_row = np.where(user_ids_series == user_id)[0][0]
        item_col = np.where(item_ids_series == item_id)[0][0]

        # Take dot product of that row and column in U and V to make prediction
        pred = np.dot(user_matrix[user_row, :], item_matrix[:, item_col])

        return pred

    def validation_comparison(self, val_df, ratings_mat, user_mat, item_mat):
        '''
        INPUTS:
        ------------
            val_df - the validation dataset created in create_train_test
            user_mat - U matrix in FunkSVD
            item_mat - V matrix in FunkSVD

        OUTPUTS:
        ------------
            rmse - RMSE of how far off each value is from it's predicted value
            perc_rated - percent of predictions out of all possible that could be rated
            actual_v_pred - a 10 x 10 grid with counts for actual vs predicted values
        '''

        val_users = np.array(val_df['user_id'])
        val_items = np.array(val_df['movie_id'])
        val_ratings = np.array(val_df['rating'])

        sse = 0
        num_rated = 0
        preds, acts = [], []
        actual_v_pred = np.zeros((10,10))
        for idx in range(len(val_users)):
            print(idx)
            try:
                print('idx not null ', idx)
                pred = self.predict_rating(ratings_mat, user_mat, item_mat, val_users[idx], val_items[idx])
                sse += (val_ratings[idx] - pred)**2
                num_rated+=1
                preds.append(pred)
                acts.append(val_ratings[idx])
                actual_v_pred[11-int(val_ratings[idx]-1), int(round(pred)-1)]+=1

            except Exception as e:
                print(e)
                continue

        rmse = np.sqrt(sse/num_rated)
        perc_rated = num_rated/len(val_users)
        return rmse, perc_rated, actual_v_pred, preds, acts

    def plot_validation_results(self, rmse, perc_rated, actual_v_pred, preds, acts):
        # How well did we do?
        print(rmse, perc_rated)
        sns.heatmap(actual_v_pred);
        plt.xticks(np.arange(10), np.arange(1,11));
        plt.yticks(np.arange(10), np.arange(1,11));
        plt.xlabel("Predicted Values");
        plt.ylabel("Actual Values");
        plt.title("Actual vs. Predicted Values");

    def load_matrices(self, ratings_mat_path='ratings_mat.pkl', user_mat_path='user_mat.pkl', item_mat_path='movie_mat.pkl'):

        with open(ratings_mat_path,'rb') as f:
            ratings_mat = pickle.load(f)
        print('Shape of user_mat')
        print(ratings_mat.shape)
        print('------------------------')
        print(' ')

        with open(user_mat_path,'rb') as f:
            user_mat = pickle.load(f)
        print('Shape of user_mat')
        print(user_mat.shape)
        print('------------------------')
        print(' ')

        with open(item_mat_path,'rb') as f:
            item_mat = pickle.load(f)
        print('Shape of user_mat')
        print(item_mat.shape)
        print('------------------------')
        print(' ')

        return ratings_mat, user_mat, item_mat


    def find_similar_items(self, item_id):
        '''
        INPUTS:
        ------------
            item_id - a item_id

        OUTPUTS:
        ------------
            similar_items - an array of the most similar items by title
        '''

        # find the row of each item id
        item_idx = np.where(items['item_id'] == item_id)[0][0]

        # find the most similar item indices - to start I said they need to be the same for all content
        similar_idxs = np.where(dot_prod_items[item_idx] == np.max(dot_prod_items[item_idx]))[0]

        # pull the item titles based on the indices
        similar_items = np.array(items.iloc[similar_idxs, ]['item'])

        return similar_items

    def get_item_names(self, item_ids):
        '''
        INPUTS:
        ------------
            item_ids - a list of item_ids

        OUTPUT:
        ------------
            items - a list of item names associated with the item_ids
        '''

        item_lst = list(items[items['item_id'].isin(item_ids)]['item'])

        return item_lst



    def create_ranked_df(self, items, reviews):
        '''
        INPUTS:
        ------------
            items - the items dataframe
            reviews - the reviews dataframe

        OUTPUT:
        ------------
            ranked_items - a dataframe with items that are sorted by highest avg rating, more reviews,
                        then time, and must have more than 4 ratings
        '''

        # Pull the average ratings and number of ratings for each item
        item_ratings = reviews.groupby('item_id')['rating']
        avg_ratings = item_ratings.mean()
        num_ratings = item_ratings.count()
        last_rating = pd.DataFrame(reviews.groupby('item_id').max()['date'])
        last_rating.columns = ['last_rating']

        # Add Dates
        rating_count_df = pd.DataFrame({'avg_rating': avg_ratings, 'num_ratings': num_ratings})
        rating_count_df = rating_count_df.join(last_rating)

        # merge with the items dataset
        item_recs = items.set_index('item_id').join(rating_count_df)

        # sort by top avg rating and number of ratings
        ranked_items = item_recs.sort_values(['avg_rating', 'num_ratings', 'last_rating'], ascending=False)

        # for edge cases - subset the item list to those with only 5 or more reviews
        ranked_items = ranked_items[ranked_items['num_ratings'] > 4]

        return ranked_items


    def popular_recommendations(self, user_id, n_top, ranked_items):
        '''
        INPUT:
        ------------
            user_id - the user_id (str) of the individual you are making recommendations for
            n_top - an integer of the number recommendations you want back
            ranked_items - a pandas dataframe of the already ranked items based on avg rating, count, and time

        OUTPUTS:
        ------------
            top_items - a list of the n_top recommended items by item title in order best to worst
        '''

        top_items = list(ranked_items['item'][:n_top])

        return top_items



    def start_prediction(self):
        user_mat, item_mat = self.load_matrices()


    def make_recs(self, _id, train_data, train_df, items, user_mat, item_mat, _id_type='item', rec_num=5):
        '''
        INPUTS:
        ------------
            _id - either a user or item id (int)
            _id_type - "item" or "user" (str)
            train_data - dataframe of data as user-item matrix
            train_df - dataframe of training data reviews
            items - items df
            user_mat - the U matrix of matrix factorization
            item_mat - the V matrix of matrix factorization
            rec_num - number of recommendations to return (int)

        OUTPUTS:
        ------------
            recs - (array) a list or numpy array of recommended items like the
                    given item, or recs for a user_id given
        '''

        # if the user is available from the matrix factorization data,
        # I will use this and rank items based on the predicted values
        # For use with user indexing
        val_users = train_data_df.index
        rec_ids = create_ranked_df(items, train_df)

        if _id_type == 'user':
            if _id in train_data.index:
                # Get the index of which row the user is in for use in U matrix
                idx = np.where(val_users == _id)[0][0]

                # take the dot product of that row and the V matrix
                preds = np.dot(user_mat[idx,:],item_mat)

                # pull the top items according to the prediction
                indices = preds.argsort()[-rec_num:][::-1] #indices
                rec_ids = train_data_df.columns[indices]
                rec_names = get_item_names(rec_ids)

            else:
                # if we don't have this user, give just top ratings back
                ranked_items = create_ranked_df(items, reviews)
                rec_names = popular_recommendations(_id, rec_num, ranked_items)

        # Find similar items if it is a item that is passed
        else:
            rec_ids = find_similar_items(_id)
            rec_names = get_item_names(rec_ids)

        return rec_ids, rec_names



if __name__ == '__main__':
    # test different parts to make sure it works
    pass


In [30]:
rec = Recommender()
user_mat, item_mat, ratings_mat, training_df, validation_df = rec.fit(items_path='./data/movies_clean.csv', 
        reviews_path='./data/train_data.csv',
        order_by='date',
        train_size_prct=0.8,
        latent_features=15, 
        learning_rate=0.005, 
        iters=3
        )

items
   movie_id                                              movie  \
0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1        10                La sortie des usines Lumière (1895)   
2        12                      The Arrival of a Train (1896)   
3        25  The Oxford and Cambridge University Boat Race ...   
4        91                         Le manoir du diable (1896)   

               genre  date  1800's  1900's  2000's  History  News  Horror  \
0  Documentary|Short  1894       1       0       0        0     0       0   
1  Documentary|Short  1895       1       0       0        0     0       0   
2  Documentary|Short  1896       1       0       0        0     0       0   
3                NaN  1895       1       0       0        0     0       0   
4       Short|Horror  1896       1       0       0        0     0       1   

   ...  Fantasy  Romance  Game-Show  Action  Documentary  Animation  Comedy  \
0  ...        0        0          0       0            

1 		 10.846503
2 		 6.243550
3 		 4.375915
Start validation ...
0
idx not null  0
'numpy.ndarray' object has no attribute 'index'
1
idx not null  1
'numpy.ndarray' object has no attribute 'index'
2
idx not null  2
'numpy.ndarray' object has no attribute 'index'
3
idx not null  3
'numpy.ndarray' object has no attribute 'index'
4
idx not null  4
'numpy.ndarray' object has no attribute 'index'
5
idx not null  5
'numpy.ndarray' object has no attribute 'index'
6
idx not null  6
'numpy.ndarray' object has no attribute 'index'
7
idx not null  7
'numpy.ndarray' object has no attribute 'index'
8
idx not null  8
'numpy.ndarray' object has no attribute 'index'
9
idx not null  9
'numpy.ndarray' object has no attribute 'index'
10
idx not null  10
'numpy.ndarray' object has no attribute 'index'
11
idx not null  11
'numpy.ndarray' object has no attribute 'index'
12
idx not null  12
'numpy.ndarray' object has no attribute 'index'
13
idx not null  13
'numpy.ndarray' object has no attribute 'index'
14
i

idx not null  559
'numpy.ndarray' object has no attribute 'index'
560
idx not null  560
'numpy.ndarray' object has no attribute 'index'
561
idx not null  561
'numpy.ndarray' object has no attribute 'index'
562
idx not null  562
'numpy.ndarray' object has no attribute 'index'
563
idx not null  563
'numpy.ndarray' object has no attribute 'index'
564
idx not null  564
'numpy.ndarray' object has no attribute 'index'
565
idx not null  565
'numpy.ndarray' object has no attribute 'index'
566
idx not null  566
'numpy.ndarray' object has no attribute 'index'
567
idx not null  567
'numpy.ndarray' object has no attribute 'index'
568
idx not null  568
'numpy.ndarray' object has no attribute 'index'
569
idx not null  569
'numpy.ndarray' object has no attribute 'index'
570
idx not null  570
'numpy.ndarray' object has no attribute 'index'
571
idx not null  571
'numpy.ndarray' object has no attribute 'index'
572
idx not null  572
'numpy.ndarray' object has no attribute 'index'
573
idx not null  573
'n

idx not null  1254
'numpy.ndarray' object has no attribute 'index'
1255
idx not null  1255
'numpy.ndarray' object has no attribute 'index'
1256
idx not null  1256
'numpy.ndarray' object has no attribute 'index'
1257
idx not null  1257
'numpy.ndarray' object has no attribute 'index'
1258
idx not null  1258
'numpy.ndarray' object has no attribute 'index'
1259
idx not null  1259
'numpy.ndarray' object has no attribute 'index'
1260
idx not null  1260
'numpy.ndarray' object has no attribute 'index'
1261
idx not null  1261
'numpy.ndarray' object has no attribute 'index'
1262
idx not null  1262
'numpy.ndarray' object has no attribute 'index'
1263
idx not null  1263
'numpy.ndarray' object has no attribute 'index'
1264
idx not null  1264
'numpy.ndarray' object has no attribute 'index'
1265
idx not null  1265
'numpy.ndarray' object has no attribute 'index'
1266
idx not null  1266
'numpy.ndarray' object has no attribute 'index'
1267
idx not null  1267
'numpy.ndarray' object has no attribute 'inde

ZeroDivisionError: division by zero

In [19]:
rec.make_recs(48, 'user')

TypeError: make_recs() missing 3 required positional arguments: 'train_df', 'items', and 'user_mat'

In [29]:
rec.make_recs(48, ratings_mat, training_df, items, user_mat, item_mat, _id_type='user', rec_num=5)

NameError: name 'ratings_mat' is not defined