In [1]:
from google.colab import drive

In [2]:
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [3]:
!ls "/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit"

movies.csv  ratings.csv


In [5]:
!pip install lenskit



In [7]:
from lenskit.datasets import MovieLens
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn, user_knn, funksvd, bias
from lenskit.algorithms.basic import Popular
from lenskit.metrics.topn import bulk_impl

from lenskit import topn

import os
import pathlib
import json
import gzip
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [21]:
models_directory = '/content/gdrive/My Drive/msci720_prj/Models'

In [9]:
algo_classes = {
    # Item Item Explicit
    'IIEx_20_2_0000001':  (item_knn.ItemItem, ['nnbrs', 'min_nbrs', 'min_sim']),
    'IIEx_30_30_005': (item_knn.ItemItem, ['nnbrs', 'min_nbrs', 'min_sim']),

    # User User Explicit
    'UUEx_120_30_001': (user_knn.UserUser, ['nnbrs', 'min_nbrs', 'min_sim']),
    'UUEx_30_2_0000001':    (user_knn.UserUser, ['nnbrs', 'min_nbrs', 'min_sim']),

    # Item Item Implicit
    'IIIm_120_15_0001': (item_knn.ItemItem, ['nnbrs', 'min_nbrs', 'min_sim', 'feedback']),

    # User User Implicit
    'UUIm_30_2_001': (user_knn.UserUser, ['nnbrs', 'min_nbrs', 'min_sim', 'feedback']),

    # Non-kNN models
    'BiasedMF' : (als.BiasedMF, ['features']),
    'ImplicitMF': (als.ImplicitMF, ['features']),
    'Popular': (Popular, []), # While listed here, Popular is created directly in main()
    'Bias': (bias.Bias, ['damping']),

    # Item Item Implicit specific for question 6
    # IIIm_1_1_0001 : nnbrs = 1, min_nbrs = 1, min_sim = 0.001, feedback = "implicit"
    "IIIm_1_1_0001": (item_knn.ItemItem, ['nnbrs', 'min_nbrs', 'min_sim', 'feedback'])
}

In [10]:
def create_instance(algo_name, params):
    algo_class, constructor_args = algo_classes[algo_name]
    kwargs = {arg: params[arg] for arg in constructor_args if arg in params}
    return algo_class(**kwargs)

In [22]:
def main():
    explicit_dataset = MovieLens('/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-explicit/training_set_user11k')
    explicit_ratings = explicit_dataset.ratings
    explicit_ratings.drop_duplicates(inplace=True)

    implicit_dataset = MovieLens('/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit/training_set_user11k')
    implicit_ratings = implicit_dataset.ratings
    implicit_ratings.drop_duplicates(inplace=True)


    all_models = {}
    all_models['Popular'] = create_instance('Popular', None)

    for file in os.listdir(models_directory):
        if file.endswith('.json'):
            file_path = os.path.join(models_directory, file)
            with open(file_path, 'r') as json_file:
                data = json.load(json_file)
                algo_name = data.get('algo')
                params = {k: v for k, v in data.items() if k != 'algo'}
                model = create_instance(algo_name, params)
                all_models[algo_name] = model

    for algo_name, model in tqdm(all_models.items()):
        algo = util.clone(model)
        algo = Recommender.adapt(algo)

        if algo_name in ['IIIm_120_15_0001','UUIm_30_2_001','ImplicitMF','IIIm_1_1_0001']:
            algo.fit(implicit_ratings)
        else:
            algo.fit(explicit_ratings)

        out = f'/content/gdrive/My Drive/msci720_prj/lenskit-saved-models/{algo_name}.pkl.gz'
        pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
        with gzip.open(out, 'wb') as f:
            pickle.dump(algo, f, 4)

In [23]:
main()

  b = blocks[bi]
100%|██████████| 10/10 [26:54<00:00, 161.46s/it]


In [24]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [25]:
from lenskit.datasets import MovieLens
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn, user_knn, funksvd
from lenskit.algorithms.basic import Popular
from lenskit.metrics.topn import bulk_impl

from lenskit import topn

import os
import pathlib
import json
import gzip
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
import fnmatch
from collections import defaultdict

import logging

In [27]:
log_file = 'lenskit.log'
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

number_of_recs = 2000

In [28]:
def get_user_rated_movies( ratings_file ):
    """
    Takes in a movielens ratings csv and returns a dictionary
    from user_id to set of movie_ids.
    """

    user2movies = dict()

    with open( ratings_file, 'r' ) as r:
        header = r.readline()
        for line in r:
            line = line.removesuffix('\n')
            fields = line.split(',')
            if len(fields) != 4:
                print("wrong number of fields")
                exit()
            user_id  = int(fields[0])
            movie_id = int(fields[1])
            movies = user2movies.setdefault( user_id, set() )
            movies.add( movie_id )

    return user2movies

In [41]:
def pred_main():

    # We use lenskit to make recs for a given user, and lenskit
    # will correctly only return recs for items not already rated,
    # but for our implicit dataset, that is not sufficient.  The user's
    # train ratings are the ratings in the explicit dataset.  The implicit
    # data set will likely be missing some lower rated items that are
    # part of the user's train ratings.  So, we need to filter the recs
    # from lenskit to remove all of their train ratings and not only
    # the ones in the implicit dataset.
    train_explicit_ratings_file = '/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-explicit/training_set_user11k/ratings.csv'
    user2movies = get_user_rated_movies( train_explicit_ratings_file ) # movies are ints

    output_dir = "/content/gdrive/My Drive/msci720_prj/lenskit-recs"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # script_directory = os.path.dirname(os.path.abspath(__file__))
    script_directory = '/content/gdrive/My Drive/msci720_prj'
    userIds_file_path = os.path.join(script_directory, "userIds.json")
    with open(userIds_file_path, "r") as json_file:
        userIds = json.load(json_file) # loads these userIds as ints

    saved_folder_dir = pathlib.Path("/content/gdrive/My Drive/msci720_prj/lenskit-saved-models")
    saved_model_names = []
    for item in tqdm(saved_folder_dir.iterdir()):
        # model_name = str(item).split('/')[1].split('.')[0]
        model_name = str(item).split('/')[6].split('.')[0]
        results_file_name = os.path.join( output_dir, f'{model_name}.results' )

        with gzip.open(item, 'rb') as f:
            algo = pickle.load(f)

        # recs = batch.recommend(algo, userIds, n=number_of_recs, n_jobs=16)
        recs = batch.recommend(algo, userIds, n=number_of_recs, n_jobs=1)
        recs['algorithm'] = model_name
        recs['Q0'] = 'Q0'
        with open( results_file_name, 'a', newline='' ) as results_file:
            for user_id in userIds:
                user_rated_movieIds = user2movies[user_id]
                user_recs = recs[recs['user'] == user_id]
                user_recs = user_recs[~user_recs['item'].isin(user_rated_movieIds)] # recs 'item' is an int64

                user_recs.sort_values(by='score', ascending=False, inplace=True)
                user_recs = user_recs.head(100)
                user_recs['rank'] = np.arange( 1, len(user_recs) + 1 )

                user_recs.to_csv(results_file, sep=' ', columns = ['user', 'Q0', 'item', 'rank','score','algorithm'],index=False, header=False)


In [42]:
pred_main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recs.sort_values(by='score', ascending=False, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recs['rank'] = np.arange( 1, len(user_recs) + 1 )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recs['rank'] = np.arange( 1, len(user_recs) + 1 )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i