# Lenskit ALS (Extension 1)

In [1]:
import os
import time

import dask.dataframe as dd
import lenskit
import numpy as np
import pandas as pd
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import als, Recommender
from lenskit.metrics.topn import precision, ndcg
from sklearn.preprocessing import LabelEncoder

## Load Data

In [26]:
# Root directory that holds the parquet datasets. It defaults to the original
# author's checkout; set the RECSYS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'RECSYS_DATA_DIR',
    '/Users/choi/git/NYU/DSGA1004 Big Data/final-project-group-2',
)

# Small and full variants of the train/validation splits, plus the test split.
train_path = os.path.join(DATA_DIR, 'recommender_train_small')
train_full_path = os.path.join(DATA_DIR, 'recommender_train')

val_path = os.path.join(DATA_DIR, 'recommender_val_small')
val_full_path = os.path.join(DATA_DIR, 'recommender_val')

test_path = os.path.join(DATA_DIR, 'recommender_test')

# Read each parquet dataset with Dask and materialize it as a pandas DataFrame.
train = dd.read_parquet(train_path).compute()
train_full = dd.read_parquet(train_full_path).compute()

val = dd.read_parquet(val_path).compute()
val_full = dd.read_parquet(val_full_path).compute()

test = dd.read_parquet(test_path).compute()

## Preprocess Data

In [27]:
# Rename the raw columns to the user / item / rating names that the rest of
# the notebook relies on (e.g. `true.user` and `groupby("user")["item"]`).
column_names = {'user_id': 'user', 'recording_msid': 'item', 'count': 'rating'}

# Small splits
train, val = (frame.rename(columns=column_names) for frame in (train, val))

# Full splits
train_full, val_full = (frame.rename(columns=column_names) for frame in (train_full, val_full))

# Test split
test = test.rename(columns=column_names)

In [33]:
# Find unique items present in both train and test DataFrames
unique_items_train = set(train_full['item'].unique())
unique_items_test = set(test['item'].unique())
common_items = unique_items_train.intersection(unique_items_test)

# Fit the item encoder on the full training items only, so every integer id
# corresponds to an item that appears in the training data.
le = LabelEncoder()
le.fit(train_full['item'])

# Keep only test rows whose item also appears in training. The explicit .copy()
# makes test_filtered an independent DataFrame, so assigning to its 'item'
# column below no longer raises SettingWithCopyWarning.
test_filtered = test[test['item'].isin(common_items)].copy()

# Convert string recording_msid to index (int).
# NOTE: train_full and val_full are overwritten in place, so this cell is not
# idempotent -- reload the data before running it a second time.
test_filtered['item'] = le.transform(test_filtered['item'])
train_full['item'] = le.transform(train_full['item'])
val_full['item'] = le.transform(val_full['item'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_filtered['item'] = le.transform(test_filtered['item'])


In [35]:
# Small split: map string recording_msid values to integer indices.
# A fresh encoder is fitted on the small training items; note that this rebinds
# `le`, replacing the encoder that was fitted on the full training set.
le = LabelEncoder().fit(train['item'])
train['item'], val['item'] = le.transform(train['item']), le.transform(val['item'])

## ALS Function

In [7]:
def ALS_lenskit(train, true, features=200, reg=1, iterations=15, weight=10, n_recs=100):
    """Fit a LensKit implicit-feedback ALS model and recommend items per user.

    The defaults reproduce the configuration originally hard-coded in this
    notebook, so existing two-argument calls behave exactly as before.

    Parameters
    ----------
    train : DataFrame
        Training interactions passed to ``fit``.
    true : DataFrame
        Held-out interactions; only its ``user`` column is read, to decide
        which users receive recommendations.
    features : int
        First positional argument of ``als.ImplicitMF``.
    reg, iterations, weight
        Forwarded to ``als.ImplicitMF`` under the same keyword names.
    n_recs : int
        Number of recommendations requested per user.

    Returns
    -------
    The result of ``batch.recommend`` for the unique users in ``true``.
    """
    start = time.time()

    # Define the LensKit ALS model
    lk_als = als.ImplicitMF(
        features,
        reg=reg,
        iterations=iterations,
        weight=weight,
        use_ratings=True,
        progress=None,
    )

    # Wrap the model so it can produce top-N recommendations, then train it
    model = Recommender.adapt(lk_als)
    model.fit(train)

    # Recommend for every distinct user in the held-out data
    pred = batch.recommend(model, true.user.unique(), n_recs)

    print('total elapsed time: ', time.time() - start)

    return pred

## Evaluation Function

In [8]:
def mapAtK(pred, true, k = 100):
    """Mean average precision at ``k`` over the users present in ``true``.

    Parameters
    ----------
    pred : DataFrame
        Recommendations with ``user`` and ``item`` columns; the row order
        within each user is taken as the ranking order.
    true : DataFrame
        Ground-truth interactions with ``user`` and ``item`` columns.
    k : int
        Only the first ``k`` recommendations per user are scored.

    Returns
    -------
    The mean of the per-user average precision, rounded to 4 decimals. Each
    user's precision sum is divided by that user's number of relevant items;
    users with no hits (or no recommendations at all) contribute 0.
    """
    # user -> set of relevant items, and user -> ordered list of recommendations
    truth_by_user = true.groupby("user")["item"].apply(set).to_dict()
    recs_by_user = pred.groupby("user")["item"].apply(list).to_dict()

    def average_precision(user, wanted):
        # 1 where the recommendation at that position is relevant, else 0
        top_k = recs_by_user.get(user, [])[:k]
        hit_flags = [int(candidate in wanted) for candidate in top_k]
        if np.sum(hit_flags) > 0:
            # precision after each position, kept only where a hit occurred
            running_precision = np.cumsum(hit_flags) / (1 + np.arange(len(hit_flags)))
            return np.sum(running_precision * hit_flags) / len(wanted)
        return 0

    per_user_ap = [average_precision(user, wanted) for user, wanted in truth_by_user.items()]

    return np.round(np.mean(per_user_ap), 4)

## Small Validation Dataset Evaluation

### Run Lenskit ALS

In [24]:
# Fit ALS on the small training split and recommend for every user in the small
# validation split (the recorded run below took roughly 900 seconds).
pred = ALS_lenskit(train, val)

BLAS using multiple threads - can cause oversubscription
found 1 potential runtime problems - see https://boi.st/lkpy-perf


total elapsed time:  900.0470471382141


In [25]:
# Print the recommendations produced for the small validation users.
print(pred)

          item     score   user  rank
0       165231  1.280896     27     1
1       176194  1.150671     27     2
2       185797  1.141786     27     3
3       248740  1.115174     27     4
4        62633  1.085859     27     5
...        ...       ...    ...   ...
652695  209145  0.651710  21973    96
652696   97510  0.650223  21973    97
652697  186829  0.649740  21973    98
652698  206821  0.648709  21973    99
652699  231384  0.647883  21973   100

[652700 rows x 4 columns]


### Evaluate

In [26]:
# Score the small-split recommendations against the validation interactions
# using mapAtK's default cutoff (k = 100).
map_at_k = mapAtK(pred, val)

In [1]:
# Display the small-split validation score computed in the previous cell.
map_at_k

0.01253


## Large Validation Dataset Evaluation

### Run Lenskit ALS

In [None]:
# Fit ALS on the full training split and recommend for every user in the full
# validation split (the recorded run below took roughly 2,880 seconds).
# This rebinds `pred`, replacing the small-split recommendations.
pred = ALS_lenskit(train_full, val_full)

BLAS using multiple threads - can cause oversubscription
found 1 potential runtime problems - see https://boi.st/lkpy-perf


total elapsed time:  2883.058085203171


In [None]:
# Print the recommendations produced for the full validation users.
print(pred)

          item     score   user  rank
0       336327  0.627314     53     1
1       614884  0.589878     53     2
2       571758  0.557194     53     3
3       384013  0.555655     53     4
4       347973  0.548397     53     5
...        ...       ...    ...   ...
755895  474239  0.631508  22187    96
755896  846201  0.631236  22187    97
755897  875639  0.629932  22187    98
755898  654108  0.628784  22187    99
755899  810591  0.628367  22187   100

[755900 rows x 4 columns]


### Evaluate

In [None]:
# Score the full-split recommendations against the full validation
# interactions using mapAtK's default cutoff (k = 100).
map_at_k = mapAtK(pred, val_full)

In [2]:
# Display the full-split validation score computed in the previous cell.
map_at_k

0.02738


## Large Test Dataset Evaluation

### Run Lenskit ALS

In [None]:
# Fit ALS on the full training split again and recommend for every user in the
# filtered test set, which keeps only items that also occur in training
# (the recorded run below took roughly 2,490 seconds).
pred = ALS_lenskit(train_full, test_filtered)

total elapsed time:  2485.868931055069


In [None]:
# Print the recommendations produced for the test users.
print(pred)

          item     score   user  rank
0       638720  1.506001      1     1
1       877976  1.404588      1     2
2       767927  1.399701      1     3
3       722020  1.395297      1     4
4       538986  1.371902      1     5
...        ...       ...    ...   ...
521195  381021  1.061963  22705    96
521196  695359  1.060480  22705    97
521197  514112  1.058581  22705    98
521198  474509  1.058088  22705    99
521199  619750  1.057959  22705   100

[521200 rows x 4 columns]


### Evaluate

In [None]:
# Score the test recommendations against the filtered test interactions using
# mapAtK's default cutoff (k = 100).
map_at_k = mapAtK(pred, test_filtered)

In [3]:
# Display the test score computed in the previous cell.
map_at_k

0.037489
