In [1]:
import cornac
from cornac.eval_methods import RatioSplit
from cornac.metrics import MAE, RMSE, Precision, Recall, NDCG, AUC, MAP
import pandas as pd

In [2]:
df = pd.read_csv(r"C:\Users\jvhua\OneDrive\Desktop\CSE-6242-Group-Project\users-score-2023.csv")

In [3]:
anime_count = df.groupby('user_id')['anime_id'].count()
 
user_ids_to_drop = anime_count[anime_count < 100].index
 
filtered_df = df[~df['user_id'].isin(user_ids_to_drop)]

In [4]:
data = filtered_df[['user_id', 'anime_id', 'rating']].reset_index(drop = True)

In [5]:
data['rating'] = data['rating'].astype("float")

In [6]:
# Convert DataFrame to list of tuples
data_tuples = list(data.itertuples(index=False, name=None))

In [7]:
K = 50  # number of nearest neighbors

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=data_tuples, test_size=0.2, exclude_unknowns=True, verbose=True, seed=123
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 78787
Number of items = 16113
Number of ratings = 15463935
Max rating = 10.0
Min rating = 1.0
Global mean = 7.5
---
Test data:
Number of users = 78787
Number of items = 16113
Number of ratings = 3865577
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 78787
Total items = 16113


In [8]:
# ItemKNN methods
item_knn_cosine = cornac.models.ItemKNN(k=K, similarity="cosine", name="ItemKNN-Cosine")
item_knn_pearson = cornac.models.ItemKNN(
    k=K, similarity="pearson", name="ItemKNN-Pearson"
)
item_knn_adjusted = cornac.models.ItemKNN(
    k=K, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosine"
)

In [12]:
# Put everything together into an experiment
cornac.Experiment(
    eval_method=ratio_split,
    models=[
        item_knn_cosine,
        item_knn_pearson,
        item_knn_adjusted,
    ],
    metrics=[cornac.metrics.RMSE(), cornac.metrics.NDCG(k=5), cornac.metrics.FMeasure(), cornac.metrics.MAP()],
    user_based=True,
).run()


[ItemKNN-Cosine] Training started!


  0%|          | 0/16113 [00:00<?, ?it/s]


[ItemKNN-Cosine] Evaluation started!


Rating:   0%|          | 0/3865577 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78787 [00:00<?, ?it/s]


[ItemKNN-Pearson] Training started!


  0%|          | 0/16113 [00:00<?, ?it/s]


[ItemKNN-Pearson] Evaluation started!


Rating:   0%|          | 0/3865577 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78787 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Training started!


  0%|          | 0/16113 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Evaluation started!


Rating:   0%|          | 0/3865577 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78787 [00:00<?, ?it/s]


TEST:
...
                       |   RMSE |  F1@-1 |    MAP | NDCG@5 | Train (s) |   Test (s)
---------------------- + ------ + ------ + ------ + ------ + --------- + ----------
ItemKNN-Cosine         | 1.3029 | 0.0062 | 0.0138 | 0.0081 |   25.2065 | 18798.9762
ItemKNN-Pearson        | 1.2979 | 0.0062 | 0.0039 | 0.0011 |   27.4492 | 18179.0571
ItemKNN-AdjustedCosine | 1.2045 | 0.0062 | 0.0170 | 0.0163 |   25.6129 | 18498.5183



In [15]:
len(item_knn_adjusted.score(21))

16113

In [7]:
import pickle

In [8]:
with open("my_animelist.pkl", 'rb') as f:
    loaded_list = pickle.load(f)

In [9]:
filter_df = data[data['anime_id'].isin(loaded_list)]

In [10]:
# Convert DataFrame to list of tuples
data_tuples = list(filter_df.itertuples(index=False, name=None))

In [11]:
# Define an evaluation method to split feedback into train and test sets
ratio_split2 = RatioSplit(
    data=data_tuples, test_size=0.2, exclude_unknowns=True, verbose=True, seed=123
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 78775
Number of items = 4971
Number of ratings = 11586111
Max rating = 10.0
Min rating = 1.0
Global mean = 7.6
---
Test data:
Number of users = 78775
Number of items = 4971
Number of ratings = 2896510
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 78775
Total items = 4971


In [26]:
item_knn_adjusted1 = cornac.models.ItemKNN(
    k=20, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosine"
)
item_knn_adjusted2 = cornac.models.ItemKNN(
    k=50, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosine"
)
item_knn_adjusted3 = cornac.models.ItemKNN(
    k=70, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosine"
)
item_knn_adjusted4 = cornac.models.ItemKNN(
    k=100, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosine"
)

In [27]:
# Put everything together into an experiment
cornac.Experiment(
    eval_method=ratio_split2,
    models=[
        item_knn_adjusted1,
        item_knn_adjusted2,
        item_knn_adjusted3,
        item_knn_adjusted4
    ],
    metrics=[cornac.metrics.RMSE(), cornac.metrics.NDCG(k=5), cornac.metrics.FMeasure(), cornac.metrics.MAP()],
    user_based=True,
).run()


[ItemKNN-AdjustedCosine] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosine] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


TEST:
...
                       |   RMSE |  F1@-1 |    MAP | NDCG@5 | Train (s) |  Test (s)
---------------------- + ------ + ------ + ------ + ------ + --------- + ---------
ItemKNN-AdjustedCosine | 1.2135 | 0.0153 | 0.0314 | 0.0657 |    8.7680 | 3244.9936
ItemKNN-AdjustedCosine | 1.2106 | 0.0153 | 0.0315 | 0.0590 |    9.8054 | 3344.5910
ItemKNN-AdjustedCosine | 1.2147 | 0.0153 | 0.0311 | 0.0545 |   10.2209 | 3368.5565
ItemKNN-AdjustedCosine | 1.2205 | 0.0153 | 0.0306 | 0.0492 |    9.2063 | 3432.2060



In [12]:
item_knn_adjustedidf = cornac.models.ItemKNN(
    k=20, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosineidf", weighting= 'idf',
)
item_knn_adjustedbm25 = cornac.models.ItemKNN(
    k=20, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosinebm25", weighting= 'bm25'
)

In [14]:
# Put everything together into an experiment
cornac.Experiment(
    eval_method=ratio_split2,
    models=[
        item_knn_adjustedidf,
        item_knn_adjustedbm25,
    ],
    metrics=[cornac.metrics.RMSE(), cornac.metrics.NDCG(k=5), cornac.metrics.FMeasure(), cornac.metrics.MAP()],
    user_based=True,
).run()


[ItemKNN-AdjustedCosineidf] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosineidf] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosinebm25] Training started!


  0%|          | 0/4971 [00:00<?, ?it/s]


[ItemKNN-AdjustedCosinebm25] Evaluation started!


Rating:   0%|          | 0/2896510 [00:00<?, ?it/s]

Ranking:   0%|          | 0/78748 [00:00<?, ?it/s]


TEST:
...
                           |   RMSE |  F1@-1 |    MAP | NDCG@5 | Train (s) |  Test (s)
-------------------------- + ------ + ------ + ------ + ------ + --------- + ---------
ItemKNN-AdjustedCosineidf  | 1.2135 | 0.0153 | 0.0314 | 0.0657 |    8.9443 | 3465.5658
ItemKNN-AdjustedCosinebm25 | 1.2130 | 0.0153 | 0.0318 | 0.0678 |    9.7931 | 3505.7347



In [None]:
item_knn_adjustedbm25 = cornac.models.ItemKNN(
    k=20, similarity="cosine", mean_centered=True, name="ItemKNN-AdjustedCosinebm25", weighting= 'bm25', num_threads=0
)

In [19]:
ratio_split2.train_set

<cornac.data.dataset.Dataset at 0x1911eaeaca0>