In [1]:
!pip install recbole
!pip install ray
!pip install kmeans-pytorch

Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl.metadata (14 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting texttable>=0.9.0 (from recbole)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: texttable, colorlog, 

#Data preprocessing

In [3]:
import os
import pandas as pd

# Load the raw ratings CSV. The file has no header row, so columns are addressed
# by position here: 0 = user id, 1 = item id, 2 = rating, 3 = timestamp.
# User/item IDs are opaque tokens, so they are read as strings; otherwise
# zero-padded IDs such as "0006428320" could be parsed as integers and lose
# their leading zeros.
file_path = 'ratings_Musical_Instruments.csv'
df = pd.read_csv(file_path, header=None, dtype={0: str, 1: str})

# Rename columns to match RecBole's "<field>:<type>" format
df.columns = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']

# Create the directory structure: dataset/<dataset_name>/<dataset_name>.<suffix>
dataset_name = 'musical_instruments'
output_dir = f'dataset/{dataset_name}'
os.makedirs(output_dir, exist_ok=True)

# Save the interaction file (the timestamp column is intentionally not exported)
output_file_inter = f'{output_dir}/{dataset_name}.inter'
df[['user_id:token', 'item_id:token', 'rating:float']].to_csv(output_file_inter, index=False, sep='\t')

# Create and save the user file (one row per distinct user id)
user_df = pd.DataFrame(df['user_id:token'].unique(), columns=['user_id:token'])
output_file_user = f'{output_dir}/{dataset_name}.user'
user_df.to_csv(output_file_user, index=False, sep='\t')

# Create and save the item file (one row per distinct item id)
item_df = pd.DataFrame(df['item_id:token'].unique(), columns=['item_id:token'])
output_file_item = f'{output_dir}/{dataset_name}.item'
item_df.to_csv(output_file_item, index=False, sep='\t')

# Print out the first few rows of each file for verification.
# IDs are read back as strings so the preview shows exactly what was written.
print("First few rows of the interaction file:")
print(
    pd.read_csv(
        output_file_inter,
        sep='\t',
        dtype={'user_id:token': str, 'item_id:token': str},
    ).head()
)

print("First few rows of the user file:")
print(pd.read_csv(output_file_user, sep='\t', dtype=str).head())

print("First few rows of the item file:")
print(pd.read_csv(output_file_item, sep='\t', dtype=str).head())

First few rows of the interaction file:
    user_id:token item_id:token  rating:float
0  A1YS9MDZP93857    0006428320           3.0
1  A3TS466QBAWB9D    0014072149           5.0
2  A3BUDYITWUSIS7    0041291905           5.0
3  A19K10Z0D2NTZK    0041913574           5.0
4  A14X336IB4JD89    0201891859           1.0
First few rows of the user file:
    user_id:token
0  A1YS9MDZP93857
1  A3TS466QBAWB9D
2  A3BUDYITWUSIS7
3  A19K10Z0D2NTZK
4  A14X336IB4JD89
First few rows of the item file:
  item_id:token
0    0006428320
1    0014072149
2    0041291905
3    0041913574
4    0201891859


#1) Research Questions Focused on Accuracy and Popularity

##1.1) How do different recommendation algorithms (e.g., BPR, ItemKNN) compare in terms of traditional accuracy metrics like Precision, Recall, and NDCG?

Testable Metrics: Precision, Recall, Hit, MRR, NDCG

Approach: Train and evaluate models using these algorithms on your datasets and compare the performance using these accuracy metrics.

In [4]:
from recbole.quick_start import run_recbole

# Recommenders compared on the traditional accuracy metrics
models = ['BPR', 'ItemKNN', 'Pop']

# RecBole settings that are identical for every run in this cell.
# NOTE(review): 'epochs': 1 is a very small training budget; iteratively trained
# models such as BPR have probably not converged — confirm before interpreting results.
shared_settings = {
    'dataset': 'musical_instruments',
    'data_path': 'dataset/',
    'epochs': 1,
    'topk': 10,
    'metrics': ['Precision', 'Recall', 'NDCG', 'Hit', 'MRR'],
}

for model in models:
    print(f"\nTraining and evaluating model: {model}")

    # A fresh dict per run: only the 'model' entry differs between iterations
    config_dict = {'model': model, **shared_settings}

    # Full RecBole pipeline: load data, train, validate, test
    result = run_recbole(config_dict=config_dict)

    print(f"\nEvaluation results for {model}:")
    print(result)




Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 27.31it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1091.49it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1308.97it/s]



Evaluation results for BPR:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0005), ('recall@10', 0.0045), ('ndcg@10', 0.0016), ('hit@10', 0.0045), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('precision@10', 0.0001), ('recall@10', 0.0001), ('ndcg@10', 0.0001), ('hit@10', 0.0007), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 212.82it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 881.02it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 883.42it/s]



Evaluation results for ItemKNN:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0016), ('recall@10', 0.0147), ('ndcg@10', 0.0074), ('hit@10', 0.0159), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('precision@10', 0.002), ('recall@10', 0.0188), ('ndcg@10', 0.01), ('hit@10', 0.0198), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 373.38it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1812.71it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:00<00:00, 1618.16it/s]



Evaluation results for Pop:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0023), ('recall@10', 0.0227), ('ndcg@10', 0.0121), ('hit@10', 0.0227), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('precision@10', 0.0048), ('recall@10', 0.0477), ('ndcg@10', 0.0238), ('hit@10', 0.0477), ('mrr@10', 0.017)])}


##1.1) Results

This evaluation answers the research question by clearly showing how each algorithm performs on traditional accuracy metrics. In this case:

- **Pop** dominates in most accuracy metrics, implying that recommending popular items works well for the "Musical Instruments" dataset.
- **ItemKNN** shows a reasonable balance between accuracy metrics, indicating that similarity-based approaches can offer a balance between personalized and popular recommendations.
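- **BPR** performs the worst. Because every model here is trained with `epochs: 1`, BPR has had only a single pass over the training data and has probably not converged, so this result may reflect under-training rather than a weakness of its personalized pairwise ranking strategy on this dataset.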

These results provide insight into how different recommendation algorithms behave on the dataset and how effective they are in terms of traditional accuracy.


##1.2) What is the relationship between accuracy metrics and the popularity bias in recommendations?

Testable Metrics: AveragePopularity, NDCG

Approach: Analyze the extent to which algorithms favor popular items (measured by AveragePopularity) and how this correlates with their performance on accuracy metrics like NDCG.

In [5]:
# Recommenders compared on accuracy vs. popularity bias
models = ['BPR', 'ItemKNN', 'Pop']

# RecBole settings that are identical for every run in this cell.
# NOTE(review): 'epochs': 1 is a very small training budget; iteratively trained
# models such as BPR have probably not converged — confirm before interpreting results.
shared_settings = {
    'dataset': 'musical_instruments',
    'data_path': 'dataset/',
    'epochs': 1,
    'topk': 10,
    'metrics': ['NDCG', 'AveragePopularity', 'MRR'],
}

for model in models:
    print(f"\nTraining and evaluating model: {model}")

    # A fresh dict per run: only the 'model' entry differs between iterations
    config_dict = {'model': model, **shared_settings}

    # Full RecBole pipeline: load data, train, validate, test
    result = run_recbole(config_dict=config_dict)

    print(f"\nEvaluation results for {model}:")
    print(result)


Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 21.47it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 497.99it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:02<00:00, 454.87it/s]



Evaluation results for BPR:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0016), ('averagepopularity@10', 4.1658), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('ndcg@10', 0.0001), ('averagepopularity@10', 4.1314), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 291.91it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 776.88it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 789.15it/s]



Evaluation results for ItemKNN:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0074), ('averagepopularity@10', 4.0812), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('ndcg@10', 0.01), ('averagepopularity@10', 5.1952), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 359.84it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1319.80it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1335.89it/s]



Evaluation results for Pop:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0121), ('averagepopularity@10', 312.6991), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('ndcg@10', 0.0238), ('averagepopularity@10', 312.253), ('mrr@10', 0.017)])}


##1.2) Results

This evaluation addresses the research question by showing how algorithms balance accuracy (measured by NDCG) and popularity bias (measured by AveragePopularity). Here's the interpretation of the results:

- **Pop** exhibits the highest **AveragePopularity** score, indicating a strong bias towards recommending popular items. However, it also achieves the highest NDCG, showing that recommending popular items can improve accuracy but may limit diversity.
- **ItemKNN** has a more moderate **AveragePopularity** score, suggesting it balances popular and less popular items in recommendations. Its NDCG score is also reasonable, meaning it maintains accuracy while reducing some popularity bias.
- **BPR** has the lowest **AveragePopularity**, implying it tends to recommend less popular items. However, this comes at the cost of accuracy, as seen by its lower NDCG score.

These results show the trade-off between recommending popular items and maintaining accuracy. As algorithms reduce popularity bias (lower AveragePopularity), they tend to sacrifice some accuracy (NDCG).

This illustrates the relationship between accuracy and popularity bias, highlighting how different algorithms prioritize one over the other.


#2) Research Questions Focused on Diversity and Coverage
##2.1) How do recommendation algorithms balance item diversity and coverage while maintaining accuracy?
Testable Metrics: GiniIndex, ItemCoverage, ShannonEntropy, TailPercentage, NDCG

Approach: Measure the diversity (GiniIndex, ShannonEntropy) and coverage (ItemCoverage, TailPercentage) of recommendations and compare them against accuracy metrics (e.g., NDCG) to understand the tradeoffs.


In [6]:
# Recommenders compared on diversity, coverage and accuracy
models = ['BPR', 'ItemKNN', 'Pop']

# RecBole settings that are identical for every run in this cell.
# NOTE(review): 'epochs': 1 is a very small training budget; iteratively trained
# models such as BPR have probably not converged — confirm before interpreting results.
shared_settings = {
    'dataset': 'musical_instruments',
    'data_path': 'dataset/',
    'epochs': 1,
    'topk': 10,
    'metrics': ['NDCG', 'GiniIndex', 'ShannonEntropy', 'ItemCoverage', 'TailPercentage', 'MRR'],
}

for model in models:
    print(f"\nTraining and evaluating model: {model}")

    # A fresh dict per run: only the 'model' entry differs between iterations
    config_dict = {'model': model, **shared_settings}

    # Full RecBole pipeline: load data, train, validate, test
    result = run_recbole(config_dict=config_dict)

    print(f"\nEvaluation results for {model}:")
    print(result)


Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 19.24it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 991.65it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1011.17it/s]



Evaluation results for BPR:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0016), ('giniindex@10', 0.6626), ('shannonentropy@10', 0.003), ('itemcoverage@10', 0.476), ('tailpercentage@10', 0.0946), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('ndcg@10', 0.0001), ('giniindex@10', 0.5315), ('shannonentropy@10', 0.0019), ('itemcoverage@10', 0.7765), ('tailpercentage@10', 0.0925), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 410.43it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1212.29it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1227.01it/s]



Evaluation results for ItemKNN:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0074), ('giniindex@10', 0.9741), ('shannonentropy@10', 0.0081), ('itemcoverage@10', 0.1042), ('tailpercentage@10', 0.488), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('ndcg@10', 0.01), ('giniindex@10', 0.9822), ('shannonentropy@10', 0.0057), ('itemcoverage@10', 0.1287), ('tailpercentage@10', 0.5518), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 346.78it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1561.81it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1128.33it/s]



Evaluation results for Pop:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0121), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1937), ('itemcoverage@10', 0.0022), ('tailpercentage@10', 0.0), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('ndcg@10', 0.0238), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1941), ('itemcoverage@10', 0.0022), ('tailpercentage@10', 0.0), ('mrr@10', 0.017)])}


##2.1) Results

These results show that there is a trade-off between diversity (GiniIndex, ShannonEntropy) and accuracy (NDCG). Pop focuses on popular items with high accuracy but lacks diversity, while ItemKNN provides a better balance between diversity and accuracy. BPR spreads recommendations across more items but sacrifices accuracy.

#3) Increasing Diversity to Reduce Popularity Bias
##**Research Question:** Can increasing the diversity of recommendations reduce the popularity bias, and how does this impact overall accuracy?

Testable Metrics: AveragePopularity, GiniIndex, ShannonEntropy, NDCG

Approach: Use RecBole to train models like BPR, ItemKNN, and Pop on your dataset.
Evaluate the models using the specified metrics to see how diversity (GiniIndex, ShannonEntropy) affects the recommendation of popular items (AveragePopularity) and the accuracy (NDCG).

In [7]:
# List of models to train and evaluate
models = ['BPR', 'ItemKNN', 'Pop']
# Metrics to evaluate for each research question handled by this cell
metrics_list = [
    ['AveragePopularity', 'GiniIndex', 'ShannonEntropy', 'NDCG', 'MRR'],  # For diversity and popularity bias
]

# This cell answers research question 3 of the notebook, so numbering starts at 3;
# starting at 1 labelled the output "Research Question 1" under the section-3 heading.
for i, metrics in enumerate(metrics_list, start=3):
    print(f"\nResearch Question {i}: Evaluating with metrics: {metrics}")

    for model in models:
        print(f"\nTraining and evaluating model: {model}")

        # `dataset_name` is defined in the data-preprocessing cell.
        # NOTE(review): 'epochs': 1 is a very small training budget — confirm it is intended.
        config_dict = {
            'model': model,
            'dataset': dataset_name,
            'data_path': 'dataset/',
            'epochs': 1,
            'topk': 10,
            'metrics': metrics,
        }

        # Full RecBole pipeline: load data, train, validate, test
        result = run_recbole(config_dict=config_dict)
        print(f"\nEvaluation results for {model} on Research Question {i}:")
        print(result)


Research Question 1: Evaluating with metrics: ['AveragePopularity', 'GiniIndex', 'ShannonEntropy', 'NDCG', 'MRR']

Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 27.34it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1003.79it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 952.88it/s]



Evaluation results for BPR on Research Question 1:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('averagepopularity@10', 4.1658), ('giniindex@10', 0.6626), ('shannonentropy@10', 0.003), ('ndcg@10', 0.0016), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('averagepopularity@10', 4.1314), ('giniindex@10', 0.5315), ('shannonentropy@10', 0.0019), ('ndcg@10', 0.0001), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 192.44it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 825.71it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 780.20it/s]



Evaluation results for ItemKNN on Research Question 1:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('averagepopularity@10', 4.0812), ('giniindex@10', 0.9741), ('shannonentropy@10', 0.0081), ('ndcg@10', 0.0074), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('averagepopularity@10', 5.1952), ('giniindex@10', 0.9822), ('shannonentropy@10', 0.0057), ('ndcg@10', 0.01), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 368.49it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1376.15it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1272.74it/s]



Evaluation results for Pop on Research Question 1:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('averagepopularity@10', 312.6991), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1937), ('ndcg@10', 0.0121), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('averagepopularity@10', 312.253), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1941), ('ndcg@10', 0.0238), ('mrr@10', 0.017)])}


##3) Results

This evaluation answers the research question by showing how diversity metrics like GiniIndex and ShannonEntropy influence popularity bias (measured by AveragePopularity) and accuracy (measured by NDCG).

**BPR**:

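- Has the lowest GiniIndex of the three models (0.53 on the test set), meaning its recommendations are spread most evenly across items, together with the lowest reported ShannonEntropy value, but it also has the lowest accuracy (NDCG).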
- The popularity of recommended items is relatively low, meaning it provides a more diverse set of recommendations.

**ItemKNN**:

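- Has a much higher GiniIndex than BPR (0.98 vs. 0.53 on the test set). Because GiniIndex measures how unequally recommendations are distributed over items, a higher value means the recommendations are concentrated on fewer items, i.e. *lower* diversity than BPR. Its AveragePopularity is also slightly higher (5.20 vs. 4.13 on the test set), so it leans somewhat more towards popular items. Its ShannonEntropy value is higher than BPR's as well, but note that Pop, which recommends only 0.22% of the catalogue, has the highest ShannonEntropy of all, so in this setup a higher ShannonEntropy value should not be read as more diverse recommendations.
- Accuracy (NDCG) improves substantially over BPR (0.01 vs. 0.0001 on the test set), so ItemKNN trades some diversity for better accuracy.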

**Pop**:

- As expected, it heavily favors popular items (very high AveragePopularity), with minimal diversity (GiniIndex and ShannonEntropy).
- This results in the highest accuracy (NDCG), but at the cost of very low diversity in recommendations.

In summary, increasing diversity in algorithms like BPR reduces the popularity bias but comes with a tradeoff in accuracy, while ItemKNN offers a middle ground. Pop sacrifices diversity entirely for higher accuracy.


#4) Tradeoff Between Diversity and Accuracy
## **Research Question:** Is there a measurable tradeoff between recommending diverse items and maintaining high recommendation accuracy?

Testable Metrics: Precision, Recall, NDCG, GiniIndex, ShannonEntropy
Approach:
Compare the results of the personalized models (BPR, ItemKNN) against the non-personalized popularity baseline (Pop), which recommends the same few popular items to every user and is therefore expected to show the lowest diversity.
Run your models and evaluate the metrics to see if higher diversity (a lower GiniIndex, i.e. recommendations spread more evenly over items, reported together with ShannonEntropy) leads to a drop in accuracy (Precision, Recall, NDCG).

In [8]:
# List of models to train and evaluate
models = ['BPR', 'ItemKNN', 'Pop']

# Metrics to evaluate for each research question handled by this cell
metrics_list = [
    ['Precision', 'Recall', 'NDCG', 'GiniIndex', 'ShannonEntropy', 'MRR'],  # For diversity and accuracy tradeoff
]

# This cell answers research question 4 of the notebook, so numbering starts at 4;
# starting at 1 labelled the output "Research Question 1" under the section-4 heading.
for i, metrics in enumerate(metrics_list, start=4):
    print(f"\nResearch Question {i}: Evaluating with metrics: {metrics}")

    for model in models:
        print(f"\nTraining and evaluating model: {model}")

        # `dataset_name` is defined in the data-preprocessing cell.
        # NOTE(review): 'epochs': 1 is a very small training budget — confirm it is intended.
        config_dict = {
            'model': model,
            'dataset': dataset_name,
            'data_path': 'dataset/',
            'epochs': 1,
            'topk': 10,
            'metrics': metrics,
        }

        # Full RecBole pipeline: load data, train, validate, test
        result = run_recbole(config_dict=config_dict)
        print(f"\nEvaluation results for {model} on Research Question {i}:")
        print(result)


Research Question 1: Evaluating with metrics: ['Precision', 'Recall', 'NDCG', 'GiniIndex', 'ShannonEntropy', 'MRR']

Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 25.34it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1082.15it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 945.70it/s]



Evaluation results for BPR on Research Question 1:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0005), ('recall@10', 0.0045), ('ndcg@10', 0.0016), ('giniindex@10', 0.6626), ('shannonentropy@10', 0.003), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('precision@10', 0.0001), ('recall@10', 0.0001), ('ndcg@10', 0.0001), ('giniindex@10', 0.5315), ('shannonentropy@10', 0.0019), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 168.51it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 703.74it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 722.03it/s]



Evaluation results for ItemKNN on Research Question 1:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0016), ('recall@10', 0.0147), ('ndcg@10', 0.0074), ('giniindex@10', 0.9741), ('shannonentropy@10', 0.0081), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('precision@10', 0.002), ('recall@10', 0.0188), ('ndcg@10', 0.01), ('giniindex@10', 0.9822), ('shannonentropy@10', 0.0057), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 377.44it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1384.08it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1143.13it/s]



Evaluation results for Pop on Research Question 1:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('precision@10', 0.0023), ('recall@10', 0.0227), ('ndcg@10', 0.0121), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1937), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('precision@10', 0.0048), ('recall@10', 0.0477), ('ndcg@10', 0.0238), ('giniindex@10', 0.9981), ('shannonentropy@10', 0.1941), ('mrr@10', 0.017)])}


##4) Results

This evaluation answers the research question by highlighting the tradeoff between recommending diverse items and maintaining high accuracy.

**BPR**:
- Shows the most evenly distributed recommendations (lowest GiniIndex, 0.53 on the test set) but the lowest accuracy (Precision, Recall, NDCG).
- In this experiment, spreading recommendations across many items goes hand in hand with its low recommendation accuracy.

**ItemKNN**:
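- Is less diverse than BPR: its GiniIndex (0.98 on the test set) is far higher than BPR's (0.53), meaning its recommendations are concentrated on a smaller set of items.
- In exchange, its accuracy metrics (Precision, Recall, NDCG) are clearly better than BPR's, which places it between BPR and Pop on both diversity and accuracy.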

**Pop**:
- Has the least diverse recommendations: its GiniIndex is close to 1 (0.998), meaning almost all recommendations are concentrated on a handful of items.
- Its simplistic approach results in the highest Precision, Recall, and NDCG values, but its focus on popular items results in very limited diversity.

Overall, the tradeoff is evident: models like Pop can achieve higher accuracy by focusing on popular items, while BPR and ItemKNN improve diversity at the expense of some accuracy.


#5) Long-Tail Recommendations
##**Research Question:** How well do traditional recommendation algorithms perform in recommending items from the long tail of the distribution?

Testable Metrics: TailPercentage, ItemCoverage, NDCG

Approach:
Train models like BPR, ItemKNN, and Pop on your dataset.
Use the specified metrics to measure how well each model can recommend long-tail items (TailPercentage, ItemCoverage) and how this affects overall accuracy (NDCG).


In [9]:
# List of models to train and evaluate
models = ['BPR', 'ItemKNN', 'Pop']

# Metrics to evaluate for each research question handled by this cell
metrics_list = [
    ['TailPercentage', 'ItemCoverage', 'NDCG', 'MRR'] # For long-tail recommendations
]

# This cell answers research question 5 of the notebook, so numbering starts at 5;
# starting at 1 labelled the output "Research Question 1" under the section-5 heading.
for i, metrics in enumerate(metrics_list, start=5):
    print(f"\nResearch Question {i}: Evaluating with metrics: {metrics}")

    for model in models:
        print(f"\nTraining and evaluating model: {model}")

        # `dataset_name` is defined in the data-preprocessing cell.
        # NOTE(review): 'epochs': 1 is a very small training budget — confirm it is intended.
        config_dict = {
            'model': model,
            'dataset': dataset_name,
            'data_path': 'dataset/',
            'epochs': 1,
            'topk': 10,
            'metrics': metrics,
        }

        # Full RecBole pipeline: load data, train, validate, test
        result = run_recbole(config_dict=config_dict)
        print(f"\nEvaluation results for {model} on Research Question {i}:")
        print(result)


Research Question 1: Evaluating with metrics: ['TailPercentage', 'ItemCoverage', 'NDCG', 'MRR']

Training and evaluating model: BPR


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|██████████████████████████████████████████████████| 12/12 [00:00<00:00, 26.10it/s]
Evaluate   : 100%|███████████████████████████████████████████████| 441/441 [00:00<00:00, 932.16it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 928.14it/s]



Evaluation results for BPR on Research Question 1:
{'best_valid_score': 0.0007, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('tailpercentage@10', 0.0946), ('itemcoverage@10', 0.476), ('ndcg@10', 0.0016), ('mrr@10', 0.0007)]), 'test_result': OrderedDict([('tailpercentage@10', 0.0925), ('itemcoverage@10', 0.7765), ('ndcg@10', 0.0001), ('mrr@10', 0.0002)])}

Training and evaluating model: ItemKNN


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 377.29it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1068.03it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|█████████████████████████████████████████████| 1362/1362 [00:01<00:00, 984.30it/s]



Evaluation results for ItemKNN on Research Question 1:
{'best_valid_score': 0.0053, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('tailpercentage@10', 0.488), ('itemcoverage@10', 0.1042), ('ndcg@10', 0.0074), ('mrr@10', 0.0053)]), 'test_result': OrderedDict([('tailpercentage@10', 0.5518), ('itemcoverage@10', 0.1287), ('ndcg@10', 0.01), ('mrr@10', 0.0074)])}

Training and evaluating model: Pop


  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|█████████████████████████████████████████████████| 24/24 [00:00<00:00, 356.49it/s]
Evaluate   : 100%|██████████████████████████████████████████████| 441/441 [00:00<00:00, 1338.75it/s]
  checkpoint = torch.load(checkpoint_file, map_location=self.device)
Evaluate   : 100%|████████████████████████████████████████████| 1362/1362 [00:01<00:00, 1156.94it/s]



Evaluation results for Pop on Research Question 1:
{'best_valid_score': 0.0091, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('tailpercentage@10', 0.0), ('itemcoverage@10', 0.0022), ('ndcg@10', 0.0121), ('mrr@10', 0.0091)]), 'test_result': OrderedDict([('tailpercentage@10', 0.0), ('itemcoverage@10', 0.0022), ('ndcg@10', 0.0238), ('mrr@10', 0.017)])}


##5) Results

This evaluation answers the research question by showing how well each algorithm performs in recommending long-tail items and how it impacts overall accuracy.

**BPR**:
- Covers by far the largest share of the catalogue (ItemCoverage of 0.78 on the test set), but only about 9% of its recommendations are long-tail items (TailPercentage of 0.09).
- However, the accuracy (NDCG) remains relatively low, indicating that while it covers more items, it sacrifices accuracy.

**ItemKNN**:
- Performs well in recommending long-tail items with higher TailPercentage but lower ItemCoverage compared to BPR.
- Its accuracy is better than BPR (higher NDCG), showing a more balanced performance between covering long-tail items and maintaining accuracy.

**Pop**:
- Struggles to recommend long-tail items, as shown by the very low TailPercentage and ItemCoverage.
- This focus on popular items results in higher accuracy (NDCG), but at the cost of almost completely ignoring the long-tail.

In summary, BPR and ItemKNN are better suited for long-tail recommendations, while Pop focuses on popular items at the expense of long-tail coverage.
