## Loading Data & Importing Libraries

In [3]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [4]:
# import datasets
artists = pd.read_csv(os.path.join('..','data','artists.dat'), delimiter='\t')
tags = pd.read_csv(os.path.join('..','data','tags.dat'), delimiter='\t',encoding='ISO-8859-1')
user_artists = pd.read_csv(os.path.join('..','data','user_artists.dat'), delimiter='\t')
user_friends = pd.read_csv(os.path.join('..','data','user_friends.dat'), delimiter='\t')
user_taggedartists_timestamps = pd.read_csv(os.path.join('..','data','user_taggedartists-timestamps.dat'), delimiter='\t')
user_taggedartists = pd.read_csv(os.path.join('..','data','user_taggedartists.dat'), delimiter='\t')

## Data Cleaning

In [5]:
# Drop irrelevant columns from the Artists dataset
artists_cleaned = artists.drop(columns=['url', 'pictureURL']).drop_duplicates(keep='first') 

# Drop the irrelevant columns in the Tags dataset
tags_cleaned = tags.drop_duplicates(keep='first') 

# For the User-Artists dataset, we can filter out rows with a weight of 0, as they show no meaningful interaction
# user_artists_cleaned = user_artists[user_artists['weight'] > 0]
user_artists_cleaned = user_artists.drop_duplicates(keep='first') 

# Drop duplicates from the User-Tagged Artists Timestamps dataset
user_taggedartists_timestamps_cleaned = user_taggedartists_timestamps.drop_duplicates(keep='first') 

# Convert timestamps from ms to datetime format
user_taggedartists_timestamps_cleaned['timestamp'] = pd.to_datetime(user_taggedartists_timestamps_cleaned['timestamp'], unit='ms')

# Drop duplicates from the User-Friends dataset
user_friends_cleaned = user_friends.drop_duplicates(keep='first') 

# # Output cleaned datasets for inspection
# print("Cleaned Artists dataset:", artists_cleaned.info(), artists_cleaned.head())
# print("Cleaned Tags dataset:", tags_cleaned.info(), tags_cleaned.head())
# print("Cleaned User-Artists dataset:", user_artists_cleaned.info(), user_artists_cleaned.head())
# print("Cleaned User-Tagged Artists Timestamps dataset:", user_taggedartists_timestamps_cleaned.info(), user_taggedartists_timestamps_cleaned.head())
# print("Cleaned User-Friends dataset:", user_friends_cleaned.info(), user_friends_cleaned.head())

In [6]:
# Create a dictionary to map artistID to artistName
artist_id_to_name = dict(zip(artists['id'], artists['name']))

In [7]:
print(user_artists_cleaned)

       userID  artistID  weight
0           2        51   13883
1           2        52   11690
2           2        53   11351
3           2        54   10300
4           2        55    8983
...       ...       ...     ...
92829    2100     18726     337
92830    2100     18727     297
92831    2100     18728     281
92832    2100     18729     280
92833    2100     18730     263

[92834 rows x 3 columns]


---
## Train-Test Data Split

In [10]:
from sklearn.model_selection import train_test_split

# Ensure user_artists_cleaned has non-zero weights
user_artists_cleaned = user_artists_cleaned[user_artists_cleaned['weight'] > 0]

# Perform global train-test split
train_data, test_data = train_test_split(user_artists_cleaned, test_size=0.2, random_state=42)

# Create train and test matrices (copy the original user_artist_matrix)
train_matrix = user_artist_matrix.copy()
test_matrix = user_artist_matrix.copy()

# Set all non-train interactions in the train matrix to 0
train_matrix.loc[:, :] = 0
for row in train_data.itertuples(index=False):
    train_matrix.loc[row.userID, row.artistID] = row.weight

# Set all non-test interactions in the test matrix to 0
test_matrix.loc[:, :] = 0
for row in test_data.itertuples(index=False):
    test_matrix.loc[row.userID, row.artistID] = row.weight

# Verification
print(f"Train interactions: {train_data.shape[0]}")
print(f"Test interactions: {test_data.shape[0]}")


Train interactions: 74267
Test interactions: 18567


---

## User-based Implementation Testing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to get user-based recommendations
def get_user_based_recommendations(user_id, user_similarity_df, user_artist_matrix, artist_id_to_name, top_n=10, train_matrix=None, test_matrix=None):
    # Print the current train and test data used for this run
    print(f"Training data for user {user_id}:")
    print(train_matrix.loc[user_id])  # Training data for the specific user
    print(f"Test data for user {user_id}:")
    print(test_matrix.loc[user_id])  # Test data for the specific user
    
    # Get the most similar users (excluding the user itself)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]

    recommendations = {}
    for similar_user in similar_users:
        # Get the artists this similar user has interacted with (non-zero values)
        interacted_artists = user_artist_matrix.loc[similar_user][user_artist_matrix.loc[similar_user] > 0].index.tolist()

        for artist in interacted_artists:
            # Only consider artists the target user has not interacted with
            if artist not in user_artist_matrix.loc[user_id][user_artist_matrix.loc[user_id] > 0].index.tolist():
                # Add the artist to recommendations with a score (using the scaled similarity as a weight)
                if artist not in recommendations:
                    recommendations[artist] = user_similarity_df[user_id][similar_user]
                else:
                    # Add the weight of similarity to the current score
                    recommendations[artist] += user_similarity_df[user_id][similar_user]

    # Sort recommendations by score (highest first)
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # Convert artist IDs to names and prepare the final list with IDs, names, and scores
    recommended_artists = [(artist, artist_id_to_name.get(artist, "Unknown"), score) for artist, score in sorted_recommendations[:top_n]]

    return recommended_artists

# Global train-test split (ensure this is applied once at the start)
train_data, test_data = train_test_split(user_artists_cleaned, test_size=0.2, random_state=42)

# Create train and test matrices (copy the original user_artist_matrix)
train_matrix = user_artist_matrix.copy()
test_matrix = user_artist_matrix.copy()

# Set all non-train interactions in the train matrix to 0
train_matrix.loc[:, :] = 0
for row in train_data.itertuples(index=False):
    train_matrix.loc[row.userID, row.artistID] = row.weight

# Set all non-test interactions in the test matrix to 0
test_matrix.loc[:, :] = 0
for row in test_data.itertuples(index=False):
    test_matrix.loc[row.userID, row.artistID] = row.weight

# Verify the train-test splits
print(f"Train data size: {train_data.shape[0]}")
print(f"Test data size: {test_data.shape[0]}")

# Example: Get top 5 user-based recommendations for user with userID=2
user_id = 2
user_based_recommendations = get_user_based_recommendations(user_id, user_similarity_df, user_artist_matrix, artist_id_to_name, top_n=5, train_matrix=train_matrix, test_matrix=test_matrix)

# Display user-based recommendations in the required format
print("Top User-Based Recommendations for User 2:")
for artist_id, artist_name, score in user_based_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")

# Function to evaluate recommendations on test data
def evaluate_recommendations(user_id, user_similarity_df_train, train_matrix, test_matrix, artist_id_to_name, top_n=10):
    recommended_artists = get_user_based_recommendations(user_id, user_similarity_df_train, train_matrix, artist_id_to_name, top_n, train_matrix, test_matrix)

    # Get the actual interacted artists from the test set
    actual_artists = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()

    # Extract recommended artist IDs from the recommendations list
    recommended_artists_ids = [artist_id for artist_id, _, _ in recommended_artists]

    # Precision at K
    precision_at_k = precision_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                                     [1] * len(recommended_artists_ids), average='micro')

    # Recall at K
    recall_at_k = recall_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                               [1] * len(recommended_artists_ids), average='micro')

    # F1 at K
    f1_at_k = f1_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                        [1] * len(recommended_artists_ids), average='micro')

    return precision_at_k, recall_at_k, f1_at_k

# Example: Evaluate recommendations for a user in the test set
precision_at_k, recall_at_k, f1_at_k = evaluate_recommendations(user_id=2, 
                                                                user_similarity_df_train=user_similarity_df_train, 
                                                                train_matrix=train_matrix, 
                                                                test_matrix=test_matrix, 
                                                                artist_id_to_name=artist_id_to_name, 
                                                                top_n=10)

print(f"Precision@K: {precision_at_k}")
print(f"Recall@K: {recall_at_k}")
print(f"F1@K: {f1_at_k}")


Train data size: 74267
Test data size: 18567


NameError: name 'user_similarity_df' is not defined

In [None]:
print(train_matrix.loc[user_id].value_counts())
print(test_matrix.loc[user_id].value_counts())

2
0.0        17584
2382.0         1
2119.0         1
1990.0         1
1972.0         1
1948.0         1
1868.0         1
1792.0         1
1740.0         1
1638.0         1
1594.0         1
1559.0         1
1553.0         1
1519.0         1
1438.0         1
1411.0         1
1407.0         1
1373.0         1
1363.0         1
1342.0         1
1337.0         1
1332.0         1
1330.0         1
2120.0         1
2397.0         1
13883.0        1
2547.0         1
11690.0        1
11351.0        1
10300.0        1
8983.0         1
6152.0         1
5955.0         1
4616.0         1
4147.0         1
3923.0         1
3782.0         1
3735.0         1
3644.0         1
3579.0         1
3312.0         1
3301.0         1
2927.0         1
2720.0         1
2686.0         1
2654.0         1
2619.0         1
2584.0         1
1315.0         1
Name: count, dtype: int64
2
0.0       17630
4337.0        1
1471.0        1
Name: count, dtype: int64


The training and test data is extremely sparse, with a large number of zeroes. This is significantly affecting the performance of our recommendations, hence, we need to use different methods to overcome this.

---

## Item-based Implementation Testing - EDIT

In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Create a user-artist interaction matrix
user_artist_matrix = user_artists_cleaned.pivot(index='userID', columns='artistID', values='weight')
user_artist_matrix = user_artist_matrix.fillna(0)

# Compute cosine similarity between items (artists in this case)
item_similarity = cosine_similarity(user_artist_matrix.T)  # Transpose to compute similarity between artists
item_similarity_df = pd.DataFrame(item_similarity, index=user_artist_matrix.columns, columns=user_artist_matrix.columns)

# Function to get item-based recommendations for a user
def get_item_based_recommendations(user_id, item_similarity_df, user_artist_matrix, artist_id_to_name, top_n=10):
    if user_id not in user_artist_matrix.index:
        raise ValueError(f"user_id {user_id} not found in user_artist_matrix")
    
    # Get the artists the user has interacted with (non-zero values)
    interacted_artists = user_artist_matrix.loc[user_id][user_artist_matrix.loc[user_id] > 0].index.tolist()
    
    recommendations = {}
    
    # For each artist the user has interacted with, find similar artists
    for artist in interacted_artists:
        similar_artists = item_similarity_df[artist].sort_values(ascending=False).index[1:]  # Exclude the artist itself
        
        for similar_artist in similar_artists:
            if similar_artist not in user_artist_matrix.loc[user_id][user_artist_matrix.loc[user_id] > 0].index.tolist():
                if similar_artist not in recommendations:
                    recommendations[similar_artist] = item_similarity_df[artist][similar_artist]
                else:
                    recommendations[similar_artist] += item_similarity_df[artist][similar_artist]
    
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    
    # Convert artist IDs to names using artist_id_to_name
    recommended_artists = [(artist_id_to_name.get(artist_id, "Unknown"), score) for artist_id, score in sorted_recommendations[:top_n]]
    return recommended_artists

# Precision@K for a single user
def precision_at_k_single_user(recommended_artists, actual_artists, k):
    recommended_artists_k = [artist for artist, _ in recommended_artists[:k]]
    relevant_items = set(recommended_artists_k).intersection(set(actual_artists))
    
    if k == 0: 
        return 0
    
    return len(relevant_items) / k

# Function to create a train-test split for each user (80% train, 20% test)
def get_train_test_data(user_artist_matrix, user_id, test_size=0.2):
    user_data = user_artist_matrix.loc[user_id]
    non_zero_interactions = user_data[user_data > 0]
    
    train_data, test_data = train_test_split(non_zero_interactions.index, test_size=test_size)
    
    train_data = user_artist_matrix.loc[user_id, train_data]
    test_data = user_artist_matrix.loc[user_id, test_data]
    
    return train_data, test_data

# Evaluate Precision@K for a single user using train-test split
def evaluate_item_based_recommendations(user_id, item_similarity_df, user_artist_matrix, artist_id_to_name, k=10):
    # Get train and test data for the user
    train_data, test_data = get_train_test_data(user_artist_matrix, user_id)
    
    if len(test_data) == 0:
        print(f"User {user_id} has no interactions in the test set.")
        return None
    
    # Convert test data artist IDs to names
    test_data_artist_names = [artist_id_to_name.get(artist_id, "Unknown") for artist_id in test_data.tolist()]
    
    print(f"Test data for User {user_id}: {test_data_artist_names}")
    
    # Get item-based recommendations for the user
    recommendations = get_item_based_recommendations(user_id, item_similarity_df, user_artist_matrix, artist_id_to_name, top_n=k)
    
    print(f"Recommended artists for User {user_id}: {recommendations}")
    
    recommended_artists = [artist for artist, _ in recommendations]
    
    # Precision@K evaluation
    precision = precision_at_k_single_user(recommendations, test_data_artist_names, k)
    return precision


In [31]:
# Example: Evaluate Precision@K for a specific user using item-based collaborative filtering
user_id = 400
precision = evaluate_item_based_recommendations(user_id, item_similarity_df, user_artist_matrix, artist_id_to_name, k=10)

if precision is not None:
    print(f"\nPrecision@10 for User {user_id}: {precision:.4f}")
else:
    print(f"No test data for User {user_id}")


Test data for User 400: ['Enigma', 'Pleq & Chihiro', 'Café Del Mar', 'Prefab Sprout', 'Napalm Death', 'Icehouse', 'Cock Robin', 'ABC', 'Talk Talk', 'Alicia Keys']
Recommended artists for User 400: [('The Pussycat Dolls', 6.075895826180479), ('Lily Allen', 5.7880441242910905), ('Natasha Bedingfield', 5.588535556689219), ('Jordin Sparks', 5.248566112199366), ('Gwen Stefani', 5.240362225570055), ('Cascada', 4.9372139515057025), ('Cheryl Cole', 4.824498405678474), ('Karl Wolf', 4.709049898458179), ('Outlandish', 4.709049898458179), ('Cameron Cartio', 4.709049898458179)]

Precision@10 for User 400: 0.0000


---

## SVD Method Implementation Testing

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Ensure user_artists_cleaned has non-zero weights
user_artists_cleaned = user_artists_cleaned[user_artists_cleaned['weight'] > 0]

# Perform global train-test split
train_data, test_data = train_test_split(user_artists_cleaned, test_size=0.2, random_state=42)

# Create train and test matrices (copy the original user_artist_matrix)
train_matrix = user_artist_matrix.copy()
test_matrix = user_artist_matrix.copy()

# Set all non-train interactions in the train matrix to 0
train_matrix.loc[:, :] = 0
for row in train_data.itertuples(index=False):
    train_matrix.loc[row.userID, row.artistID] = row.weight

# Set all non-test interactions in the test matrix to 0
test_matrix.loc[:, :] = 0
for row in test_data.itertuples(index=False):
    test_matrix.loc[row.userID, row.artistID] = row.weight

# Verification
print(f"Train interactions: {train_data.shape[0]}")
print(f"Test interactions: {test_data.shape[0]}")

# Function to get SVD-based recommendations
def get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=10, n_components=50):
    # Apply SVD to the user-artist matrix
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    svd_matrix = svd.fit_transform(user_artist_matrix)
    svd_components = svd.components_

    # Reconstruct the user-artist interaction matrix
    reconstructed_matrix = np.dot(svd_matrix, svd_components)
    
    recommendations = {}
    
    # Ensure user_id is within the valid range (2 to 1892)
    if user_id < 2 or user_id > user_artist_matrix.shape[0] + 1:
        raise ValueError(f"User ID {user_id} is out of bounds for the user_artist_matrix.")
    
    # Get the user's interaction vector from the reconstructed matrix (adjust for zero-based index)
    reconstructed_user_vector = reconstructed_matrix[user_id - 2]  # User IDs start at 2, so subtract 2
    
    # Iterate through all artists to recommend
    for i, score in enumerate(reconstructed_user_vector):
        # Check if the artist has been interacted with (score > 0) and if the artist ID is valid
        if user_artist_matrix.iloc[user_id - 2, i] == 0:  # Ensure we only recommend non-interacted artists
            artist_id = i  # The index of the artist in the matrix
            if artist_id not in recommendations:
                recommendations[artist_id] = score
            else:
                recommendations[artist_id] += score
    
    # Sort recommendations by score (highest first)
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    
    # Convert artist IDs to names using the artist_id_to_name mapping
    recommended_artists = [(artist_id, artist_id_to_name.get(artist_id, "Unknown"), score)
                           for artist_id, score in sorted_recommendations[:top_n]]
    
    return recommended_artists

# Function to evaluate SVD-based recommendations on test data
def evaluate_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=10, n_components=50):
    recommended_artists = get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n, n_components)

    # Get the actual interacted artists from the test set
    actual_artists = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()

    # Extract recommended artist IDs from the recommendations list
    recommended_artists_ids = [artist_id for artist_id, _, _ in recommended_artists]

    # Precision at K
    precision_at_k = precision_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                                     [1] * len(recommended_artists_ids), average='micro')

    # Recall at K
    recall_at_k = recall_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                               [1] * len(recommended_artists_ids), average='micro')

    # F1 at K
    f1_at_k = f1_score([1 if artist in actual_artists else 0 for artist in recommended_artists_ids],
                        [1] * len(recommended_artists_ids), average='micro')

    return precision_at_k, recall_at_k, f1_at_k

# Example: Get top 5 SVD-based recommendations for user with userID=2
user_id = 2
svd_recommendations = get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5)

# Display SVD-based recommendations
print("\nTop SVD-Based Recommendations for User 2:")
for artist_id, artist_name, score in svd_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")

# Example: Evaluate recommendations for a user in the test set
precision_at_k, recall_at_k, f1_at_k = evaluate_recommendations(user_id=2, 
                                                                user_artist_matrix=user_artist_matrix, 
                                                                artist_id_to_name=artist_id_to_name, 
                                                                top_n=10, 
                                                                n_components=50)

# Print actual and recommended artists to debug
actual_artists = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()
print(f"Actual artists interacted by user {user_id}: {actual_artists}")

recommended_artists = get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=10)
print(f"Recommended artists for user {user_id}: {recommended_artists}")

print(f"Precision@K: {precision_at_k}")
print(f"Recall@K: {recall_at_k}")
print(f"F1@K: {f1_at_k}")


Train interactions: 74267
Test interactions: 18567

Top SVD-Based Recommendations for User 2:
Artist ID: 3464, Artist: Counting Crows, Similarity Score: 2346.16
Artist ID: 1089, Artist: Suede, Similarity Score: 1826.24
Artist ID: 259, Artist: 9th Wonder, Similarity Score: 1581.01
Artist ID: 153, Artist: De/Vision, Similarity Score: 1536.43
Artist ID: 992, Artist: Chris Rea, Similarity Score: 1110.52
Actual artists interacted by user 2: [59, 90]
Recommended artists for user 2: [(3464, 'Counting Crows', 2346.1635801737975), (1089, 'Suede', 1826.2352731085477), (259, '9th Wonder', 1581.0064805606642), (153, 'De/Vision', 1536.4340448821988), (992, 'Chris Rea', 1110.5230835462432), (1496, 'Amsterdam Guitar Trio', 962.6347429798363), (469, 'Nick Carter', 947.2787227982989), (7594, 'Majek Fashek', 927.5537243285531), (222, 'Modest Mouse', 881.6089335976576), (4229, 'Адаптация Пчёл', 839.9796343081118)]
Precision@K: 0.0
Recall@K: 0.0
F1@K: 0.0


---
## Evaluation Based On Similarity
The data is very sparse, hecnce, metrics we were using before will likely 0 for most of the recommendations. Thus, we will evaluate our data using the average similarity score for the first 20 recommendations.

---