## Loading Data & Importing Libraries

In [2]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import implicit

In [3]:
# import datasets
artists = pd.read_csv(os.path.join('..','data','artists.dat'), delimiter='\t')
tags = pd.read_csv(os.path.join('..','data','tags.dat'), delimiter='\t',encoding='ISO-8859-1')
user_artists = pd.read_csv(os.path.join('..','data','user_artists.dat'), delimiter='\t')
user_friends = pd.read_csv(os.path.join('..','data','user_friends.dat'), delimiter='\t')
user_taggedartists_timestamps = pd.read_csv(os.path.join('..','data','user_taggedartists-timestamps.dat'), delimiter='\t')
user_taggedartists = pd.read_csv(os.path.join('..','data','user_taggedartists.dat'), delimiter='\t')

## Data Cleaning

In [4]:
# Drop irrelevant columns from the Artists dataset
artists_cleaned = artists.drop(columns=['url', 'pictureURL']).drop_duplicates(keep='first') 

# Drop the irrelevant columns in the Tags dataset
tags_cleaned = tags.drop_duplicates(keep='first') 

# For the User-Artists dataset, we can filter out rows with a weight of 0, as they show no meaningful interaction
# user_artists_cleaned = user_artists[user_artists['weight'] > 0]
user_artists_cleaned = user_artists.drop_duplicates(keep='first') 

# Drop duplicates from the User-Tagged Artists Timestamps dataset
user_taggedartists_timestamps_cleaned = user_taggedartists_timestamps.drop_duplicates(keep='first') 

# Convert timestamps from ms to datetime format
user_taggedartists_timestamps_cleaned['timestamp'] = pd.to_datetime(user_taggedartists_timestamps_cleaned['timestamp'], unit='ms')

# Drop duplicates from the User-Friends dataset
user_friends_cleaned = user_friends.drop_duplicates(keep='first') 

# # Output cleaned datasets for inspection
print("Cleaned Artists dataset:", artists_cleaned.info(), artists_cleaned.head())
print("Cleaned Tags dataset:", tags_cleaned.info(), tags_cleaned.head())
print("Cleaned User-Artists dataset:", user_artists_cleaned.info(), user_artists_cleaned.head())
print("Cleaned User-Tagged Artists Timestamps dataset:", user_taggedartists_timestamps_cleaned.info(), user_taggedartists_timestamps_cleaned.head())
print("Cleaned User-Friends dataset:", user_friends_cleaned.info(), user_friends_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17632 entries, 0 to 17631
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17632 non-null  int64 
 1   name    17632 non-null  object
dtypes: int64(1), object(1)
memory usage: 275.6+ KB
Cleaned Artists dataset: None    id               name
0   1       MALICE MIZER
1   2    Diary of Dreams
2   3  Carpathian Forest
3   4       Moi dix Mois
4   5        Bella Morte
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11946 entries, 0 to 11945
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tagID     11946 non-null  int64 
 1   tagValue  11946 non-null  object
dtypes: int64(1), object(1)
memory usage: 186.8+ KB
Cleaned Tags dataset: None    tagID           tagValue
0      1              metal
1      2  alternative metal
2      3          goth rock
3      4        black metal
4      5        death metal
<class 'p

---

# Collaborative Filtering

## Theory

............

............

............


In [4]:
# Create a dictionary to map artistID to artistName
artist_id_to_name = dict(zip(artists['id'], artists['name']))

## Prepare Data for Model Implementation

In [5]:
# Create a user-artist interaction matrix using the user_artists_cleaned dataset
user_artist_matrix = user_artists_cleaned.pivot(index='userID', columns='artistID', values='weight')

# Fill NaN values with 0s (assuming binary or implicit feedback, i.e., 1 for interaction, 0 for no interaction)
user_artist_matrix = user_artist_matrix.fillna(0)

print(user_artist_matrix)

artistID  1      2      3      4      5      6      7      8      9      \
userID                                                                    
2           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
5           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
6           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
...         ...    ...    ...    ...    ...    ...    ...    ...    ...   
2095        0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2096        0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2097        0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2099        0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2100        0.0    0.0  408.0    0.0    0.0  404.0    0.0    0.0    0.0   

artistID  10     ...  18

## User-Based Implementation
EXPLANATION

In [6]:
# Compute the cosine similarity between users
user_similarity = cosine_similarity(user_artist_matrix)

# Convert the similarity matrix into a DataFrame for easy inspection
user_similarity_df = pd.DataFrame(user_similarity, index=user_artist_matrix.index, columns=user_artist_matrix.index)

# # Display a portion of the user similarity matrix
# print(user_similarity_df.head())

In [7]:
# Function to get user-based recommendations with the desired output format
def get_user_based_recommendations(user_id, user_similarity_df, user_artist_matrix, artist_id_to_name, top_n=10):
    # Check if the user_id exists in user_similarity_df
    if user_id not in user_similarity_df.columns:
        raise ValueError(f"user_id {user_id} not found in the user_similarity_df columns")
    
    # Get the most similar users (excluding the user itself)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]

    recommendations = {}
    for similar_user in similar_users:
        # Get the artists this similar user has interacted with (non-zero values)
        interacted_artists = user_artist_matrix.loc[similar_user][user_artist_matrix.loc[similar_user] > 0].index.tolist()

        for artist in interacted_artists:
            # Only consider artists the target user has not interacted with
            if artist not in user_artist_matrix.loc[user_id][user_artist_matrix.loc[user_id] > 0].index.tolist():
                # Add the artist to recommendations with a score (using the scaled similarity as a weight)
                if artist not in recommendations:
                    recommendations[artist] = user_similarity_df[user_id][similar_user]
                else:
                    # Add the weight of similarity to the current score
                    recommendations[artist] += user_similarity_df[user_id][similar_user]

    # Sort recommendations by score (highest first)
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # Convert artist IDs to names and prepare the final list with IDs, names, and scores
    recommended_artists = [(artist, artist_id_to_name.get(artist, "Unknown"), score) for artist, score in sorted_recommendations[:top_n]]

    return recommended_artists

# Example: Get top 5 user-based recommendations for user with userID=2
user_id = 2
user_based_recommendations = get_user_based_recommendations(user_id, user_similarity_df, user_artist_matrix, artist_id_to_name, top_n=5)

# Display user-based recommendations in the required format
print("Top User-Based Recommendations for User 2:")
for artist_id, artist_name, score in user_based_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")

Top User-Based Recommendations for User 2:
Artist ID: 289, Artist: Britney Spears, Similarity Score: 20.67
Artist ID: 288, Artist: Rihanna, Similarity Score: 20.10
Artist ID: 295, Artist: Beyoncé, Similarity Score: 16.92
Artist ID: 292, Artist: Christina Aguilera, Similarity Score: 16.73
Artist ID: 300, Artist: Katy Perry, Similarity Score: 15.50


In [8]:
# Example: Get top 5 user-based recommendations for user with userID=3
user_id = 3
user_based_recommendations = get_user_based_recommendations(user_id, user_similarity_df, user_artist_matrix, artist_id_to_name, top_n=5)

# Display user-based recommendations in the required format
print("Top User-Based Recommendations for User 3:")
for artist_id, artist_name, score in user_based_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")

Top User-Based Recommendations for User 3:
Artist ID: 757, Artist: Crystal Castles, Similarity Score: 0.25
Artist ID: 603, Artist: Aphex Twin, Similarity Score: 0.25
Artist ID: 1222, Artist: Venetian Snares, Similarity Score: 0.23
Artist ID: 2174, Artist: edIT, Similarity Score: 0.23
Artist ID: 154, Artist: Radiohead, Similarity Score: 0.21


## Item-based Implementation
EXPLANATION

In [9]:
# Compute the cosine similarity between artists (transpose the matrix to compare artists)
artist_similarity = cosine_similarity(user_artist_matrix.T)  # Transpose to compare artists (columns)

# Convert the similarity matrix into a DataFrame for easy inspection
artist_similarity_df = pd.DataFrame(artist_similarity, index=user_artist_matrix.columns, columns=user_artist_matrix.columns)

# Display a portion of the artist similarity matrix
# print(artist_similarity_df.head())

In [10]:
def get_item_based_recommendations(user_id, user_artist_matrix, artist_similarity_df, artist_id_to_name, top_n=10):
    # Get the artists the user has interacted with (non-zero values)
    interacted_artists = user_artist_matrix.loc[user_id][user_artist_matrix.loc[user_id] > 0].index.tolist()
    
    recommendations = {}
    for artist in interacted_artists:
        # Get the most similar artists to the ones the user interacted with
        similar_artists = artist_similarity_df[artist].sort_values(ascending=False).index[1:]  # Exclude the artist itself

        for similar_artist in similar_artists:
            # Add the similar artist to recommendations with a score (using the similarity as a weight)
            if similar_artist not in recommendations:
                recommendations[similar_artist] = artist_similarity_df[artist][similar_artist]
            else:
                recommendations[similar_artist] += artist_similarity_df[artist][similar_artist]

    # Sort recommendations by score (highest first)
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # Convert artist IDs to names using artist_id_to_name
    recommended_artists = [(artist_id, artist_id_to_name.get(artist_id, "Unknown"), score) 
                           for artist_id, score in sorted_recommendations[:top_n]]

    return recommended_artists

In [11]:
# Example: Get top 5 item-based recommendations for user with userID=2
user_id = 2
item_based_recommendations = get_item_based_recommendations(user_id, user_artist_matrix, artist_similarity_df, artist_id_to_name, top_n=5)

# Display item-based recommendations
print("\nTop Item-Based Recommendations for User 2:")
for artist_id, artist_name, score in item_based_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")



Top Item-Based Recommendations for User 2:
Artist ID: 74, Artist: Basia, Similarity Score: 24.97
Artist ID: 92, Artist: Vitamin Z, Similarity Score: 24.97
Artist ID: 79, Artist: Fiction Factory, Similarity Score: 24.97
Artist ID: 87, Artist: Deacon Blue, Similarity Score: 24.97
Artist ID: 60, Artist: Matt Bianco, Similarity Score: 23.97


In [12]:
# Example: Get top 5 item-based recommendations for user with userID=3
user_id = 3
item_based_recommendations = get_item_based_recommendations(user_id, user_artist_matrix, artist_similarity_df, artist_id_to_name, top_n=5)

# Display item-based recommendations
print("\nTop Item-Based Recommendations for User 3:")
for artist_id, artist_name, score in item_based_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")


Top Item-Based Recommendations for User 3:
Artist ID: 134, Artist: Big Brotherz, Similarity Score: 41.77
Artist ID: 131, Artist: Part Timer, Similarity Score: 41.77
Artist ID: 130, Artist: Philippe Lamy, Similarity Score: 41.77
Artist ID: 129, Artist: Aless, Similarity Score: 41.77
Artist ID: 128, Artist: strom noir, Similarity Score: 41.77


---

## Matrix Decomposition Methods

### SVD

In [13]:
def get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=10, n_components=50):
    # Apply SVD to the user-artist matrix
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    svd_matrix = svd.fit_transform(user_artist_matrix)
    svd_components = svd.components_

    # Reconstruct the user-artist interaction matrix
    reconstructed_matrix = np.dot(svd_matrix, svd_components)
    
    recommendations = {}
    
    # Ensure user_id is within the valid range (2 to 1892)
    if user_id < 2 or user_id > user_artist_matrix.shape[0] + 1:
        raise ValueError(f"User ID {user_id} is out of bounds for the user_artist_matrix.")
    
    # Get the user's interaction vector from the reconstructed matrix (adjust for zero-based index)
    reconstructed_user_vector = reconstructed_matrix[user_id - 2]  # User IDs start at 2, so subtract 2
    
    # Iterate through all artists to recommend
    for i, score in enumerate(reconstructed_user_vector):
        # Check if the artist has been interacted with (score > 0) and if the artist ID is valid
        if user_artist_matrix.iloc[user_id - 2, i] == 0:  # Ensure we only recommend non-interacted artists
            artist_id = i  # The index of the artist in the matrix
            if artist_id not in recommendations:
                recommendations[artist_id] = score
            else:
                recommendations[artist_id] += score
    
    # Sort recommendations by score (highest first)
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    
    # Convert artist IDs to names using the artist_id_to_name mapping
    recommended_artists = [(artist_id, artist_id_to_name.get(artist_id, "Unknown"), score)
                           for artist_id, score in sorted_recommendations[:top_n]]
    
    return recommended_artists

In [14]:
# Example: Get top 5 SVD-based recommendations for user with userID=2
user_id = 2
svd_recommendations = get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5)

# Display SVD-based recommendations
print("\nTop SVD-Based Recommendations for User 2:")
for artist_id, artist_name, score in svd_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")


Top SVD-Based Recommendations for User 2:
Artist ID: 3464, Artist: Counting Crows, Similarity Score: 2346.16
Artist ID: 1089, Artist: Suede, Similarity Score: 1826.24
Artist ID: 259, Artist: 9th Wonder, Similarity Score: 1581.01
Artist ID: 153, Artist: De/Vision, Similarity Score: 1536.43
Artist ID: 992, Artist: Chris Rea, Similarity Score: 1110.52


In [15]:
# Example: Get top 5 SVD-based recommendations for user with userID=3
user_id = 3
svd_recommendations = get_svd_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5)

# Display SVD-based recommendations
print("\nTop SVD-Based Recommendations for User 3:")
for artist_id, artist_name, score in svd_recommendations:
    print(f"Artist ID: {artist_id}, Artist: {artist_name}, Similarity Score: {score:.2f}")


Top SVD-Based Recommendations for User 3:
Artist ID: 184, Artist: James Blunt, Similarity Score: 6.19
Artist ID: 148, Artist: The Boats, Similarity Score: 4.82
Artist ID: 1089, Artist: Suede, Similarity Score: 3.29
Artist ID: 151, Artist: Deep Forest, Similarity Score: 2.92
Artist ID: 298, Artist: Lily Allen, Similarity Score: 2.87


#### ALS Using `implicit`

In [16]:
def get_als_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5, factors=50, regularization=0.1, iterations=20):
    # Convert the user-artist matrix to sparse format (csr_matrix)
    sparse_matrix = csr_matrix(user_artist_matrix.values)
    
    # Initialize and train the ALS model
    model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
    model.fit(sparse_matrix)

    # Get the user's interaction vector (row from sparse matrix)
    user_vector = sparse_matrix[user_id]

    # Get top N artist recommendations (returns artist IDs and scores)
    recommendations = model.recommend(user_id, user_vector, N=top_n)

    # Convert artist IDs to artist names using the provided dictionary
    recommended_artists = [(artist_id_to_name[artist_id], score) for artist_id, score in zip(recommendations[0], recommendations[1])]

    return recommended_artists

In [17]:
# Example: Get top 5 ALS-based recommendations for user with userID=2
user_id = 2
als_recommendations = get_als_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5)

# Display ALS-based recommendations
print(f"\nTop ALS-Based Recommendations for User {user_id}:")
for artist_name, score in als_recommendations:
    print(f"Artist: {artist_name}, Predicted Listening Count: {score:.2f}")

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]


Top ALS-Based Recommendations for User 2:
Artist: Chiodos, Predicted Listening Count: 1.37
Artist: Kiko Loureiro, Predicted Listening Count: 1.18
Artist: Talk Talk, Predicted Listening Count: 1.06
Artist: Digital Summer, Predicted Listening Count: 1.04
Artist: Early Man, Predicted Listening Count: 1.01


In [18]:
# Example: Get top 5 ALS-based recommendations for user with userID=3
user_id = 8
als_recommendations = get_als_recommendations(user_id, user_artist_matrix, artist_id_to_name, top_n=5)

# Display ALS-based recommendations
print(f"\nTop ALS-Based Recommendations for User {user_id}:")
for artist_name, score in als_recommendations:
    print(f"Artist: {artist_name}, Predicted Listening Count: {score:.2f}")

  0%|          | 0/20 [00:00<?, ?it/s]


Top ALS-Based Recommendations for User 8:
Artist: Talking Heads, Predicted Listening Count: 1.52
Artist: Parliament, Predicted Listening Count: 1.22
Artist: Jeff Buckley, Predicted Listening Count: 1.17
Artist: Tortoise, Predicted Listening Count: 1.14
Artist: Red Hot Chili Peppers, Predicted Listening Count: 1.14


# PySpark
## Theory
EXPLANATION

## ALS User-based Collaborative Filtering with PySpark

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS

# Start Spark session
spark = SparkSession.builder.appName("CollaborativeFilteringALS").getOrCreate()

# Convert cleaned pandas DataFrames to PySpark DataFrames
artists_spark_df = spark.createDataFrame(artists_cleaned)
user_artists_spark_df = spark.createDataFrame(user_artists_cleaned)

In [22]:
# ALS model setup for user-based collaborative filtering
# ALS model setup
als = ALS(userCol="userID", itemCol="artistID", ratingCol="weight", coldStartStrategy="drop", implicitPrefs=True)

# Fit the ALS model
model = als.fit(user_artists_spark_df)

# Generate recommendations
user_recommendations = model.recommendForAllUsers(5)

In [23]:
# Create a dictionary to map artistID to artistName
artist_id_to_name = {row['id']: row['name'] for row in artists_spark_df.collect()}

# Function to map artistID to artistName and round scores to 2 decimal places
def map_recommendations(user_recommendations):
    def map_row(row):
        recommendations_with_names = [
            (artist_id_to_name.get(rec[0], "Unknown"), round(rec[1], 2)) for rec in row['recommendations']
        ]
        return (row['userID'], recommendations_with_names)

    mapped_recommendations = user_recommendations.rdd.map(map_row).toDF(["userID", "recommendations"])
    return mapped_recommendations

# Apply the artistID to name mapping function
user_recommendations_with_names = map_recommendations(user_recommendations)

# Show the final recommendations with artist names and rounded scores
user_recommendations_with_names.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------------+
|userID|recommendations                                                                                                            |
+------+---------------------------------------------------------------------------------------------------------------------------+
|3     |[{Janelle Monáe, 1.21}, {Erik Satie, 1.19}, {L'Arc~en~Ciel, 1.19}, {Ricardo Arjona, 1.18}, {Camille, 1.17}]                |
|5     |[{The Smiths, 1.1}, {Joy Division, 1.07}, {Radiohead, 1.06}, {The Rolling Stones, 1.06}, {Nine Inch Nails, 1.06}]          |
|6     |[{Darren Hayes, 0.53}, {Mylène Farmer, 0.52}, {Lil' Wayne, 0.49}, {Duffy, 0.49}, {Kanye West, 0.49}]                       |
|12    |[{Kill Hannah, 1.53}, {Omnia, 1.24}, {Stereophonics, 1.21}, {Blutengel, 1.18}, {L'Âme Immortelle, 1.17}]                   |
|13    |[{Lady Gaga, 0.92}, {Glee Cast, 0.9}, {Eminem, 0.9}, {Michael

In [None]:
# Explode the recommendations column into individual rows
exploded_user_recommendations = user_recommendations.withColumn("recommendation", F.explode("recommendations"))

# Extract artistID and rating from the struct fields
exploded_user_recommendations = exploded_user_recommendations.select(
    F.col("userID"),
    F.col("recommendation.artistID").alias("artistID"),  # Extract artistID
    F.col("recommendation.rating").alias("score")        # Extract rating as score
)

# Generate summary statistics for the scores
summary_stats = exploded_user_recommendations.select("score").summary()

# Display the summary statistics
summary_stats.show()

+-------+-------------------+
|summary|              score|
+-------+-------------------+
|  count|               9460|
|   mean| 1.1072712600533408|
| stddev|0.26236249658108024|
|    min|        5.50755E-40|
|    25%|          1.0340639|
|    50%|          1.1207719|
|    75%|          1.2433128|
|    max|          1.9848865|
+-------+-------------------+



## ALS Item-based Collaborative Filtering with PySpark

In [24]:
# ALS model setup for item-based collaborative filtering
# Swap userCol and itemCol for item-based filtering
als = ALS(userCol="artistID", itemCol="userID", ratingCol="weight", coldStartStrategy="drop", implicitPrefs=True)

# Fit the ALS model
model = als.fit(user_artists_spark_df)

# Generate item-based recommendations for each artist (item)
item_recommendations = model.recommendForAllItems(5)

In [25]:
# Apply the artistID to name mapping function
item_recommendations_with_names = map_recommendations(item_recommendations)

# Show the final recommendations with artist names and rounded scores
item_recommendations_with_names.show(truncate=False)

+------+-----------------------------------------------------------------------------------------------------------------------------+
|userID|recommendations                                                                                                              |
+------+-----------------------------------------------------------------------------------------------------------------------------+
|3     |[{Ani DiFranco, 1.21}, {Danny Elfman, 1.09}, {Michael Giacchino, 1.07}, {Gang of Four, 1.06}, {Clint Mansell, 1.04}]         |
|5     |[{She Wants Revenge, 1.13}, {Nouvelle Vague, 1.09}, {Sigur Rós, 1.09}, {The Last Shadow Puppets, 1.08}, {Joy Division, 1.08}]|
|6     |[{Aaliyah, 0.5}, {Whitney Houston, 0.5}, {Mary J. Blige, 0.49}, {Toni Braxton, 0.48}, {Janet Jackson, 0.48}]                 |
|12    |[{Tarja, 1.32}, {Howard Shore, 1.27}, {Creed, 1.22}, {Trapt, 1.22}, {Porcelain and the Tramps, 1.2}]                         |
|13    |[{Lady Gaga, 0.93}, {Evanescence, 0.91}, {Avril

In [31]:
# Explode the recommendations column into individual rows
exploded_item_recommendations = item_recommendations.withColumn("recommendation", F.explode("recommendations"))

# Extract artistID and rating from the struct fields
exploded_item_recommendations = exploded_item_recommendations.select(
    F.col("userID"),
    F.col("recommendation.artistID").alias("artistID"),  # Extract artistID
    F.col("recommendation.rating").alias("score")        # Extract rating as score
)

# Generate summary statistics for the scores
summary_stats = exploded_item_recommendations.select("score").summary()

# Display the summary statistics
summary_stats.show()

+-------+------------------+
|summary|             score|
+-------+------------------+
|  count|              9460|
|   mean|1.1070918623936887|
| stddev| 0.261973729412807|
|    min|     1.8534923E-38|
|    25%|         1.0341512|
|    50%|         1.1195546|
|    75%|         1.2442842|
|    max|         2.1529148|
+-------+------------------+



## Conclusion

---