In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
base_directory = "/Users/harrywilson/Desktop/DataScienceToolbox/Assessment2Data"

In [3]:
import chardet

file_path = f"{base_directory}/artists.dat"
with open(file_path, "rb") as f:
    result = chardet.detect(f.read(10000))  # Analyze the first 10KB
    print(result)


{'encoding': 'utf-8', 'confidence': 0.87625, 'language': ''}


In [4]:
# Function to load data
def load_data(filename):
    file_path = f"{base_directory}/{filename}"
    return pd.read_csv(file_path, sep="\t", header=0)  # sep="\t" for tab-separated, header=0 means first row is the header

In [5]:
# Load the dataset
artists_file = os.path.join(base_directory, "artists.dat")
tags_file = os.path.join(base_directory, "tags.dat")
user_artists_file = os.path.join(base_directory, "user_artists.dat")
user_taggedartists_file = os.path.join(base_directory, "user_taggedartists.dat")

In [6]:
# Load the datasets
artists = pd.read_csv(artists_file, sep="\t", header=0)
tags = pd.read_csv(tags_file, sep="\t", header=0, encoding="latin-1")
user_artists = pd.read_csv(user_artists_file, sep="\t", header=0)
user_taggedartists = pd.read_csv(user_taggedartists_file, sep="\t", header=0)

In [7]:
# Display a preview of the data
print(artists.head())
print(tags.head())
print(user_artists.head())
print(user_taggedartists.head())

   id               name                                         url  \
0   1       MALICE MIZER       http://www.last.fm/music/MALICE+MIZER   
1   2    Diary of Dreams    http://www.last.fm/music/Diary+of+Dreams   
2   3  Carpathian Forest  http://www.last.fm/music/Carpathian+Forest   
3   4       Moi dix Mois       http://www.last.fm/music/Moi+dix+Mois   
4   5        Bella Morte        http://www.last.fm/music/Bella+Morte   

                                          pictureURL  
0    http://userserve-ak.last.fm/serve/252/10808.jpg  
1  http://userserve-ak.last.fm/serve/252/3052066.jpg  
2  http://userserve-ak.last.fm/serve/252/40222717...  
3  http://userserve-ak.last.fm/serve/252/54697835...  
4  http://userserve-ak.last.fm/serve/252/14789013...  
   tagID           tagValue
0      1              metal
1      2  alternative metal
2      3          goth rock
3      4        black metal
4      5        death metal
   userID  artistID  weight
0       2        51   13883
1       2    

In [8]:
# Join user_taggedartists with tags for tag information
artist_tags_df = user_taggedartists.merge(tags, on="tagID", how="inner")

# Join the result with artists to get artist details and tag names
artist_tags_info_df = artist_tags_df.merge(artists, left_on="artistID", right_on="id")

# Aggregate tags for each artist into a list and remove duplicates
artist_profiles_df = (
    artist_tags_info_df.groupby(["artistID", "name"])["tagValue"]
    .apply(lambda tags: list(set(tags)))  # Remove duplicates and collect tags
    .reset_index()
    .rename(columns={"name": "artist_name", "tagValue": "tags"})
)

# Display artist profiles
print(artist_profiles_df.head())


   artistID        artist_name  \
0         1       MALICE MIZER   
1         2    Diary of Dreams   
2         3  Carpathian Forest   
3         4       Moi dix Mois   
4         5        Bella Morte   

                                                tags  
0  [gothic, jrock, j-rock, weeabo, japanese, visu...  
1  [gothic rock, true goth emo, german, ambient, ...  
2  [true norwegian black metal, saxophones, black...  
3  [japanese, gothic, rock, j-rock, gothic japane...  
4  [gothic rock, gothic, darkwave, deathrock, cov...  


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert tags into strings (required for CountVectorizer)
artist_profiles_df["tag_text"] = artist_profiles_df["tags"].apply(lambda tags: " ".join(tags))

# Initialize CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda x:x.split())


# Fit and transform the tag_text column
tag_vectors = vectorizer.fit_transform(artist_profiles_df["tag_text"])

# Convert the sparse matrix to a dense DataFrame for better visualization
tag_vectors_df = pd.DataFrame(tag_vectors.toarray(), columns=vectorizer.get_feature_names_out())

# Add artist information for context
vectorized_artist_profiles_df = pd.concat(
    [artist_profiles_df[["artistID", "artist_name"]], tag_vectors_df], axis=1
)

# Display vectorized features
print(vectorized_artist_profiles_df.head())



   artistID        artist_name  '80s  'n'  -  --  ---  --------  ---king  \
0         1       MALICE MIZER     0    0  0   0    0         0        0   
1         2    Diary of Dreams     0    0  0   0    0         0        0   
2         3  Carpathian Forest     0    0  0   0    0         0        0   
3         4       Moi dix Mois     0    0  0   0    0         0        0   
4         5        Bella Morte     0    0  0   0    0         0        0   

   -angels  ...  zombie  zombieland  zone  zoocore  zooey  zorn  zornish  ztt  \
0        0  ...       0           0     0        0      0     0        0    0   
1        0  ...       0           0     0        0      0     0        0    0   
2        0  ...       0           0     0        0      0     0        0    0   
3        0  ...       0           0     0        0      0     0        0    0   
4        0  ...       0           0     0        0      0     0        0    0   

   zu  ärzte  
0   0      0  
1   0      0  
2   0      

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to encode tags into numerical vectors
tfidf = TfidfVectorizer(max_features=20)  # Limit to 10 features for simplicity
artist_tag_matrix = tfidf.fit_transform(artist_profiles_df["tag_text"])

# Convert the matrix to a DataFrame for easier use
artist_features = pd.DataFrame(
    artist_tag_matrix.toarray(), 
    index=artist_profiles_df["artist_name"], 
    columns=tfidf.get_feature_names_out()
)
print(artist_features.head())


                   80s  alternative  dance  electronic  female  hip  hop  \
artist_name                                                                
MALICE MIZER       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Diary of Dreams    0.0          0.0    0.0    0.486988     0.0  0.0  0.0   
Carpathian Forest  0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Moi dix Mois       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Bella Morte        0.0          0.0    0.0    0.000000     0.0  0.0  0.0   

                   indie      live  love     metal  new  pop  post  punk  \
artist_name                                                                
MALICE MIZER         0.0  0.000000   0.0  0.000000  0.0  0.0   0.0   0.0   
Diary of Dreams      0.0  0.558014   0.0  0.000000  0.0  0.0   0.0   0.0   
Carpathian Forest    0.0  0.000000   0.0  1.000000  0.0  0.0   0.0   0.0   
Moi dix Mois         0.0  0.000000   0.0  0.832669  0.0  0.0   0.0   0.0   
Bella Morte

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Apply MinMaxScaler to normalize the feature vectors
scaler = MinMaxScaler()
scaled_artist_features = pd.DataFrame(
    scaler.fit_transform(artist_features),
    index=artist_features.index,
    columns=artist_features.columns
)

# Display the scaled features
print(scaled_artist_features.head())


                   80s  alternative  dance  electronic  female  hip  hop  \
artist_name                                                                
MALICE MIZER       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Diary of Dreams    0.0          0.0    0.0    0.486988     0.0  0.0  0.0   
Carpathian Forest  0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Moi dix Mois       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Bella Morte        0.0          0.0    0.0    0.000000     0.0  0.0  0.0   

                   indie      live  love     metal  new  pop  post  punk  \
artist_name                                                                
MALICE MIZER         0.0  0.000000   0.0  0.000000  0.0  0.0   0.0   0.0   
Diary of Dreams      0.0  0.558014   0.0  0.000000  0.0  0.0   0.0   0.0   
Carpathian Forest    0.0  0.000000   0.0  1.000000  0.0  0.0   0.0   0.0   
Moi dix Mois         0.0  0.000000   0.0  0.832669  0.0  0.0   0.0   0.0   
Bella Morte

In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming that scaled_artist_features is a DataFrame where the index represents artist IDs
# Step 1: Compute cosine similarity matrix
similarity_matrix = cosine_similarity(scaled_artist_features)

# Step 2: Convert cosine similarity matrix to a DataFrame
artist_ids = scaled_artist_features.index  # Assuming index represents artist IDs
similarities_df = pd.DataFrame(
    similarity_matrix, index=artist_ids, columns=artist_ids
)

# Step 3: Reset the index and melt the DataFrame into long format
similarities_long_df = similarities_df.reset_index()
similarities_long_df = similarities_long_df.melt(
    id_vars=similarities_long_df.columns[0], var_name="artistID_2", value_name="similarity"
)
similarities_long_df.rename(columns={similarities_long_df.columns[0]: "artistID_1"}, inplace=True)

# Step 4: Filter out self-similarities and keep unique pairs
similarities_long_df = similarities_long_df[similarities_long_df["artistID_1"] < similarities_long_df["artistID_2"]]

# Reset the index after filtering
similarities_long_df = similarities_long_df.reset_index(drop=True)

# Display the updated DataFrame
print(similarities_long_df.head())



          artistID_1    artistID_2  similarity
0    Diary of Dreams  MALICE MIZER    0.334686
1  Carpathian Forest  MALICE MIZER    0.000000
2        Bella Morte  MALICE MIZER    1.000000
3        DIR EN GREY  MALICE MIZER    0.706250
4        Combichrist  MALICE MIZER    0.000000


In [13]:
# Input a user ID
input_user_id = 5

# Step 1: Get artists the user has already interacted with
interacted_artists = user_artists[user_artists["userID"] == input_user_id]["artistID"].tolist()

# Step 2: Filter the similarity matrix to exclude artists the user has interacted with
recommendations_filtered_df = similarities_long_df[
    (~similarities_long_df["artistID_1"].isin(interacted_artists)) &  # Artist 1 is not interacted with
    (~similarities_long_df["artistID_2"].isin(interacted_artists))    # Artist 2 is not interacted with
]

# Step 3: Group recommendations by artistID_2 (the recommended artist)
user_recommendations = (
    recommendations_filtered_df.groupby("artistID_2")
    .agg({"similarity": "mean"})  # Calculate average similarity for recommendations
    .rename(columns={"similarity": "avg_similarity"})
    .reset_index()
)

# Ensure data types match before merging
user_recommendations["artistID_2"] = user_recommendations["artistID_2"].astype(str)
artists["id"] = artists["id"].astype(str)

# Step 4: Merge with the artists DataFrame to add artist names
user_recommendations_with_names = pd.merge(
    user_recommendations, 
    artists, 
    left_on="artistID_2", 
    right_on="id", 
    how="inner"
).rename(columns={"name": "artist_name"})

# Step 5: Sort the recommendations by average similarity in descending order
user_recommendations_with_names = user_recommendations_with_names.sort_values(
    by="avg_similarity", ascending=False
)

# Step 6: Display the top recommendations
print(user_recommendations_with_names[["artistID_2", "artist_name", "avg_similarity"]].head(10))


   artistID_2          artist_name  avg_similarity
1       12012     Foxes In Fiction        0.237294
7           3    Carpathian Forest        0.228564
3        1997           Dragonland        0.226423
8         311  Natasha Bedingfield        0.207139
10        883               Tiamat        0.179470
11        999       Thompson Twins        0.102851
4        2002        John Petrucci        0.070631
2        1349     Rhapsody of Fire        0.064027
0         112         Go Koyashiki        0.044043
6        2562              Arcadia        0.011732


In [14]:
# Input a user ID
input_user_id = 5

# Step 1: Get artists the user has already interacted with
interacted_artists = user_artists[user_artists["userID"] == input_user_id]["artistID"].tolist()

# Step 2: Filter the similarity matrix to exclude artists the user has interacted with
recommendations_filtered_df = similarities_long_df[
    ~similarities_long_df["artistID_2"].isin(interacted_artists)
]

# Step 3: Group recommendations by userID and artistID_2
user_recommendations = (
    recommendations_filtered_df.groupby(["artistID_2"])
    .agg({"similarity": "mean"})  # Calculate average similarity for recommendations
    .rename(columns={"similarity": "avg_similarity"})
    .reset_index()
)

# Ensure data types match before merging
user_recommendations["artistID_2"] = user_recommendations["artistID_2"].astype(str)
artists["id"] = artists["id"].astype(str)

# Step 4: Merge with the artists DataFrame to add artist names
user_recommendations_with_names = pd.merge(
    user_recommendations,
    artists,
    left_on="artistID_2",
    right_on="id",
    how="inner"
).rename(columns={"name": "artist_name"})

# Step 5: Sort the recommendations by average similarity in descending order
user_recommendations_with_names = user_recommendations_with_names.sort_values(
    by="avg_similarity", ascending=False
)

# Step 6: Display the top recommendations
print(user_recommendations_with_names[["artistID_2", "artist_name", "avg_similarity"]].head(10))


   artistID_2          artist_name  avg_similarity
1       12012     Foxes In Fiction        0.237294
7           3    Carpathian Forest        0.228564
3        1997           Dragonland        0.226423
8         311  Natasha Bedingfield        0.207139
10        883               Tiamat        0.179470
11        999       Thompson Twins        0.102851
4        2002        John Petrucci        0.070631
2        1349     Rhapsody of Fire        0.064027
0         112         Go Koyashiki        0.044043
6        2562              Arcadia        0.011732
