# Doing Content Based Filtering with No PySpark

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
base_directory = "/Users/harrywilson/Desktop/DataScienceToolbox/Assessment2Data"

In [5]:
import chardet

file_path = f"{base_directory}/artists.dat"
with open(file_path, "rb") as f:
    result = chardet.detect(f.read(10000))  # Analyze the first 10KB
    print(result)


{'encoding': 'utf-8', 'confidence': 0.87625, 'language': ''}


In [6]:
# Function to load data
def load_data(filename):
    file_path = f"{base_directory}/{filename}"
    return pd.read_csv(file_path, sep="\t", header=0)  # sep="\t" for tab-separated, header=0 means first row is the header

In [7]:
# Load the dataset
artists_file = os.path.join(base_directory, "artists.dat")
tags_file = os.path.join(base_directory, "tags.dat")
user_artists_file = os.path.join(base_directory, "user_artists.dat")
user_taggedartists_file = os.path.join(base_directory, "user_taggedartists.dat")

In [8]:
# Load the datasets
artists = pd.read_csv(artists_file, sep="\t", header=0)
tags = pd.read_csv(tags_file, sep="\t", header=0, encoding="latin-1")
user_artists = pd.read_csv(user_artists_file, sep="\t", header=0)
user_taggedartists = pd.read_csv(user_taggedartists_file, sep="\t", header=0)

In [9]:
# Display a preview of the data
print(artists.head())
print(tags.head())
print(user_artists.head())
print(user_taggedartists.head())

   id               name                                         url  \
0   1       MALICE MIZER       http://www.last.fm/music/MALICE+MIZER   
1   2    Diary of Dreams    http://www.last.fm/music/Diary+of+Dreams   
2   3  Carpathian Forest  http://www.last.fm/music/Carpathian+Forest   
3   4       Moi dix Mois       http://www.last.fm/music/Moi+dix+Mois   
4   5        Bella Morte        http://www.last.fm/music/Bella+Morte   

                                          pictureURL  
0    http://userserve-ak.last.fm/serve/252/10808.jpg  
1  http://userserve-ak.last.fm/serve/252/3052066.jpg  
2  http://userserve-ak.last.fm/serve/252/40222717...  
3  http://userserve-ak.last.fm/serve/252/54697835...  
4  http://userserve-ak.last.fm/serve/252/14789013...  
   tagID           tagValue
0      1              metal
1      2  alternative metal
2      3          goth rock
3      4        black metal
4      5        death metal
   userID  artistID  weight
0       2        51   13883
1       2    

In [10]:
# Join user_taggedartists with tags for tag information
artist_tags_df = user_taggedartists.merge(tags, on="tagID", how="inner")

# Join the result with artists to get artist details and tag names
artist_tags_info_df = artist_tags_df.merge(artists, left_on="artistID", right_on="id")

# Aggregate tags for each artist into a list and remove duplicates
artist_profiles_df = (
    artist_tags_info_df.groupby(["artistID", "name"])["tagValue"]
    .apply(lambda tags: list(set(tags)))  # Remove duplicates and collect tags
    .reset_index()
    .rename(columns={"name": "artist_name", "tagValue": "tags"})
)

# Display artist profiles
print(artist_profiles_df.head())


   artistID        artist_name  \
0         1       MALICE MIZER   
1         2    Diary of Dreams   
2         3  Carpathian Forest   
3         4       Moi dix Mois   
4         5        Bella Morte   

                                                tags  
0  [jrock, j-rock, weeabo, visual kei, better tha...  
1  [gothic rock, darkwave, seen live, dark, vocal...  
2  [saxophones, black metal, norwegian black meta...  
3  [metal, bazarov, j-rock, visual kei, gothic ja...  
4  [gothic rock, darkwave, covers, deathrock, got...  


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert tags into strings (required for CountVectorizer)
artist_profiles_df["tag_text"] = artist_profiles_df["tags"].apply(lambda tags: " ".join(tags))

# Initialize CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda x:x.split())


# Fit and transform the tag_text column
tag_vectors = vectorizer.fit_transform(artist_profiles_df["tag_text"])

# Convert the sparse matrix to a dense DataFrame for better visualization
tag_vectors_df = pd.DataFrame(tag_vectors.toarray(), columns=vectorizer.get_feature_names_out())

# Add artist information for context
vectorized_artist_profiles_df = pd.concat(
    [artist_profiles_df[["artistID", "artist_name"]], tag_vectors_df], axis=1
)

# Display vectorized features
print(vectorized_artist_profiles_df.head())



   artistID        artist_name  '80s  'n'  -  --  ---  --------  ---king  \
0         1       MALICE MIZER     0    0  0   0    0         0        0   
1         2    Diary of Dreams     0    0  0   0    0         0        0   
2         3  Carpathian Forest     0    0  0   0    0         0        0   
3         4       Moi dix Mois     0    0  0   0    0         0        0   
4         5        Bella Morte     0    0  0   0    0         0        0   

   -angels  ...  zombie  zombieland  zone  zoocore  zooey  zorn  zornish  ztt  \
0        0  ...       0           0     0        0      0     0        0    0   
1        0  ...       0           0     0        0      0     0        0    0   
2        0  ...       0           0     0        0      0     0        0    0   
3        0  ...       0           0     0        0      0     0        0    0   
4        0  ...       0           0     0        0      0     0        0    0   

   zu  ärzte  
0   0      0  
1   0      0  
2   0      

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to encode tags into numerical vectors
tfidf = TfidfVectorizer(max_features=20)  # Limit to 10 features for simplicity
artist_tag_matrix = tfidf.fit_transform(artist_profiles_df["tag_text"])

# Convert the matrix to a DataFrame for easier use
artist_features = pd.DataFrame(
    artist_tag_matrix.toarray(), 
    index=artist_profiles_df["artist_name"], 
    columns=tfidf.get_feature_names_out()
)
print(artist_features.head())


                   80s  alternative  dance  electronic  female  hip  hop  \
artist_name                                                                
MALICE MIZER       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Diary of Dreams    0.0          0.0    0.0    0.486981     0.0  0.0  0.0   
Carpathian Forest  0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Moi dix Mois       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Bella Morte        0.0          0.0    0.0    0.000000     0.0  0.0  0.0   

                   indie      live  love     metal  new  pop  post  punk  \
artist_name                                                                
MALICE MIZER         0.0  0.000000   0.0  0.000000  0.0  0.0   0.0   0.0   
Diary of Dreams      0.0  0.558006   0.0  0.000000  0.0  0.0   0.0   0.0   
Carpathian Forest    0.0  0.000000   0.0  1.000000  0.0  0.0   0.0   0.0   
Moi dix Mois         0.0  0.000000   0.0  0.832639  0.0  0.0   0.0   0.0   
Bella Morte

In [35]:
from sklearn.preprocessing import MinMaxScaler

# Apply MinMaxScaler to normalize the feature vectors
scaler = MinMaxScaler()
scaled_artist_features = pd.DataFrame(
    scaler.fit_transform(artist_features),
    index=artist_features.index,
    columns=artist_features.columns
)

# Display the scaled features
print(scaled_artist_features.head())


                   80s  alternative  dance  electronic  female  hip  hop  \
artist_name                                                                
MALICE MIZER       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Diary of Dreams    0.0          0.0    0.0    0.486981     0.0  0.0  0.0   
Carpathian Forest  0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Moi dix Mois       0.0          0.0    0.0    0.000000     0.0  0.0  0.0   
Bella Morte        0.0          0.0    0.0    0.000000     0.0  0.0  0.0   

                   indie      live  love     metal  new  pop  post  punk  \
artist_name                                                                
MALICE MIZER         0.0  0.000000   0.0  0.000000  0.0  0.0   0.0   0.0   
Diary of Dreams      0.0  0.558006   0.0  0.000000  0.0  0.0   0.0   0.0   
Carpathian Forest    0.0  0.000000   0.0  1.000000  0.0  0.0   0.0   0.0   
Moi dix Mois         0.0  0.000000   0.0  0.832639  0.0  0.0   0.0   0.0   
Bella Morte

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import pandas as pd

# Convert the scaled features to a sparse matrix
scaled_features_sparse = csr_matrix(scaled_artist_features.values)

# Compute pairwise cosine similarities (sparse matrix version)
cosine_similarities_sparse = cosine_similarity(scaled_features_sparse, dense_output=False)

# Extract non-zero similarity values (only upper triangle to avoid duplicates)
artist_indices = cosine_similarities_sparse.nonzero()
similarities = cosine_similarities_sparse.data

# Create a DataFrame for better readability
similarities_df = pd.DataFrame({
    "artistID_1": artist_indices[0],
    "artistID_2": artist_indices[1],
    "similarity": similarities
})

# Filter out self-similarities and duplicate pairs
similarities_df = similarities_df[similarities_df["artistID_1"] < similarities_df["artistID_2"]]

# Optionally map indices back to artist names
similarities_df["artistID_1"] = similarities_df["artistID_1"].map(lambda x: scaled_artist_features.index[x])
similarities_df["artistID_2"] = similarities_df["artistID_2"].map(lambda x: scaled_artist_features.index[x])

# Display the top 5 results
print(similarities_df)
print(similarities_df.sort_values(by="similarity", ascending=False).head())


             artistID_1      artistID_2  similarity
0          MALICE MIZER      Apollo 440    0.501864
1          MALICE MIZER       LOSTFREEQ    0.293470
2          MALICE MIZER         Epstein    0.350454
3          MALICE MIZER       Blackfoot    1.000000
4          MALICE MIZER  Stealers Wheel    0.563057
...                 ...             ...         ...
34471620      LOSTFREEQ    Oz Alchemist    0.384612
34472286      LOSTFREEQ      Apollo 440    0.498530
34474005  Ciccone Youth    Oz Alchemist    0.379009
34474006  Ciccone Youth      Apollo 440    0.407239
34480745     Apollo 440    Oz Alchemist    0.745619

[17238166 rows x 3 columns]
             artistID_1   artistID_2  similarity
6287985            Нюша          M2M         1.0
2357660       Lacrimosa       Hunter         1.0
29318593      Vitamin C          M2M         1.0
17474620  Handsome Furs  Paper Route         1.0
17164325         Realms           伍佰         1.0


In [42]:
import pandas as pd


# Step 1: Join user interactions with artist details
user_artist_df = pd.merge(user_artists, artists, left_on="artistID", right_on="id")[["userID", "artistID"]]

# Step 2: Join user interactions with artist similarity data
user_recommendations = pd.merge(
    user_artist_df,
    similarities_df,
    left_on="artistID",
    right_on="artistID_1"
)

# Step 3: Group by userID and artistID_2, calculate the average similarity
user_recommendations_agg = (
    user_recommendations
    .groupby(["userID", "artistID_2"], as_index=False)
    .agg(avg_similarity=("similarity", "mean"))
)

# Step 4: Sort recommendations by userID and average similarity
user_recommendations_sorted = user_recommendations_agg.sort_values(by=["userID", "avg_similarity"], ascending=[True, False])

# Display the top 50 recommendations
print(user_recommendations_sorted.head(50))


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [43]:
# Convert the artistID columns to the same data type (str is safer in this case)
user_artists["artistID"] = user_artists["artistID"].astype(str)
artists["id"] = artists["id"].astype(str)
similarities_df["artistID_1"] = similarities_df["artistID_1"].astype(str)
similarities_df["artistID_2"] = similarities_df["artistID_2"].astype(str)

# Step 1: Join user interactions with artist details
user_artist_df = pd.merge(user_artists, artists, left_on="artistID", right_on="id")[["userID", "artistID"]]

# Step 2: Join user interactions with artist similarity data
user_recommendations = pd.merge(
    user_artist_df,
    similarities_df,
    left_on="artistID",
    right_on="artistID_1"
)

# Step 3: Group by userID and artistID_2, calculate the average similarity
user_recommendations_agg = (
    user_recommendations
    .groupby(["userID", "artistID_2"], as_index=False)
    .agg(avg_similarity=("similarity", "mean"))
)

# Step 4: Sort recommendations by userID and average similarity
user_recommendations_sorted = user_recommendations_agg.sort_values(by=["userID", "avg_similarity"], ascending=[True, False])

# Display the top 50 recommendations
print(user_recommendations_sorted.head(50))


     userID             artistID_2  avg_similarity
8         3       Antonio Banderas        1.000000
15        3              Bakkushan        1.000000
46        3            Electrovamp        1.000000
56        3                Grenada        1.000000
74        3          KT Wanderlust        1.000000
79        3  Les Rythmes Digitales        1.000000
85        3              Magic Box        1.000000
88        3           Michael Mind        1.000000
93        3               Monolith        1.000000
94        3            Moonbootica        1.000000
95        3                Moondog        1.000000
96        3              Mr. Vegas        1.000000
111       3        Plácido Domingo        1.000000
114       3            Rex the Dog        1.000000
133       3          Steve Angello        1.000000
147       3               Tim Berg        1.000000
158       3                  athas        1.000000
162       3                  ravex        1.000000
32        3           Culture B

In [46]:
# Input a user ID
input_user_id = 2100

# Step 1: Get artists the user has already interacted with
interacted_artists = user_artists[user_artists["userID"] == input_user_id]["artistID"].tolist()

# Step 2: Generate recommendations for the user
user_recommendations_filtered = (
    user_artist_df[user_artist_df["userID"] == input_user_id]  # Filter interactions for the input user
    .merge(similarities_df, left_on="artistID", right_on="artistID_1")  # Join with similarities
)

# Step 3: Exclude already interacted artists
user_recommendations_filtered = user_recommendations_filtered[
    ~user_recommendations_filtered["artistID_2"].isin(interacted_artists)
]

# Step 4: Group by userID and artistID_2, and calculate the average similarity
user_recommendations_agg = (
    user_recommendations_filtered
    .groupby(["userID", "artistID_2"], as_index=False)
    .agg(avg_similarity=("similarity", "mean"))
    .sort_values(by="avg_similarity", ascending=False)  # Sort by similarity
)

# Step 5: Add artist names to the recommendations
user_recommendations_with_names = (
    user_recommendations_agg
    .merge(artists, left_on="artistID_2", right_on="id", how="inner")  # Add artist names
    [["userID", "artistID_2", "name", "avg_similarity"]]  # Select relevant columns
    .rename(columns={"name": "artist_name"})  # Rename the artist name column
    .sort_values(by="avg_similarity", ascending=False)  # Sort again just in case
)

# Display the top 10 recommendations
print(user_recommendations_with_names.head(10))


   userID artistID_2       artist_name  avg_similarity
0    2100      12012  Foxes In Fiction        0.699911
1    2100       1997        Dragonland        0.337469
2    2100        999    Thompson Twins        0.138429
