# Genre Clustering

There are a lot of genres in the Spotify dataset (>5000). We'd like to see if the Spotify Audio features can be used to differentiate between different genres, but to do this we need to know that there is a distinct difference between the genres that we choose to compare. (For example, there is probably a distinct difference between the genres of Dubstep and Classical Piano, but probably less so between Rock and Alternative Rock).

To this end, we are going to attempt to cluster the genres into similar groups, and have a look at how homogenous the audio features are within some of the groups.

## Import Libraries

In [15]:
import pandas as pd
import numpy as np

import json
import time
import sys
import os
from collections import Counter, defaultdict

# import seaborn
# from matplotlib import pyplot as plt

# import sklearn
# from sklearn.cluster import KMeans, SpectralCoclustering, AgglomerativeClustering
# from sklearn.metrics import silhouette_score
# from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
# from sklearn.manifold import TSNE
# import shared_functions as sf
# import clustergrammer2 as cg
# from clustergrammer2 import net
# from scipy.cluster.hierarchy import dendrogram, linkage

### Check for preprocessed Pickle files

In [2]:
load_from_source = True
if os.path.isfile("genre_counts.pkl") and os.path.isfile("sparse_normed_genre_links.pkl"):
    load_from_source = False

## Load Data from Original Source
Skip this if the required Pickle files are present

In [3]:
if load_from_source:
    data = json.load(open("../all_artist_info.json", "r"))
    data_list = []
    for key in data:
        data_list.append((key,
                          data[key]["id"],
                          data[key]["name"],
                          data[key]["followers"],
                          data[key]["popularity"],
                        ", ".join(data[key]["genres"])))
    data_df = pd.DataFrame(data_list)
    data_df.columns = ["orig_id", "id", "name", "followers", "popularity", "genres"]
    data_df

## Get Genre Counts and Co-occurences

In [4]:
if load_from_source:
    genre_counts = {}
    genre_links = {}
    for row in data_df["genres"]:
        genres = row.split(", ")
        if len(genres) > 1:
            for g1 in genres:
                if g1 not in genre_links:
                    genre_links[g1] = {}
                for g2 in genres:
                    if g2 not in genre_links[g1]:
                        genre_links[g1][g2] = 0
                    genre_links[g1][g2] += 1

                if g1 not in genre_counts:
                    genre_counts[g1] = 0
                genre_counts[g1] += 1

    normed_genre_links = {}
    for g1 in genre_links:
        normed_genre_links[g1] = {}
        total_count = genre_links[g1][g1]
        for g2 in genre_links[g1]:
            normed_genre_links[g1][g2] = genre_links[g1][g2]/total_count

    normed_genre_links

## Save Data to Pickle files

### Convert to Sparse Matrix and save

In [5]:
# Convert to DataFrame and fill all NA values with 0 (as that's what they should be)
df = pd.DataFrame(normed_genre_links).fillna(0)

# Convert to a Sparse Array, which makes it much smaller if we save it as a Pickle file
df = df.astype(pd.SparseDtype("float", 0))

# Make the Columns symmetrical to the Index (not values will not be symmetrical, only labels)
df = df[list(df.index)]

# Save the DataFrame as a Pickle file for future use
df.to_pickle("sparse_normed_genre_links.pkl")

df.head()

Unnamed: 0,early us punk,synth punk,pub rock,punk,punk 'n' roll,punk blues,classic rock,permanent wave,rock,skate punk,...,disney russian,mgp,musique traditionnelle comorienne,lagu maluku,mindfulness,tamil worship,classic konkani pop,konkani pop,sound effects,malawian folk
early us punk,1.0,0.030303,0.05303,0.045283,0.016393,0.063694,0.002907,0.009091,0.001773,0.011561,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
synth punk,0.038462,1.0,0.0,0.007547,0.0,0.006369,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pub rock,0.269231,0.0,1.0,0.128302,0.032787,0.10828,0.02907,0.018182,0.01773,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
punk,0.461538,0.060606,0.257576,1.0,0.180328,0.146497,0.011628,0.072727,0.028369,0.624277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
punk 'n' roll,0.076923,0.0,0.030303,0.083019,1.0,0.101911,0.0,0.0,0.0,0.069364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save Genre Counts

In [6]:
pd.Series(genre_counts).to_pickle("genre_counts.pkl")

## Load data from Pickle Files
Just to make sure this runs regardless of where the data comes from

In [7]:
df = pd.read_pickle("sparse_normed_genre_links.pkl")
genre_counts = pd.read_pickle("genre_counts.pkl")

## Check parameters of the Data

Have a quick look at some relevant stats about the data

### Sparsity
What percentage of the DataFrame is just zero-values?

In [8]:
print(f"Sparsity: {round((df == 0).values.flatten().mean() * 100, 4)}%")

Sparsity: 99.5209%


### Number of Connections

In [9]:
connection_list = (df != 0).sum()
print(f"Total number of genres: {len(connection_list)}")
print(f"Maximum number of connections: {connection_list.max()} ({connection_list[connection_list == connection_list.max()].index[0]})")
print(f"Minimum number of connections: {connection_list.min()} (>1 so every genre has at least 1 connection as well as itself)")
print(f"Median number of connections: {connection_list.median()}")
print(f"Mean number of connections: {connection_list.mean()}")

Total number of genres: 5528
Maximum number of connections: 315 (art pop)
Minimum number of connections: 2 (>1 so every genre has at least 1 connection as well as itself)
Median number of connections: 16.0
Mean number of connections: 26.482272069464543


## Manual Clustering
The Co-occurrence matrix is very sparse, which seems to imply that not all genres are actually connected with each other. Can we form clusters by just grouping together any genres that are linked at all, or will that devolve into one large cluster just through a few outliers being linked with everything?

In [10]:
a = set([1, 2, 3])
n = set([3, 4, 5])
d = set([5, 6, 7])
print(set.union(*[a, n, d],[8, 9]))

{1, 2, 3, 4, 5, 6, 7, 8, 9}


In [11]:
def create_genre_groups(gmap):
    groups = []
    
    for base_genre in gmap:
        r_genres = set(gmap[base_genre].keys())
        matches = []
        
        for i in range(len(groups)):
            # If the group has any common elements, group them together
            if not (r_genres.isdisjoint(groups[i])):
                groups[i] = groups[i].union(r_genres)
                matches.append(i)
        
        if len(matches) > 0:
            untouched_groups = [groups[i] for i in range(len(groups)) if i not in matches]
            merged_group = set.union(*[groups[i] for i in matches], r_genres)
            groups = untouched_groups + [merged_group]
        
        else:
            # If no groups with common elements were found, create a new group
            groups.append(r_genres)
        
    return groups

output = create_genre_groups(normed_genre_links)

In [12]:
for g in create_genre_groups(normed_genre_links):
    print(len(g))

3
2
5523


### Testing Co-occurrence Threshold

Okay, that didn't really work. We've ended up splitting off a whole 5 genres, and are left with a central supercluster of 5523 genres. It looks like the genres are generally too interconnected. But maybe we can start cutting some of those connections to try and break this down.

What happens if we apply a threshold (e.g. for two genres to count as "related" their co-occurrence has to be above a certain value)?

In [13]:
total_links = 0
link_count = {i: 0 for i in range(10)}
for g in normed_genre_links:
    for rg in normed_genre_links[g]:
        # Skip comparing to itself
        if g == rg:
            continue
        total_links += 1
        i = 0
        while (i+1)/10 < normed_genre_links[g][rg]:
            i += 1
        link_count[i] += 1
        
for n in link_count:
    link_count[n] = round(link_count[n] / total_links * 100, 2)
    
for val, perc in link_count.items():
    print(f"{val/10} < C <= {(val+1)/10}: {perc}%")

0.0 < C <= 0.1: 80.74%
0.1 < C <= 0.2: 10.2%
0.2 < C <= 0.3: 3.52%
0.3 < C <= 0.4: 2.05%
0.4 < C <= 0.5: 1.27%
0.5 < C <= 0.6: 0.73%
0.6 < C <= 0.7: 0.55%
0.7 < C <= 0.8: 0.41%
0.8 < C <= 0.9: 0.24%
0.9 < C <= 1.0: 0.31%


Okay, this looks more promising. With a large percentage of co-occurrences already at very low rates, we should be able to eliminate most of them by applying a low threshold that doesn't remove too much of the existing clusters.

In [16]:
def apply_threshold(gmap, threshold):
    trimmed_gmap = {}
    for genre in gmap:
        trimmed_gmap[genre] = {}
        for related_genre in gmap[genre]:
            if gmap[genre][related_genre] >= threshold:
                trimmed_gmap[genre][related_genre] = gmap[genre][related_genre]
    return trimmed_gmap


# def get_group_sizes_from_threshold(gmap, t):
#     threshold_coocs = apply_threshold(gmap, t)
#     threshold_groups = get_groups(threshold_coocs)
#     group_sizes = sorted([len(v) for v in threshold_groups], reverse=True)
#     return group_sizes
    

for t in range(1, 10):
    threshold = t/10
    print(threshold)
    trimmed_genre_links = apply_threshold(normed_genre_links, threshold)
    genre_groups = create_genre_groups(trimmed_genre_links)
    group_sizes = [len(g) for g in genre_groups]
    print(sorted(Counter(group_sizes).items()))
    print()

0.1
[(1, 4), (2, 2), (3, 2), (4, 1), (5, 1), (5505, 1)]

0.2
[(1, 175), (2, 52), (3, 16), (4, 10), (5, 3), (7, 1), (8, 2), (9, 1), (12, 1), (5102, 1)]

0.3
[(1, 684), (2, 192), (3, 63), (4, 42), (5, 20), (6, 13), (7, 3), (8, 6), (9, 5), (10, 4), (11, 4), (12, 4), (13, 4), (14, 2), (15, 2), (17, 2), (19, 1), (22, 1), (23, 1), (24, 1), (25, 1), (29, 1), (30, 1), (35, 1), (38, 1), (56, 1), (74, 1), (129, 1), (3031, 1)]

0.4
[(1, 1324), (2, 326), (3, 112), (4, 69), (5, 39), (6, 18), (7, 12), (8, 18), (9, 10), (10, 8), (11, 6), (12, 9), (13, 4), (14, 8), (15, 4), (16, 4), (17, 3), (18, 4), (19, 3), (20, 2), (21, 2), (22, 2), (24, 2), (28, 1), (29, 1), (30, 1), (37, 1), (39, 1), (41, 1), (44, 1), (49, 1), (51, 1), (53, 1), (67, 1), (83, 1), (88, 1), (102, 1), (120, 1), (133, 1), (429, 1)]

0.5
[(1, 1894), (2, 388), (3, 153), (4, 97), (5, 44), (6, 30), (7, 15), (8, 10), (9, 17), (10, 5), (11, 13), (12, 16), (13, 3), (14, 3), (15, 3), (16, 9), (17, 1), (18, 2), (19, 2), (20, 2), (26, 1), (27, 

In [17]:
trimmed_genre_links = apply_threshold(normed_genre_links, 0.9)
genre_groups = create_genre_groups(trimmed_genre_links)
for g in genre_groups:
    if len(g) > 3:
        print(g)
        print()

{'swedish trap pop', 'swedish drill', 'swedish trap', 'swedish hip hop', 'swedish gangsta rap'}

{'sarod', 'carnatic instrumental', 'hindustani instrumental', 'indian percussion', 'hindustani vocal', 'khayal', 'carnatic vocal', 'carnatic', 'veena', 'indian classical', 'hindustani classical'}

{'sinhala rap', 'sinhala pop', 'sinhala indie', 'classic sinhala pop', 'sinhala edm'}

{'salsa international', 'tropical', 'salsa', 'combos nacionales'}

{'barnemusikk', 'hollywood', 'eventyr', 'disney norsk'}

{'oromo pop', 'tigrigna pop', 'ethiopian pop', 'amharic pop'}

{'vintage radio show', 'oratory', 'reading', 'drama'}

{'korean worship', 'world worship', 'chinese worship', 'japanese worship'}

{'taiko', 'koto', 'japanese traditional', 'shakuhachi'}

{'tamaulipas rap', 'mexican hip hop', 'indie campechano', 'rap underground mexicano'}

{'utapri', 'actors', 'tsukiuta', 'a3', 'ensemble stars', 'starmyu', 'hypnosis mic', 'j-division', 'honeyworks'}

{'latvian pop', 'classic latvian pop', 'latv

In [19]:
class GroupClassifier():
    # For subsequently lower thresholds, attempt to have a user-input name for each group.
    # Remember group names for future groups, and summarise results using them.
    def __init__(self, full_gmap, threshold_start=0.9, threshold_cap=0.1, threshold_step=0.1, group_min_size=5):
        self.gmap = full_gmap
        self.threshold_start = threshold_start
        self.threshold_cap = threshold_cap
        self.threshold_step = threshold_step
        self.group_min_size = group_min_size
        self.named_groups = []
        self.threshold_groups = {}
        self.main()
    
        
    def get_groups(self, threshold):
        trimmed_gmap = apply_threshold(self.gmap, threshold)
        genre_groups = create_genre_groups(trimmed_gmap)
        return genre_groups
        
        
    def get_full_groups(self):
        full_group_list = []
        for fg in self.named_groups:
            full_group = list(fg)
            while True:
                for index in range(len(full_group)):
                    element = full_group[index]
                    if type(element) == "int":
                        del full_group[index]
                        full_group += self.named_groups[index]
                        continue
                        
                # If we get here, there are no more numbers left
                # I have checked that there are no genre names that can be interpreted as numbers
                break
                
            full_group_list.append(full_group)
            
        return full_group_list
                

    def main(self):
        # Get original clusters
        threshold = self.threshold_start
        
        for threshold in range(self.threshold_start, self.threshold_cap + 0.01, self.threshold_step):
            print(threshold)
            new_clusters = self.check_clusters(threshold)
            
            for cluster in new_clusters:
                for group in self.named_groups:
                    if isdisjoint()
            
            self.threshold_groups[threshold] = new_clusters
            threshold = round(self.threshold_step)
            threshold -= self.threshold_step
        
    
    def check_clusters(self, threshold):
        current_group_list = [g for g in self.get_groups(threshold) if len(g) >= self.group_min_size]
        
        full_group_list = self.get_full_groups()
        new_groups = []
        
        for current_group in current_group_list:
            group = current_group.copy()
            subgroups = []
            
            overlap_found = False
            while True:
                overlap_groups = [(n, g) for n, g in enumerate(full_group_list) if len(group.intersection(g)) > 0]
                
                if len(overlap_groups) == 0:
                    # Current group is not comprised of any subgroups
                    break
                
                max_group_index, max_group = max(overlap_groups, key=lambda x: len(x[1]))
                
                subgroups.append(max_group_index)
                group -= set(max_group)
                overlap_found = True
            
            # There are now 4 scenarios:
            # 1) If len(subgroups) == 0, it's a new cluster
            # 2) If len(subgroups) >= 2, it's a merging of clusters
            # 3) If len(subgroups) == 1, and len(group) > 0, it is an extension of an exisiting cluster
            # 4) If len(subgroups) == 1, and len(group) == 0, it IS an existing cluster.
            # Secnario 4 is the only scenario where we don't save the cluster.
            
            if (len(subgroups) == 1) and (len(group) == 0):
                # Scenario 4
                continue
                
            else:
                # Scenarios 1-3
                # Add subgroups to cluster
                group.update(subgroups)
                self.named_groups.append(group)
                new_groups.append(group)
                
        self.threshold_groups[threshold] = new_groups
        return new_groups
    
    
    def unfold_group(self, group):
        for i in range(len(group)):
            element = group[i]
            if type(element) == type(1):
                group[i] = self.unfold_group_by_index(element)
        return group
        
        
    def unfold_group_by_index(self, group_index):
        group = list(self.named_groups[group_index].copy())
        return self.unfold_group(group)
        
    
    def get_threshold_indices(self):
        keys = self.threshold_groups.keys()
        for i in range(len(keys)):
            print(f"{i}: {keys[i]}")
    
    def get_threshold_groups(self, index):
        key = self.threshold_groups.keys()[index]
        for i in range(index, -1, -1):
            # For each group in current threshold:
            #     Unfold group
            #     Check if the group has any overlap with existing groups in the current list
            #     If not, add to the current list
        
    def flatten(self, group):
        output = []
        for e in arr:
            if isinstance(e, list) or isinstance(e, set):
                print(e)
                output += flatt(e)
            else:
                output.append(e)
        return output
        
        
        
        
GC = GroupClassifier(normed_genre_links)

SyntaxError: invalid syntax (<ipython-input-19-fb4b75a315ee>, line 52)

In [None]:
GC.threshold_groups[0.5000000000000001]

In [None]:
len(GC.named_groups)

In [None]:
GC.named_groups[8]

In [None]:
print(GC.unfold_group(154))

In [None]:
print("Current Groups:")
for g in current_group_list:
    print(g)
print("===\n")

full_group_list = []
for full_group in all_group_list:
    while True:
        for index in range(len(full_group)):
            element = full_group[index]
            if type(element) == "int":
                del full_group[index]
                full_group += all_group_list[index]
                continue
        # If we get here, there are no more numbers left
        # We have checked that no genre names can be interpreted as numbers
        break
    full_group_list.append(full_group)

print("Existing Groups:")
for g in full_group_list:
    print(g)
print("===\n")
    
for current_group in current_group_list:
    print("Original group:")
    print(current_group)
    group = current_group.copy()
    subgroups = []
    
    overlap_found = False
    while True:
        overlap_groups = [(n, g) for n, g in enumerate(full_group_list) if len(group.intersection(g)) > 0]
        if len(overlap_groups) == 0:
            print("no overlap groups")
            break
        max_group_index, max_group = max(overlap_groups, key=lambda x: len(x[1]))
        print(f"mgi: {max_group_index}, mg: {max_group}")
        if len(max_group) == 1:
            break
        subgroups.append(max_group_index)
        print(max_group)
        group -= set(max_group)
        overlap_found = True
    
    if overlap_found:  
        print("Subgroups:")
        print(subgroups)
        print("Leftovers:")
        print(group)
        print("Final outcome:")
        group.update(subgroups)
        print(group)
    print()

In [None]:
for i in range(9, 1, -1):
    print(i)

In [None]:
a = [[1,2,3,4,[9,10,[11,12]]],[5,6,7,8,{1,2,3}]]
a

In [None]:
def flatt(arr):
    output = []
    for e in arr:
        if isinstance(e, list) or isinstance(e, set):
            print(e)
            output += flatt(e)
        else:
            output.append(e)
    return output

In [None]:
flatt(a)

---

## Automated Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
clustering = SpectralClustering(n_clusters=10, n_components=300, assign_labels="discretize",
                                random_state=0, affinity="precomputed")
clustering.fit(df.values)

In [None]:
count = {}
for i in clustering.labels_:
    if i not in count:
        count[i] = 0
    count[i] += 1
group_sizes = {}
for i in count:
    if count[i] not in group_sizes:
        group_sizes[count[i]] = 0
    group_sizes[count[i]] += 1
print(sorted(group_sizes.items(), key=lambda x: x[0], reverse=True))
print(count)
print(len(count))

In [None]:
for k in range(1, 50):
    for i in range(len(clustering.labels_)):
        if clustering.labels_[i] == k:
            print(df.index[i])
    print()

In [None]:
i = 2000
j = 50
clustering.labels_[i:i+j]

In [None]:
clustering.labels_[2002]

In [None]:
df.index[2002]