In [1]:
import numpy as np
import collections
import itertools
# from memory_profiler import memory_usage

In [2]:
class Apriori:
    """Apriori frequent-itemset mining and association-rule reporting.

    Each transaction is a playlist (a collection of song identifiers). Call
    ``fit()`` to mine the frequent itemsets of size >= 2, then ``generate()``
    to print (and collect in ``self.rules``) the association rules that meet
    the confidence and lift thresholds.

    Parameters
    ----------
    data :
        The playlists. Accepted forms: a 0-d object array wrapping a dict
        (what ``np.load`` returns for a saved dict), a dict whose values are
        playlists, or any iterable of playlists.
    min_support : float
        Minimum fraction of playlists an itemset must appear in.
    min_confidence : float
        Minimum confidence for a rule to be reported.
    min_lift : float
        Minimum lift for a rule to be reported.
    """

    def __init__(self,
                 data,
                 min_support=0.01,
                 min_confidence=0.01,
                 min_lift=0.01):
        self.data = data
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.min_lift = min_lift

    def prepare_data(self):
        """Build ``self.playlists`` (list of sets) and ``self.songs_counter``.

        Reads the data given to the constructor rather than a notebook-level
        global, so the instance works regardless of the caller's variable
        names.
        """
        data = self.data
        # A dict saved with np.save comes back as a 0-d object array; unwrap it.
        if getattr(data, "ndim", None) == 0:
            data = data.item()
        raw_playlists = data.values() if isinstance(data, dict) else data
        # Playlists become sets so a repeated song counts once per playlist.
        self.playlists = [set(playlist) for playlist in raw_playlists]
        self.songs_counter = collections.Counter(
            itertools.chain.from_iterable(self.playlists))

    def get_songs_appearances(self):
        """Build ``self.songs_in_playlists``: song -> set of playlist indices."""
        songs_in_playlists = collections.defaultdict(set)
        for index, playlist in enumerate(self.playlists):
            for song in playlist:
                songs_in_playlists[song].add(index)
        self.songs_in_playlists = songs_in_playlists

    def generate_L_1(self):
        """Compute the frequent 1-itemsets (``self.L_1`` / ``self.L_1_counter``)."""
        n_playlists = len(self.playlists)
        self.L_1_counter = {
            song: times
            for song, times in self.songs_counter.items()
            if times / n_playlists >= self.min_support
        }
        self.L_1 = [{song} for song in self.L_1_counter.keys()]

    def generate_new_candidates(self, current_itemsets, k):
        """Return the set of candidate k-itemsets (as frozensets).

        A candidate is a frequent (k-1)-itemset extended by one frequent song,
        kept only if every (k-1)-subset that contains the new song is itself
        in ``current_itemsets``.

        Parameters
        ----------
        current_itemsets : iterable of set-like
            The frequent (k-1)-itemsets.
        k : int
            Size of the candidates to generate.
        """
        # Hash-based lookup: membership tests below are O(1) even when the
        # caller passes a list (as fit() does for the first level).
        frequent = {frozenset(itemset) for itemset in current_itemsets}
        C_k = set()
        m = k - 2
        for itemset in frequent:
            for song in self.L_1:
                new_candidate = itemset.union(song)
                if len(new_candidate) != k:
                    # The song is already part of the itemset.
                    continue
                if all(
                        frozenset(combination).union(song) in frequent
                        for combination in itertools.combinations(itemset, m)):
                    C_k.add(new_candidate)
        print("Candidates length: {}".format(len(C_k)))
        return C_k

    def calculate_subset_count(self, subset):
        """Return the number of playlists that contain every song in ``subset``.

        The empty subset is contained in every playlist. Songs that were never
        indexed yield a count of 0 and do not add entries to the index.
        """
        songs = list(subset)
        if not songs:
            return len(self.playlists)
        # .get() avoids the defaultdict inserting a key for an unknown song.
        index_sets = [
            self.songs_in_playlists.get(song, set()) for song in songs
        ]
        # Start from the smallest set so the intersection shrinks fastest.
        index_sets.sort(key=len)
        return len(index_sets[0].intersection(*index_sets[1:]))

    def prune_itemsets(self, C_k):
        """Return ``{candidate: count}`` for candidates meeting ``min_support``."""
        n_playlists = len(self.playlists)
        L_k_counter = {}
        for candidate in C_k:
            times = self.calculate_subset_count(candidate)
            if times / n_playlists >= self.min_support:
                L_k_counter[candidate] = times
        return L_k_counter

    def fit(self):
        """Mine all frequent itemsets of size >= 2.

        Results are stored in ``self.frequent_itemsets`` (flat list of
        frozensets) and ``self.k_frequent_itemsets`` (size -> list of
        ``(itemset, count)`` sorted by descending count).
        """
        self.prepare_data()
        self.get_songs_appearances()
        self.generate_L_1()
        self.k_frequent_itemsets = {}
        self.frequent_itemsets = []
        k = 2
        current = self.L_1
        # Grow itemsets level by level until a level has no frequent itemset.
        while len(current) != 0:
            C_k = self.generate_new_candidates(current, k)
            L_k_counter = self.prune_itemsets(C_k)
            L_k = L_k_counter.keys()
            self.frequent_itemsets.extend(L_k)
            self.k_frequent_itemsets[k] = sorted(
                L_k_counter.items(), key=lambda x: x[1], reverse=True)
            k += 1
            current = L_k

    def calculate_confidence(self, x, y):
        """Return confidence(x -> y) = count(x U y) / count(x).

        Raises ``ZeroDivisionError`` if ``x`` appears in no playlist.
        """
        x_count = self.calculate_subset_count(x)
        # The rule is supported by playlists containing BOTH sides: the union.
        x_y_count = self.calculate_subset_count(frozenset(x).union(y))
        return x_y_count / x_count

    def get_association_rule_from_itemset(self, itemset):
        """Print the rules derivable from ``itemset`` and return those that pass.

        Every split of ``itemset`` into a non-empty antecedent ``x`` and a
        non-empty consequent ``y`` is evaluated.

        Returns
        -------
        list of tuple
            ``(x_set, y_set, confidence, lift)`` for each rule meeting both
            ``min_confidence`` and ``min_lift``.
        """
        rules = []
        itemset = frozenset(itemset)
        n_playlists = len(self.playlists)
        itemset_count = self.calculate_subset_count(itemset)
        itemset_support = itemset_count / n_playlists
        print("Itemset support:  {}".format(itemset_support))
        # Proper, non-empty subsets only, so the consequent is never empty.
        for i in range(1, len(itemset)):
            for x_set in itertools.combinations(itemset, i):
                x_set = frozenset(x_set)
                y_set = itemset - x_set
                x_support = self.calculate_subset_count(x_set) / n_playlists
                rule_confidence = itemset_support / x_support
                y_support = self.calculate_subset_count(y_set) / n_playlists
                rule_lift = rule_confidence / y_support
                print("Rule confidence: {}".format(rule_confidence))
                print("y support: {}".format(y_support))
                if rule_lift >= self.min_lift and rule_confidence >= self.min_confidence:
                    print("Rule: {0}->{1}:\nConfidence: {2}\nLift:{3}\n\n".
                          format(x_set, y_set, rule_confidence, rule_lift))
                    rules.append((x_set, y_set, rule_confidence, rule_lift))
        return rules

    def generate(self):
        """Print rules for every frequent itemset; collect them in ``self.rules``.

        Must be called after ``fit()``. Returns ``None``.
        """
        self.rules = []
        for itemset in self.frequent_itemsets:
            self.rules.extend(self.get_association_rule_from_itemset(itemset))

In [3]:
# The file stores a pickled Python dict (Apriori.prepare_data calls .item() on
# the result), and NumPy refuses to load object arrays unless allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
spotify_data = np.load("spotify.npy", allow_pickle=True)

In [4]:
# Configure the miner: itemsets must appear in at least 1% of playlists, and
# reported rules need confidence >= 0.03 and lift >= 0.9.
apriori = Apriori(
    data=spotify_data,
    min_support=0.01,
    min_confidence=0.03,
    min_lift=0.9,
)

In [5]:
# Mine the frequent itemsets; prints the number of candidates at each level.
apriori.fit()

Candidates length: 350703
Candidates length: 319
Candidates length: 3


In [6]:
# Print per-itemset diagnostics and every rule that meets the confidence and
# lift thresholds.
apriori.generate()

Itemset support:  0.0112
Rule confidence: 0.2779156327543424
y support: 0.0219
Rule: frozenset({'Congratulations'})->frozenset({'Swang'}):
Confidence: 0.2779156327543424
Lift:12.690211541294174


Rule confidence: 0.5114155251141552
y support: 0.0403
Rule: frozenset({'Swang'})->frozenset({'Congratulations'}):
Confidence: 0.5114155251141552
Lift:12.690211541294174


Itemset support:  0.0127
Rule confidence: 0.4276094276094276
y support: 0.0308
Rule: frozenset({'Bounce Back'})->frozenset({'goosebumps'}):
Confidence: 0.4276094276094276
Lift:13.883422974332065


Rule confidence: 0.4123376623376623
y support: 0.0297
Rule: frozenset({'goosebumps'})->frozenset({'Bounce Back'}):
Confidence: 0.4123376623376623
Lift:13.883422974332063


Itemset support:  0.0148
Rule confidence: 0.6271186440677966
y support: 0.0465
Rule: frozenset({'Slippery (feat. Gucci Mane)'})->frozenset({'HUMBLE.'}):
Confidence: 0.6271186440677966
Lift:13.486422453070896


Rule confidence: 0.31827956989247314
y support: 0.0236

y support: 0.0403
Rule: frozenset({'No Problem (feat. Lil Wayne & 2 Chainz)'})->frozenset({'Congratulations'}):
Confidence: 0.31152647975077885
Lift:7.730185601756299


Itemset support:  0.011
Rule confidence: 0.2756892230576441
y support: 0.0297
Rule: frozenset({'Broccoli (feat. Lil Yachty)'})->frozenset({'Fake Love'}):
Confidence: 0.2756892230576441
Lift:9.2824654228163


Rule confidence: 0.37037037037037035
y support: 0.0399
Rule: frozenset({'Fake Love'})->frozenset({'Broccoli (feat. Lil Yachty)'}):
Confidence: 0.37037037037037035
Lift:9.2824654228163


Itemset support:  0.0159
Rule confidence: 0.4162303664921466
y support: 0.0723
Rule: frozenset({'Let Me Love You'})->frozenset({'Closer'}):
Confidence: 0.4162303664921466
Lift:5.7569898546631615


Rule confidence: 0.21991701244813278
y support: 0.0382
Rule: frozenset({'Closer'})->frozenset({'Let Me Love You'}):
Confidence: 0.21991701244813278
Lift:5.756989854663162


Itemset support:  0.0113
Rule confidence: 0.5159817351598174
y supp

In [7]:
# Number of distinct songs in the song -> playlist-indices index.
len(apriori.songs_in_playlists)

132920