# Data required for the construction of each _chrome_

In this file we intend to obtain all the data required for the construction of each possible chrome to be displayed in our final visualization. For this, we are going to calculate the association rules for this dataset and then, using these association rules, build a `json` file with all the information required for each _chrome_.

## Importing the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
dataset1 = pd.read_csv('datasets/mushrooms.csv')

In [3]:
dataset1

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [4]:
dataset_dict = dataset1.T.to_dict()
dataset_dict

{0: {'class': 'p',
  'cap-shape': 'x',
  'cap-surface': 's',
  'cap-color': 'n',
  'bruises': 't',
  'odor': 'p',
  'gill-attachment': 'f',
  'gill-spacing': 'c',
  'gill-size': 'n',
  'gill-color': 'k',
  'stalk-shape': 'e',
  'stalk-root': 'e',
  'stalk-surface-above-ring': 's',
  'stalk-surface-below-ring': 's',
  'stalk-color-above-ring': 'w',
  'stalk-color-below-ring': 'w',
  'veil-type': 'p',
  'veil-color': 'w',
  'ring-number': 'o',
  'ring-type': 'p',
  'spore-print-color': 'k',
  'population': 's',
  'habitat': 'u'},
 1: {'class': 'e',
  'cap-shape': 'x',
  'cap-surface': 's',
  'cap-color': 'y',
  'bruises': 't',
  'odor': 'a',
  'gill-attachment': 'f',
  'gill-spacing': 'c',
  'gill-size': 'b',
  'gill-color': 'k',
  'stalk-shape': 'e',
  'stalk-root': 'c',
  'stalk-surface-above-ring': 's',
  'stalk-surface-below-ring': 's',
  'stalk-color-above-ring': 'w',
  'stalk-color-below-ring': 'w',
  'veil-type': 'p',
  'veil-color': 'w',
  'ring-number': 'o',
  'ring-type': 'p',


In [5]:
# Encode every row as a list of 'column|value' strings
dataset = {
    row_id: ['{}|{}'.format(column, value) for column, value in row.items()]
    for row_id, row in dataset_dict.items()
}
dataset

{0: ['class|p',
  'cap-shape|x',
  'cap-surface|s',
  'cap-color|n',
  'bruises|t',
  'odor|p',
  'gill-attachment|f',
  'gill-spacing|c',
  'gill-size|n',
  'gill-color|k',
  'stalk-shape|e',
  'stalk-root|e',
  'stalk-surface-above-ring|s',
  'stalk-surface-below-ring|s',
  'stalk-color-above-ring|w',
  'stalk-color-below-ring|w',
  'veil-type|p',
  'veil-color|w',
  'ring-number|o',
  'ring-type|p',
  'spore-print-color|k',
  'population|s',
  'habitat|u'],
 1: ['class|e',
  'cap-shape|x',
  'cap-surface|s',
  'cap-color|y',
  'bruises|t',
  'odor|a',
  'gill-attachment|f',
  'gill-spacing|c',
  'gill-size|b',
  'gill-color|k',
  'stalk-shape|e',
  'stalk-root|c',
  'stalk-surface-above-ring|s',
  'stalk-surface-below-ring|s',
  'stalk-color-above-ring|w',
  'stalk-color-below-ring|w',
  'veil-type|p',
  'veil-color|w',
  'ring-number|o',
  'ring-type|p',
  'spore-print-color|n',
  'population|n',
  'habitat|g'],
 2: ['class|e',
  'cap-shape|b',
  'cap-surface|s',
  'cap-color|w',
 

## Implementation of the Apriori algorithm


### Preamble

In this section some functions used in the algorithm are defined separately. In the following section they are used to construct the complete algorithm

In [6]:
import numpy as np

Define function `large_one_itemsets`, which returns a set with the large 1-itemsets

In [7]:
def large_one_itemsets(d, min_sup_count):

    """
        Collect the single-item itemsets that appear often enough.

        :param d: list of transactions (list of sets)
        :param min_sup_count: number of appearances an item needs
        for its 1-itemset to be considered large
        :returns: set of frozensets, each one holding a single large item
    """

    occurrences = {}  # item -> number of times it was seen
    for transaction in d:
        for item in transaction:
            occurrences[item] = occurrences.get(item, 0) + 1

    large = set()
    for item, seen in occurrences.items():
        if seen >= min_sup_count:
            large.add(frozenset((item,)))
    return large

Define function `candidates`, which generates all the candidate itemsets of cardinality `k` from a set of large itemsets, each of cardinality `k - 1`.
It consists of 2 steps: the `join` and `prune` steps.

In [8]:
def join(l_k_1, k):

    """
        Join step of the candidate generation.

        :param l_k_1: large itemsets of size k - 1. Set of frozensets
        :param k: cardinality required of every generated itemset
        :returns: frozenset with every union of two itemsets of l_k_1
        that has exactly k elements
    """

    return frozenset(
        left | right
        for left in l_k_1
        for right in l_k_1
        if len(left | right) == k
    )

In [9]:
def prune(l_k_1, combs_k, k):

    """
        Prune step of the candidate generation.

        :param l_k_1: large itemsets of size k - 1
        :param combs_k: itemsets of size k obtained from l_k_1
        :param k: cardinality of the itemsets in combs_k
        :returns: frozenset with the itemsets of combs_k that contain
        exactly k of the itemsets in l_k_1
    """

    kept = set()
    for comb in combs_k:
        # number of large (k - 1)-itemsets contained in comb
        contained = sum(1 for itemset_k_1 in l_k_1 if itemset_k_1 <= comb)
        # comb can only be large if k of them are contained in it
        if contained == k:
            kept.add(comb)
    return frozenset(kept)

In [10]:
def candidates(l_k_1, k):

    """
        Generate the candidate k-itemsets from the large (k - 1)-itemsets.

        :param l_k_1: large itemsets of size k - 1. Set of sets
        :param k: cardinality of candidates
        :returns: the joined combinations of size k that survive the
        prune step, i.e. the k-itemsets that may still be large
    """

    # join step first, prune step on its result
    return prune(l_k_1, join(l_k_1, k), k)

Define `subset` function, which obtains the sets in a set of sets that are subsets of another set

In [11]:
def subset(sub, sup):

    """
        Select the members of sub that are contained in sup.

        :param sub: set of sets to evaluate
        :param sup: possible superset of sets in sub
        :returns: set with the elements of sub that are subsets of sup
    """

    contained = set()
    for member in sub:
        if member <= sup:
            contained.add(member)
    return contained

## Apriori algorithm function

In [12]:
from datetime import datetime as dt

In [13]:
def apriori(d, min_support):

    """
        Find the large itemsets of two or more items in d.

        :param d: list of transactions (list of sets)
        :param min_support: minimum support (fraction of the
        transactions) for an itemset to be considered large
        :returns: set with every large itemset of size 2 or more;
        the large 1-itemsets are only the starting point of the
        search and are not part of the result
    """

    # int() truncates, so the support count threshold is rounded down
    min_sup_count = int(min_support * len(d))

    found = set()  # large itemsets of every size k > 1
    previous_large = large_one_itemsets(d, min_sup_count)  # size k - 1
    k = 2

    while previous_large:

        # candidates of size k built from the large (k - 1)-itemsets
        k_candidates = candidates(previous_large, k)

        # one appearance counter per candidate
        appearances = {candidate: 0 for candidate in k_candidates}
        for transaction in d:
            for candidate in subset(k_candidates, transaction):
                appearances[candidate] += 1

        # candidates reaching the support count form the next level
        previous_large = set()
        for candidate, count in appearances.items():
            if count >= min_sup_count:
                previous_large.add(candidate)

        found |= previous_large
        k += 1

    return found


### Association rules

In [14]:
class ARule:

    """
        Association rule "x implies y" between two itemsets.

        x is the antecedent and y the consequent; both are sets of
        items. Every metric receives d, the list of transactions (list
        of sets), and scans it completely on each call. The metrics
        that are ratios divide by len(d), so they raise
        ZeroDivisionError when d is empty.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    @property
    def as_one(self):
        """Union of the antecedent and the consequent."""
        return self.x | self.y

    def support_count(self, d):
        """Number of transactions of d that contain both x and y."""
        itemset = self.as_one  # build the union once, not per transaction
        return sum(1 for transaction in d if itemset <= transaction)

    def support(self, d):
        """Fraction of the transactions of d that contain both x and y."""
        return self.support_count(d) / len(d)

    def support_x(self, d):
        """Fraction of the transactions of d that contain x."""
        return sum(1 for transaction in d if self.x <= transaction) / len(d)

    def support_y(self, d):
        """Fraction of the transactions of d that contain y."""
        return sum(1 for transaction in d if self.y <= transaction) / len(d)

    def confidence(self, d):
        """
            support(x and y) / support(x).

            Raises ZeroDivisionError when no transaction contains x.
        """
        return self.support(d) / self.support_x(d)

    def lift(self, d):
        """
            confidence / support(y).

            Raises ZeroDivisionError when no transaction contains y.
        """
        return self.confidence(d) / self.support_y(d)

    def conviction(self, d):
        """
            (1 - support(y)) / (1 - confidence).

            A rule whose confidence is exactly 1 is never violated, so
            the denominator is zero; infinity is returned in that case
            instead of raising ZeroDivisionError.
        """
        confidence = self.confidence(d)
        if confidence == 1:
            return float('inf')
        return (1 - self.support_y(d)) / (1 - confidence)

    def __eq__(self, other):
        # let Python fall back to its default for foreign types
        if not isinstance(other, ARule):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __hash__(self):
        # Hash the sets themselves, not their iteration order: equal sets
        # may iterate in different orders, and equal rules must hash equal.
        return hash((frozenset(self.x), frozenset(self.y)))

    @staticmethod
    def _braced(items):
        """Render items as '{a, b}' without relying on the set's own repr."""
        return '{' + ', '.join(repr(item) for item in items) + '}'

    def __repr__(self):
        return 'x: {}, y: {}'.format(self._braced(self.x), self._braced(self.y))
    
    


`to_k_elem_combinations` computes all the possible non-empty combinations of length less than or equal to `k` of the elements of a set

In [15]:
def to_k_elem_combinations(s, k):

    """
        Build every non-empty combination of at most k elements of s.

        :param s: set of hashable elements
        :param k: maximum length of the combinations
        :returns: frozenset of frozensets with all the combinations of
        length 1 up to k; empty when s is empty or k is smaller than 1
    """

    from itertools import combinations

    elements = frozenset(s)
    # there are no combinations longer than the set itself
    largest = min(k, len(elements))

    return frozenset(
        frozenset(comb)
        for size in range(1, largest + 1)
        for comb in combinations(elements, size)
    )
                

`associate` returns all the association rules that can be extracted from `itemsets`

In [16]:
def associate(itemsets):

    """
        Extract the association rules of a collection of itemsets.

        :param itemsets: set of itemsets to obtain rules from
        :returns: set of ARule objects; for every itemset there is one
        rule per combination x of up to len(itemset) - 1 of its
        elements, in which x implies the rest of the itemset
    """

    return {
        ARule(x, itemset - x)
        for itemset in itemsets
        for x in to_k_elem_combinations(itemset, len(itemset) - 1)
    }
            

## Calculating the rules

convert transactions to sets

In [17]:
dataset = [frozenset(key) for key in dataset.values()]

generate itemsets with a minimum support of 0.6

In [18]:
%%time
itemsets = apriori(dataset, 0.6)
itemsets

Wall time: 230 ms


obtain association rules

In [19]:
rules = associate(itemsets)
rules

{x: {'gill-attachment|f'}, y: {'gill-size|b', 'veil-color|w', 'veil-type|p'},
 x: {'gill-attachment|f'}, y: {'gill-size|b', 'veil-color|w'},
 x: {'gill-attachment|f'}, y: {'gill-size|b', 'veil-type|p'},
 x: {'gill-attachment|f'}, y: {'gill-size|b'},
 x: {'gill-attachment|f'}, y: {'gill-spacing|c', 'veil-color|w', 'veil-type|p'},
 x: {'gill-attachment|f'}, y: {'gill-spacing|c', 'veil-color|w'},
 x: {'gill-attachment|f'}, y: {'gill-spacing|c', 'veil-type|p'},
 x: {'gill-attachment|f'}, y: {'gill-spacing|c'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'gill-spacing|c'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'veil-color|w', 'gill-spacing|c'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'veil-color|w', 'veil-type|p', 'gill-spacing|c'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'veil-color|w', 'veil-type|p'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'veil-color|w'},
 x: {'gill-attachment|f'}, y: {'ring-number|o', 'veil-type|p', 'gill-spacing|c'},
 x: {'gill-at

In [20]:
len(rules)

266

How many times does each column appear in these rules?

In [21]:
from collections import defaultdict
# How many times each column shows up across both sides of all the rules
counter = defaultdict(lambda: 0)
for rule in rules:
    for side in (rule.x, rule.y):
        for item in side:
            column = item.split('|')[0]
            counter[column] += 1
counter

defaultdict(<function __main__.<lambda>()>,
            {'veil-color': 186,
             'gill-size': 46,
             'gill-attachment': 186,
             'ring-number': 138,
             'veil-type': 194,
             'gill-spacing': 130,
             'stalk-surface-above-ring': 38,
             'stalk-surface-below-ring': 2})

Length of the longest rule

In [22]:
# Length of a rule = items in its antecedent plus items in its consequent
longest_rule = max((len(rule.x) + len(rule.y) for rule in rules), default=0)
print(longest_rule)

5


## Calculating the support, lift, confidence and support count for each rule

In [23]:
rules = list(rules)  # fix an order so rules and statistics share indices
# One [support, support_count, confidence, lift] row per rule
statistics = [
    [
        rule.support(dataset),
        rule.support_count(dataset),
        rule.confidence(dataset),
        rule.lift(dataset),
    ]
    for rule in rules
]
print(len(rules))
print(len(statistics))
statistics

266
266


[[0.6649433776464796, 5402, 0.6817264008076729, 1.0252397778899545],
 [0.7720334810438207, 6272, 0.8605927552140504, 1.0263440316146426],
 [0.7720334810438207, 6272, 0.8605927552140504, 1.0589905397393131],
 [0.897095027080256, 7288, 0.9218315203642803, 1.0001281078311182],
 [0.897095027080256, 7288, 0.9989035087719298, 1.0241156114668295],
 [0.7956671590349581, 6464, 0.8632478632478632, 1.029510516885737],
 [0.6134908911866076, 4984, 0.9629057187017002, 0.9884566665065216],
 [0.6671590349581487, 5420, 0.6839979808177687, 0.9901638624667772],
 [0.6134908911866076, 4984, 1.0, 1.0252397778899545],
 [0.7720334810438207, 6272, 0.8596491228070176, 1.0549530927015425],
 [0.7720334810438207, 6272, 0.8376068376068376, 1.0279030134015028],
 [0.7720334810438207, 6272, 0.920728126834997, 1.0263440316146428],
 [0.6134908911866076, 4984, 0.6289752650176679, 1.0252397778899547],
 [0.8126538650910882, 6602, 1.0, 1.0252397778899545],
 [0.7720334810438207, 6272, 0.920728126834997, 1.025218654386995],
 

## Using this vector of statistics for every rule to cluster them

In [24]:
# Group the rules into `clusters` groups using their
# [support, support_count, confidence, lift] rows as features.
clusters = 3
# NOTE(review): the rows are passed to KMeans unscaled; support_count is in
# the thousands while the other three features are around 1, so the count
# presumably dominates the clustering -- confirm whether scaling is wanted.
# random_state is fixed so the labels are reproducible.
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(statistics)
print(len(kmeans.labels_))
kmeans.labels_

266


array([1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 2, 0, 2, 1, 0, 0, 2, 0, 2, 1, 2, 2, 2, 1, 1, 2, 0, 1, 1,
       0, 2, 1, 0, 0, 2, 1, 1, 2, 0, 2, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       2, 1, 0, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 2, 0,
       0, 2, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 2, 2, 0, 0, 0, 0, 2, 0, 1, 1,
       0, 2, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 0,
       2, 0, 0, 0, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 2, 0, 2, 1, 1,
       0, 0, 0, 0, 2, 0, 1, 1, 1, 0, 1, 1, 2, 2, 0, 2, 0, 1, 0, 0, 1, 0,
       2, 2, 0, 0, 1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 2, 1, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 0, 0, 2,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 1, 0,
       2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0,
       0, 0])

## Grouping the rules by cluster and getting the chromies data

In [25]:
# Bucket every rule, together with its statistics, under its k-means label
cluster_rules = defaultdict(lambda: [])
for label, rule, rule_stats in zip(kmeans.labels_, rules, statistics):
    support, support_count, confidence, lift = rule_stats
    cluster_rules[label].append({
        "rule": rule,
        "support": support,
        "support_count": support_count,
        "confidence": confidence,
        "lift": lift,
    })
cluster_rules

defaultdict(<function __main__.<lambda>()>,
            {1: [{'rule': x: {'veil-color|w'}, y: {'gill-size|b', 'gill-attachment|f'},
               'support': 0.6649433776464796,
               'support_count': 5402,
               'confidence': 0.6817264008076729,
               'lift': 1.0252397778899545},
              {'rule': x: {'stalk-surface-above-ring|s'}, y: {'gill-attachment|f'},
               'support': 0.6134908911866076,
               'support_count': 4984,
               'confidence': 0.9629057187017002,
               'lift': 0.9884566665065216},
              {'rule': x: {'veil-color|w', 'veil-type|p'}, y: {'gill-size|b'},
               'support': 0.6671590349581487,
               'support_count': 5420,
               'confidence': 0.6839979808177687,
               'lift': 0.9901638624667772},
              {'rule': x: {'stalk-surface-above-ring|s', 'veil-type|p', 'gill-attachment|f'}, y: {'veil-color|w'},
               'support': 0.6134908911866076,
             

In [26]:
# Per-column accumulator: a fresh record with every counter at zero is
# created the first time a column name is looked up.
columns = defaultdict(lambda: {
    "predictor": 0,  # counter for appearances in the x side of a rule
    "predictee": 0,  # counter for appearances in the y side of a rule
    "diversity": 0,  # never modified by the code shown in this notebook
    # running sums of the statistics of the rules the column takes part in
    "cumGain": {
        "supportCount": 0,
        "support": 0,
        "conf": 0,
        "lift": 0
    }
})

In [27]:
# For every column, count how often it appears on each side of a rule and
# add up the statistics of the rules it takes part in
for rule, rule_stats in zip(rules, statistics):
    support, support_count, confidence, lift = rule_stats
    for role, items in (("predictor", rule.x), ("predictee", rule.y)):
        for item in items:
            column = columns[item.split('|')[0]]
            column[role] += 1
            gains = column["cumGain"]
            gains["supportCount"] += support_count
            gains["support"] += support
            gains["conf"] += confidence
            gains["lift"] += lift
columns

defaultdict(<function __main__.<lambda>()>,
            {'veil-color': {'predictor': 93,
              'predictee': 93,
              'diversity': 0,
              'cumGain': {'supportCount': 1160520,
               'support': 142.85081240768073,
               'conf': 165.06421186168248,
               'lift': 188.63405348080306}},
             'gill-size': {'predictor': 23,
              'predictee': 23,
              'diversity': 0,
              'cumGain': {'supportCount': 245648,
               'support': 30.23732151649436,
               'conf': 37.85936508997463,
               'lift': 45.86653749225234}},
             'gill-attachment': {'predictor': 93,
              'predictee': 93,
              'diversity': 0,
              'cumGain': {'supportCount': 1160276,
               'support': 142.82077794190036,
               'conf': 165.05412546944467,
               'lift': 188.65231895812795}},
             'ring-number': {'predictor': 69,
              'predictee': 69,
      

In [28]:
def id_generator():
    """Yield the consecutive integers 1, 2, 3, ... without end."""
    current = 1
    while True:
        yield current
        current += 1


generator = id_generator()

In [29]:
# One 'column' node per column seen in the rules; columns_id maps every
# column name to the id of its node.
nodes = []
columns_id = {}
for column_name, column_stats in columns.items():
    # named column_node_id so the builtin id() is not shadowed
    column_node_id = next(generator)
    nodes.append({
        "id": column_node_id,
        "type": "column",
        "predictor": column_stats['predictor'],
        "predictee": column_stats['predictee'],
        "diversity": column_stats['diversity'],
        "cumGain": column_stats['cumGain'],
    })
    columns_id[column_name] = column_node_id
nodes

[{'id': 1,
  'type': 'column',
  'predictor': 93,
  'predictee': 93,
  'diversity': 0,
  'cumGain': {'supportCount': 1160520,
   'support': 142.85081240768073,
   'conf': 165.06421186168248,
   'lift': 188.63405348080306}},
 {'id': 2,
  'type': 'column',
  'predictor': 23,
  'predictee': 23,
  'diversity': 0,
  'cumGain': {'supportCount': 245648,
   'support': 30.23732151649436,
   'conf': 37.85936508997463,
   'lift': 45.86653749225234}},
 {'id': 3,
  'type': 'column',
  'predictor': 93,
  'predictee': 93,
  'diversity': 0,
  'cumGain': {'supportCount': 1160276,
   'support': 142.82077794190036,
   'conf': 165.05412546944467,
   'lift': 188.65231895812795}},
 {'id': 4,
  'type': 'column',
  'predictor': 69,
  'predictee': 69,
  'diversity': 0,
  'cumGain': {'supportCount': 895776,
   'support': 110.26292466765138,
   'conf': 124.58007157063213,
   'lift': 140.16698337729179}},
 {'id': 5,
  'type': 'column',
  'predictor': 97,
  'predictee': 97,
  'diversity': 0,
  'cumGain': {'support

In [33]:
links = []
for members in cluster_rules.values():
    # Aggregator node that represents the whole cluster
    node_id = next(generator)
    obj = {
        'id': node_id,
        'type': 'aggregator',
        'columns': [],
        'rules': []
    }

    # Column statistics restricted to the rules of this cluster
    inside_columns = defaultdict(lambda: {
        "predictor": 0,
        "predictee": 0,
        "diversity": 0,
        "cumGain": {
            "supportCount": 0,
            "support": 0,
            "conf": 0,
            "lift": 0
        }
    })
    for member in members:
        rule = member['rule']
        for role, items in (("predictor", rule.x), ("predictee", rule.y)):
            for item in items:
                column = inside_columns[item.split('|')[0]]
                column[role] += 1
                gains = column['cumGain']
                gains['support'] += member['support']
                gains['supportCount'] += member['support_count']
                gains['conf'] += member['confidence']
                gains['lift'] += member['lift']

    # Refer to each column through the id of its 'column' node
    for column_name, column in inside_columns.items():
        obj['columns'].append({
            "id": columns_id[column_name],
            "predictor": column['predictor'],
            "predictee": column['predictee'],
            "diversity": column['diversity'],
            "cumGain": column['cumGain'],
        })

    # Plain-dict copy of every rule of the cluster
    for member in members:
        rule_obj = {
            'x': [],
            'y': [],
            'support': member['support'],
            'supportCount': member['support_count'],
            'conf': member['confidence'],
            'lift': member['lift']
        }
        for side in ('x', 'y'):
            for item in getattr(member['rule'], side):
                parts = item.split('|')
                rule_obj[side].append({
                    'col': parts[0],
                    'val': parts[1]
                })
        obj['rules'].append(rule_obj)

    # Register the aggregator and link it to each of its columns
    nodes.append(obj)
    for column in obj['columns']:
        links.append({
            'source': node_id,
            'target': column['id']
        })
print(links)
nodes

[{'source': 11, 'target': 1}, {'source': 11, 'target': 2}, {'source': 11, 'target': 3}, {'source': 11, 'target': 7}, {'source': 11, 'target': 5}, {'source': 11, 'target': 4}, {'source': 11, 'target': 8}, {'source': 12, 'target': 4}, {'source': 12, 'target': 1}, {'source': 12, 'target': 5}, {'source': 12, 'target': 3}, {'source': 12, 'target': 6}, {'source': 13, 'target': 1}, {'source': 13, 'target': 5}, {'source': 13, 'target': 3}, {'source': 13, 'target': 4}]


[{'id': 1,
  'type': 'column',
  'predictor': 93,
  'predictee': 93,
  'diversity': 0,
  'cumGain': {'supportCount': 1160520,
   'support': 142.85081240768073,
   'conf': 165.06421186168248,
   'lift': 188.63405348080306}},
 {'id': 2,
  'type': 'column',
  'predictor': 23,
  'predictee': 23,
  'diversity': 0,
  'cumGain': {'supportCount': 245648,
   'support': 30.23732151649436,
   'conf': 37.85936508997463,
   'lift': 45.86653749225234}},
 {'id': 3,
  'type': 'column',
  'predictor': 93,
  'predictee': 93,
  'diversity': 0,
  'cumGain': {'supportCount': 1160276,
   'support': 142.82077794190036,
   'conf': 165.05412546944467,
   'lift': 188.65231895812795}},
 {'id': 4,
  'type': 'column',
  'predictor': 69,
  'predictee': 69,
  'diversity': 0,
  'cumGain': {'supportCount': 895776,
   'support': 110.26292466765138,
   'conf': 124.58007157063213,
   'lift': 140.16698337729179}},
 {'id': 5,
  'type': 'column',
  'predictor': 97,
  'predictee': 97,
  'diversity': 0,
  'cumGain': {'support

In [34]:
import json

# Reverse lookup: node id -> column name
column_names_by_id = {}
for column_name, column_node_id in columns_id.items():
    column_names_by_id[column_node_id] = column_name

data = {
    'nodes': nodes,
    'links': links,
    'ids': column_names_by_id
}

with open('new_mushroom_data_3_06.json', 'w') as out_file:
    json.dump(data, out_file)

<span style="color: red;"> **IMPORTANT:** Old code, not relevant anymore </span>
## Calculating the data for each chrome

In [None]:
# Group the rules by the set of columns they involve (one group per chrome).
# The column set is kept in its own name so the `columns` statistics table
# built earlier in the notebook is not overwritten.
chromes = defaultdict(lambda: [])
for rule in rules:
    involved_columns = frozenset(
        item.split('|')[0]
        for side in (rule.x, rule.y)
        for item in side
    )
    chromes[involved_columns].append(rule)

In [None]:
# Report how many rules ended up in every chrome
for chrome, chrome_rules in chromes.items():
    print("{} tiene {} reglas".format(chrome, len(chrome_rules)))

How many _chromes_ there will be for each amount of columns considered

In [None]:
# Histogram: number of columns in a chrome -> how many chromes have that size
largos = defaultdict(lambda: 0)
for size in map(len, chromes):
    largos[size] += 1
largos

### Considering only _chromes_ with 3 columns

In [None]:
# Keep only the chromes made of exactly 3 columns
filtered_chromes = {
    chrome: {'rules': chrome_rules}
    for chrome, chrome_rules in chromes.items()
    if len(chrome) == 3
}
filtered_chromes

### Angle and color of each section

In [None]:
# For every filtered chrome, store an 'angles' and a 'colors' share per column
for key in filtered_chromes:
    # Number of different values each column of the chrome takes in the dataset
    total_different_values = dict()
    for column in key:
        total_different_values[column] = len(dataset1[column].unique())
        # Discount one value when the column holds NaN or the '-' string
        # (presumably a missing-value marker -- confirm against the data)
        if dataset1[column].isna().sum() >= 1 or len(dataset1[dataset1[column] == '-']) >= 1:
            total_different_values[column] -= 1
    
    # Number of unique values that each column contributes to the rules
    different_values = defaultdict(lambda: [])
    for rule in filtered_chromes[key]['rules']:
        for i in rule.x:
            different_values[i.split('|')[0]].append(i.split('|')[1])
        for i in rule.y:
            different_values[i.split('|')[0]].append(i.split('|')[1])
    # From here on different_values maps column -> count, not column -> list
    for i in different_values.keys():
        different_values[i] = len(set(different_values[i]))
    different_values_n = sum(different_values.values())
    
    # Angle of a column: its share of the unique values used by the rules
    angles = dict()
    for i in different_values.keys():
        angles[i] = different_values[i] / different_values_n
    filtered_chromes[key]['angles'] = angles
    
    # Color of a column: fraction of its dataset values used by the rules,
    # rescaled so the colors of the chrome add up to 1
    colors = dict()
    for i in different_values.keys():
        colors[i] = different_values[i] / total_different_values[i]
    colors_sum = sum(colors.values())
    for i in colors:
        colors[i] = colors[i] / colors_sum
    filtered_chromes[key]['colors'] = colors
#print(filtered_chromes)

### Radius of each section

In [None]:
# Raw ratio of a column: appearances on the left side of the chrome's rules
# divided by (appearances on the right side + 1)
for key, chrome_data in filtered_chromes.items():
    left_side = defaultdict(lambda: 0)
    right_side = defaultdict(lambda: 0)
    for rule in chrome_data['rules']:
        for item in rule.x:
            left_side[item.split('|')[0]] += 1
        for item in rule.y:
            right_side[item.split('|')[0]] += 1
    chrome_data['ratios'] = {
        column: left_side[column] / (right_side[column] + 1)
        for column in key
    }

# Largest raw ratio over all the chromes
maximum_ratio = 0
for chrome_data in filtered_chromes.values():
    for ratio in chrome_data['ratios'].values():
        if ratio > maximum_ratio:
            maximum_ratio = ratio

# Express every ratio relative to the largest one
for chrome_data in filtered_chromes.values():
    ratios = chrome_data['ratios']
    for column in ratios:
        ratios[column] = ratios[column] / maximum_ratio

## Exporting the data

In [None]:
import json
# One export entry per filtered chrome, with its rules as plain dicts
chromes_export = []
for key, chrome_data in filtered_chromes.items():
    chromes_export.append({
        'columns': list(key),
        'angles': chrome_data['angles'],
        'colors': chrome_data['colors'],
        'ratios': chrome_data['ratios'],
        'rules': [
            {
                'x': list(rule.x),
                'y': list(rule.y),
                'confidence': rule.confidence(dataset),
                'lift': rule.lift(dataset),
            }
            for rule in chrome_data['rules']
        ],
    })
with open('mushroom_data.json', 'w') as out_file:
    json.dump(chromes_export, out_file)