# Esercizio 2
Calcolare gli *shapley values* di una regola di associazione usando come metrica la *confidence*.

0. Carichiamo il dataset

In [45]:
# Loads dataset function
def load_dataset(selected_dataset_name:str, repeat : bool, verbose : bool = False):
    dataset=[]
    with open('datasets/'+selected_dataset_name+'.txt') as f:
        lines = f.read().splitlines()
        for l in lines:
            seen = set()
            curr_line = l.replace("\t"," ").split(" ")
            if not repeat:
                curr_line = [x for x in curr_line if not (x in seen or seen.add(x))]
            dataset.append(curr_line)
    print(f"Dataset loaded. {len(dataset)} entries found.")
    if(verbose):
        print("Prime 5 entries:")
        for entry in dataset[:5]:
            print(entry)
    return dataset

1. Funzioni di supporto: calcolo supporto e confidenza, oggetto "Shapley table" per calcolare i valori di Shapley

In [29]:
def compute_support(dataset, rule):
    count_rule = 0
    for item in dataset:
        if all(element in item for element in rule):
            count_rule+=1
    return count_rule

def compute_confidence(dataset:list, antecedent, consequent)->float:
    antecedent = [antecedent] if not isinstance(antecedent,list) else antecedent # solo perché a volte non è già una lista e non gli piace
    support_antecedent = compute_support(dataset,antecedent) 
    support_consequent = compute_support(dataset,antecedent+consequent)
    return support_consequent/support_antecedent


In [30]:
from itertools import permutations

class ShapleyTable:
    def __init__(self, dataset, antecedent, consequent, verbose=False):
        table = {} # this will be out final table
        rule_permutations = list(permutations(antecedent)) # All permutations
        self.antecedent = antecedent
        self.consequent = consequent

        # For each permutation of the rule
        for permutation in rule_permutations:

            if ( verbose):
                print(f"--Permutation {permutation}:")

            # Initialize a dict with the *row* of the table
            shapley_values_rows = {}

            # for each increasing portion of the permutation... (es. ABC --> A, AB, ABC)
            for i in range(0,len(permutation)):
                # ...Extract the portion of the permutation...
                to_compute = list(permutation[:i+1])
                # ... and compute the starting shapley value, aka the confidence of the biggest subset of the permutation (es. ABC --> ABC)
                curr_value = compute_confidence(dataset,to_compute,consequent) 

                if(verbose):
                    print(f"> computing {to_compute}: {to_compute}{curr_value:4.2}",end="")

                # Now substract all the smaller ones, in order (es. ABC --> subtract AB and A)
                for j in range(0,len(to_compute)-1):
                    curr_value-=compute_confidence(dataset,to_compute[:j+1],consequent)
                    
                    if(verbose):
                        print(f" - {to_compute[:j+1]}{compute_confidence(dataset,to_compute[:j+1],consequent):4.2}",end="")

                if (verbose):
                    print("")            

                shapley_values_rows[permutation[i]]= curr_value # assign computed value to column in current line

            table[permutation] = shapley_values_rows     # assign completed row to the permutation   

        self.table=table  
        
    def print(self):
        #header
        print("-----------------------"*len(self.antecedent))
        print("Permutation\t\t",end="")
        for item in sorted(self.antecedent):
            print(f"| \t {item}\t", end="")
        print("")  

        print("-----------------------"*len(self.antecedent))
        
        #body
        for permutation in self.table:
            print(f"{permutation}\t",end="")
            for item in sorted(self.antecedent):
                print(f"| {self.table[permutation][item]:10.2}\t",end="")
            print("")

        print("-----------------------"*len(self.antecedent))

        #avg
        print("Average\t\t\t",end="")
        for item in sorted(self.antecedent):
            average = []
            for permutation in self.table:
                average.append(self.table[permutation][item])
            print(f"|{sum(average) / float(len(average)):10.2}\t",end="")
        print("")
        print("-----------------------"*len(self.antecedent))

    def getWinner(self):
        averages = {}
        for item in sorted(self.antecedent):
            average = []
            for permutation in self.table:
                average.append(self.table[permutation][item])
            averages[item]=sum(average) / float(len(average))
        
        return(max(averages.items(), key=lambda x: x[1]))


3. Calcolo vero e proprio

In [31]:
# Selezione dei parametri
selected_dataset = "skating"
selected_rule_antecedent = ["41","1","29"]
selected_rule_consequent = ["17"]


In [46]:
# Calcolo vero e proprio
dataset = load_dataset(selected_dataset,False)

print(f"\n--Rule: {selected_rule_antecedent}-->{selected_rule_consequent}--")
print(f"Antecedent support: {compute_support(dataset,selected_rule_antecedent)}")
print(f"Antecedent U consequent support: {compute_support(dataset,selected_rule_antecedent+selected_rule_consequent)}")
print(f"Confidence: {compute_confidence(dataset,selected_rule_antecedent,selected_rule_consequent):4.3}.")


table = ShapleyTable(dataset,selected_rule_antecedent,selected_rule_consequent)
table.print()
print(f"The most influential item in the rule is {table.getWinner()[0]} with value {table.getWinner()[1]}")

Dataset loaded. 530 entries found.

--Rule: ['41', '1', '29']-->['17']--
Antecedent support: 22
Antecedent U consequent support: 19
Confidence: 0.864.
---------------------------------------------------------------------
Permutation		| 	 1	| 	 29	| 	 41	
---------------------------------------------------------------------
('41', '1', '29')	|        0.0	|      -0.86	|       0.86	
('41', '29', '1')	|      -0.86	|        0.0	|       0.86	
('1', '41', '29')	|       0.92	|      -0.92	|     -0.053	
('1', '29', '41')	|       0.92	|        0.0	|      -0.97	
('29', '41', '1')	|      -0.92	|       0.92	|     -0.053	
('29', '1', '41')	|   -0.00047	|       0.92	|      -0.97	
---------------------------------------------------------------------
Average			|    0.0087	|     0.009	|    -0.053	
---------------------------------------------------------------------
The most influential item in the rule is 29 with value 0.008969560269191509
