# Package Imports

In [1]:
import pandas as pd
import numpy as np
import itertools

# Data Import

In [2]:
data = pd.read_csv('data.csv')
data

Unnamed: 0,car,apt,villa,poor,avg,rich
0,False,True,False,True,False,False
1,True,True,False,False,True,False
2,True,False,True,False,False,True
3,True,False,True,False,False,True
4,True,True,False,False,True,False


# Apriori
## Step 0: Parameter Value Selection

In [3]:
min_sup = 2
min_conf = 1.0
frequent_itemsets = pd.DataFrame(columns=data.columns)
rules = pd.DataFrame(columns=['Premisis', 'Conclusion'])

## Step 1: Get frequent 1-itemsets

In [4]:
def support(data, itemset):
    # Get columns where itemset has True values
    trues = []
    for k in range(itemset.shape[1]):
        if (itemset.loc[0][k] == True):
            trues.append(k)

    # Count number of rows in data that have trues in the same locations as itemset
    count = 0
    for i in range(data.shape[0]):
        in_row = True
        # Check each value in data is true where trues are
        for t in trues:
            if not (data.loc[i][t] == True):
                in_row = False
        if in_row:
            count += 1

    return count

In [5]:
# Create 1-itemsets, then check if frequent
c_1 = pd.DataFrame(columns=data.columns)
for i in range(data.shape[1]):
    # Create a single 1-itemset
    new_row = []
    for j in range(data.shape[1]):
        if j == i:
            new_row.append([True])
        else:
            new_row.append([False])
    # Add 1-itemset to collection of 1-itemsets
    new_row_df = pd.DataFrame(data=np.transpose(new_row), columns=data.columns)
    c_1 = c_1.append(new_row_df, ignore_index=True)

In [6]:
# Now find frequent 1-itemsets
l_1 = pd.DataFrame(columns=data.columns)

for k in range(c_1.shape[0]):
    temp = pd.DataFrame(columns=data.columns)
    temp = temp.append(c_1.loc[k], ignore_index=True)
    if support(data, temp) >= min_sup:
        l_1 = l_1.append(temp, ignore_index=True)
frequent_itemsets = frequent_itemsets.append(l_1, ignore_index=True)

In [7]:
frequent_itemsets

Unnamed: 0,car,apt,villa,poor,avg,rich
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,True,False
4,False,False,False,False,False,True


## Step 2: Loop finding more frequent itemsets until cannot find more

In [8]:
def merge_rows(row1, row2):
    # Return a row with True if either row1 or row2 is True
    merged = pd.DataFrame(columns=row1.columns)
    row_to_add = []
    # Loop through columns, if any is True, set merged to True
    for i in range(row1.shape[1]):
        if (row1.loc[0][i] == True or row2.loc[0][i] == True):
            row_to_add.append([True])
        else:
            row_to_add.append([False])
    
    return pd.DataFrame(data=np.transpose(row_to_add), columns=row1.columns)

In [9]:
def supersets(itemsets):
    supersets = pd.DataFrame(columns=itemsets.columns)
    for i in range(itemsets.shape[0]):
        for j in range(i+1, itemsets.shape[0]):
            row1 = pd.DataFrame(columns=itemsets.columns)
            row1 = row1.append(itemsets.loc[i], ignore_index=True)
            row2 = pd.DataFrame(columns=itemsets.columns)
            row2 = row2.append(itemsets.loc[j], ignore_index=True)
            supersets = supersets.append(merge_rows(row1, row2), ignore_index=True)
    return supersets

In [10]:
def reduce(data, itemsets, min_sup):
    # Only keep frequent itemsets, and no duplicates
    reduced = pd.DataFrame(columns=itemsets.columns)
    for i in range(itemsets.shape[0]):
        temp = pd.DataFrame(columns=itemsets.columns)
        temp = temp.append(itemsets.loc[i], ignore_index=True)
        # Select only frequent itemsets
        if (support(data, temp) >= min_sup):
            # Select itemsets that are not already in reduced
            if (support(reduced, temp) == 0):
                reduced = reduced.append(temp, ignore_index=True)
    return reduced

In [11]:
l_k = l_1.copy()
while(not l_k.empty):
    c_k = supersets(l_k)
    l_k = reduce(data, c_k, min_sup)
    frequent_itemsets = frequent_itemsets.append(l_k, ignore_index=True)

In [12]:
frequent_itemsets

Unnamed: 0,car,apt,villa,poor,avg,rich
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,True,False
4,False,False,False,False,False,True
5,True,True,False,False,False,False
6,True,False,True,False,False,False
7,True,False,False,False,True,False
8,True,False,False,False,False,True
9,False,True,False,False,True,False


## Step 3: Generate Strong Association Rules

In [13]:
def size(itemset):
    count = 0
    for i in range(itemset.shape[1]):
        if itemset.loc[0][i] == True:
            count += 1
    return count

In [14]:
def itemset_true_at(true_locs, columns):
    row = []
    for i in range(len(columns)):
        if i in true_locs:
            row.append([True])
        else:
            row.append([False])
    return pd.DataFrame(data=np.transpose(row), columns=columns)

In [15]:
def subsets(itemsets):
    subsets = pd.DataFrame(columns=itemsets.columns)
    for i in range(itemsets.shape[0]):
        current_itemset = pd.DataFrame(columns=itemsets.columns)
        current_itemset = current_itemset.append(itemsets.loc[i], ignore_index=True)
        # Get columns where itemset has True values
        current_trues = []
        for k in range(current_itemset.shape[1]):
            if (current_itemset.loc[0][k] == True):
                current_trues.append(k)
        
        # Find all combos of True columns
        all_combos = []
        for j in range(1, len(current_trues) + 1):
            all_combos.append(list(itertools.combinations(current_trues, j)))
            
        # Create all subsets based on all_combos
        for l in range(len(all_combos)):
            for m in range(len(all_combos[l])):
                subsets = subsets.append(itemset_true_at(all_combos[l][m], itemsets.columns))

    return subsets

In [16]:
tester = pd.DataFrame([(False, True, False, True, True, False)], columns=data.columns)
tester
# TODO: remove

Unnamed: 0,car,apt,villa,poor,avg,rich
0,False,True,False,True,True,False


In [322]:
def create_association_rule(setI, setS):
    rule = pd.DataFrame(columns=['Left', 'Right'])
    left = []
    right = []
    for i in range(setS.shape[1]):
        # Accumulate lefthand side
        if setS.loc[0][i] == True:
            left.append(setS.columns[i])
        # Accumulate righthand side (I - S)
        if setI.loc[0][i] == True and setS.loc[0][i] == False:
            right.append(setI.columns[i])
    rule = rule.append({'Left': left, 'Right': right}, ignore_index=True)
    return rule

In [32]:
strong_association_rules = pd.DataFrame(columns=['Left', 'Right'])

for i in range(frequent_itemsets.shape[0]):
#for i in range(3):
    current_itemset = pd.DataFrame(columns=frequent_itemsets.columns)
    current_itemset = current_itemset.append(frequent_itemsets.loc[i], ignore_index=True)
    sup_I = support(data=data, itemset=current_itemset)

    if (size(current_itemset) > 1):
        sub_sets = subsets(current_itemset)

        print(sub_sets) # TODO: remove

        for j in range(sub_sets.shape[0]):
            current_subset = pd.DataFrame(columns=sub_sets.columns)
            current_subset = current_subset.append(sub_sets.loc[j], ignore_index=True)
            print('current subset')
            print(current_subset)
            sup_S = support(data=data, itemset=current_subset)
            if (sup_I / sup_S >= min_conf):
                strong_association_rules = strong_association_rules.append(create_association_rule(current_itemset, current_subset), ignore_index=True)

     car    apt  villa   poor    avg   rich
0   True  False  False  False  False  False
0  False   True  False  False  False  False
0   True   True  False  False  False  False
current subset
     car    apt  villa   poor    avg   rich
0   True  False  False  False  False  False
1  False   True  False  False  False  False
2   True   True  False  False  False  False


KeyError: 1