In [2]:
import numpy as np 
import pandas as pd
import math
from mlxtend.frequent_patterns import apriori, association_rules 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import *

In [3]:
# Load the raw transactions. The file has no header row, so up to five item
# columns per transaction are named T1..T5 here.
csv_file = "h2Data.csv"
dataset = pd.read_csv(csv_file, names = ['T1','T2', "T3", "T4", "T5"])

# Convert the frame to a nested list so the mlxtend TransactionEncoder can
# encode it. Missing cells are mapped to None first, as before.
dataset = dataset.replace({np.nan: None})
datalist = dataset.values.tolist()

# The encoder needs each transaction to list only real items. pd.notna
# rejects both None and NaN, so the filter does not rely on the replace step
# above having converted every missing cell.
cleaned_data = [[item for item in row if pd.notna(item)] for row in datalist]

# Resulting data is cleaned_data: one list of purchased items per transaction.


In [4]:
# One-hot encode the transactions: the encoder learns the set of distinct
# items, then emits one boolean column per item and one row per transaction
# (True where the item appears in that transaction).
te = TransactionEncoder()
te.fit(cleaned_data)
te_ary = te.transform(cleaned_data)

df = pd.DataFrame(data=te_ary, columns=te.columns_)
print(df.head(4))

    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0    10.0  ...  \
0  False  False  False  False   True   True  False  False  False  False  ...   
1  False  False  False  False  False   True  False  False  False  False  ...   
2  False  False  False  False  False  False  False  False   True  False  ...   
3  False  False  False  False  False  False   True  False  False  False  ...   

    41.0   42.0   43.0   44.0   45.0   46.0   47.0   48.0   49.0   50.0  
0  False  False  False  False  False   True  False  False  False  False  
1  False  False  False  False  False  False  False  False  False  False  
2  False  False  False  False  False  False  False  False  False  False  
3   True  False  False  False  False  False  False  False  False  False  

[4 rows x 50 columns]


In [5]:
# Choose the support / confidence / lift thresholds, then mine the frequent
# itemsets that meet the support threshold.

# Starting point based on the lab 1 advice; it is printed for reference only.
basket_size = 5
item_count = 50
support_threshold = 1 - math.exp(-basket_size / item_count)
print(f"calculated support threshold: {support_threshold}")

# The value actually used below replaces the calculated one.
support_threshold = 0.015

# Minimum confidence is defined as a fixed proportion of the support threshold.
support_confidence_proportion = 0.5
confidence_threshold = support_threshold * support_confidence_proportion
lift_threshold = 1.1

# Frequent itemsets whose support is at least support_threshold; itemsets are
# labelled with the column names of df rather than column indices.
frequent_itemsets = apriori(
    df,
    min_support=support_threshold,
    use_colnames=True,
)
print(frequent_itemsets.head(5))
print(frequent_itemsets.shape)

calculated support threshold: 0.09516258196404048
   support itemsets
0    0.094    (1.0)
1    0.107    (2.0)
2    0.110    (3.0)
3    0.076    (4.0)
4    0.103    (5.0)
(88, 2)


In [6]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least confidence_threshold.
confidence_df = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=confidence_threshold,
)

# Preview the first rules, then report (number of rules, number of columns).
for preview in (confidence_df.head(5), confidence_df.shape):
    print(preview)


  antecedents consequents  antecedent support  consequent support  support  \
0       (1.0)      (45.0)               0.094               0.107    0.017   
1      (45.0)       (1.0)               0.107               0.094    0.017   
2       (8.0)       (2.0)               0.107               0.107    0.018   
3       (2.0)       (8.0)               0.107               0.107    0.018   
4      (33.0)       (2.0)               0.107               0.107    0.017   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.180851  1.690197  0.006942    1.090156       0.450721  
1    0.158879  1.690197  0.006942    1.077133       0.457282  
2    0.168224  1.572190  0.006551    1.073607       0.407553  
3    0.168224  1.572190  0.006551    1.073607       0.407553  
4    0.158879  1.484846  0.005551    1.061678       0.365654  
(76, 10)


In [7]:
# Keep only the confidence-filtered rules whose lift reaches lift_threshold,
# and trim the result down to the columns discussed in the write-up.
accessory_columns = ["conviction", "zhangs_metric", "leverage", "antecedent support", "consequent support"]
meets_lift = confidence_df["lift"] >= lift_threshold

lift_confidence_df = confidence_df.loc[meets_lift].drop(columns=accessory_columns)

print(lift_confidence_df.head(5))
print(lift_confidence_df.shape)

  antecedents consequents  support  confidence      lift
0       (1.0)      (45.0)    0.017    0.180851  1.690197
1      (45.0)       (1.0)    0.017    0.158879  1.690197
2       (8.0)       (2.0)    0.018    0.168224  1.572190
3       (2.0)       (8.0)    0.018    0.168224  1.572190
4      (33.0)       (2.0)    0.017    0.158879  1.484846
(74, 5)


In [8]:
# Smallest confidence among the rules that passed both the confidence and lift
# filters; displayed so it can be compared with confidence_threshold.
lift_confidence_df["confidence"].min()

0.12195121951219512

Q1. By trial and error, what are the values of the minimum support, minimum confidence, and minimum lift for which the number of interesting association rules is between 1 and 2 times the number of items?

A: With support_threshold = 0.015, confidence_threshold = 0.0075 (half of the support threshold), and lift_threshold = 1.1, we are left with 74 association rules, which falls between 1 and 2 times the number of items (i.e., between 50 and 100).

Q2. What relationship between these variables keeps the number of interesting association rules in this range?
I found that the lowest confidence of the 74 association rules was 0.1219, as shown above. This is significantly greater than the current confidence threshold of 0.0075. 

Support seems to be the main driver of keeping the number of association rules in this range. I say this because when the support threshold is set to 0.015, only 88 frequent itemsets are derived, before the confidence and lift thresholds are even applied. Applying the confidence threshold (50% of the support threshold) to those itemsets yields 76 rules, and applying a lift threshold of 1.1 reduces that to 74. Manipulating the support threshold determines how many itemsets are available to be filtered by confidence and lift, and ultimately drives the number of resulting association rules.


In [9]:
#Q3:

# Re-mine the association rules with the Q3 thresholds (min support, min
# confidence, min lift) and report each rule's absolute transaction count.

support_threshold = 0.015
confidence_threshold = 0.01
lift_threshold = 1

# Generate frequent itemsets that meet the support threshold
frequent_itemsets_2 = apriori(df, min_support=support_threshold, use_colnames=True)

# Apply confidence threshold
confidence_df_2 = association_rules(frequent_itemsets_2, metric="confidence", min_threshold=confidence_threshold)

# Apply lift threshold. The mask must index the same frame it was built from:
# filtering the earlier confidence_df with a mask from confidence_df_2 only
# gives the right rows when both frames happen to contain identical rules.
lift_confidence_df_2 = confidence_df_2[confidence_df_2['lift'] >= lift_threshold]
# Drop accessory columns for clarity
lift_confidence_df_2 = lift_confidence_df_2.drop(columns=["conviction", "zhangs_metric", "leverage", "antecedent support", "consequent support"])

# Adding count column: support is the fraction of transactions containing the
# rule's items, so multiplying by the number of encoded transactions (rows of
# df) recovers the absolute count. Rounding removes floating-point noise
# before the value is stored as an integer.
transactions = len(df)
lift_confidence_df_2["count"] = (lift_confidence_df_2["support"] * transactions).round().astype(int)
# Print result, most frequent rules first
lift_confidence_df_2 = lift_confidence_df_2.sort_values(by=["count"], ascending=False)
print(lift_confidence_df_2)



   antecedents consequents  support  confidence      lift  count
34       (9.0)      (37.0)    0.019    0.166667  1.355014   19.0
35      (37.0)       (9.0)    0.019    0.154472  1.355014   19.0
2        (8.0)       (2.0)    0.018    0.168224  1.572190   18.0
3        (2.0)       (8.0)    0.018    0.168224  1.572190   18.0
46      (35.0)      (14.0)    0.018    0.151261  1.609154   18.0
..         ...         ...      ...         ...       ...    ...
48      (41.0)      (20.0)    0.015    0.138889  1.286008   15.0
49      (20.0)      (41.0)    0.015    0.138889  1.286008   15.0
50      (43.0)      (20.0)    0.015    0.156250  1.446759   15.0
51      (20.0)      (43.0)    0.015    0.138889  1.446759   15.0
75      (43.0)      (48.0)    0.015    0.156250  1.717033   15.0

[76 rows x 6 columns]


In [13]:
#Q4:
# Rank the Q3 rules from highest to lowest lift and keep the first 50.
rules_by_lift = lift_confidence_df_2.sort_values(by="lift", ascending=False)
top_50_rules = rules_by_lift.iloc[:50]

print(top_50_rules.shape)
print(top_50_rules)


(50, 6)
   antecedents consequents  support  confidence      lift  count
73      (42.0)      (41.0)    0.017    0.186813  1.729752   17.0
72      (41.0)      (42.0)    0.017    0.157407  1.729752   17.0
75      (43.0)      (48.0)    0.015    0.156250  1.717033   15.0
74      (48.0)      (43.0)    0.015    0.164835  1.717033   15.0
20      (18.0)       (6.0)    0.015    0.172414  1.690331   15.0
21       (6.0)      (18.0)    0.015    0.147059  1.690331   15.0
1       (45.0)       (1.0)    0.017    0.158879  1.690197   17.0
0        (1.0)      (45.0)    0.017    0.180851  1.690197   17.0
44      (11.0)      (23.0)    0.015    0.159574  1.628311   15.0
45      (23.0)      (11.0)    0.015    0.153061  1.628311   15.0
47      (14.0)      (35.0)    0.018    0.191489  1.609154   18.0
46      (35.0)      (14.0)    0.018    0.151261  1.609154   18.0
10       (3.0)      (15.0)    0.015    0.136364  1.585624   15.0
11      (15.0)       (3.0)    0.015    0.174419  1.585624   15.0
3        (2.0)   