# Module 8 Practice 1 Answers - Association Rule Mining
We will practice skills related to association rule mining.

In [None]:
import pandas as pd
import numpy as np
import sys
!{sys.executable} -m pip install mlxtend
import mlxtend

# Show every column and never shorten cell contents when frames are rendered.
# None is the documented "unlimited" value for both options; 0 only requests
# terminal-width auto-detection, which pandas cannot do reliably in a notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Load the breast cancer data set
Load the bunch.  No need to place data and target into a dataframe, yet.

In [None]:
from sklearn import datasets as d

# Keep the raw Bunch around; later cells read bc.data, bc.target and
# bc.feature_names directly from it.
bc = d.load_breast_cancer()

# The bundled description documents the features and the target classes.
print(bc["DESCR"])


## Bin the features
All of the features are continuous (only the target is discrete), so bin each feature into 3 buckets using the k-means strategy.  Create a new data frame named `binned_data` that contains the binned features plus the target.  Set the type of all columns to int.

In [None]:
from sklearn import preprocessing

# Discretize every feature into 3 ordinal bins whose edges come from k-means.
discretizer = preprocessing.KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='kmeans',
)
kmeans_data = discretizer.fit_transform(bc.data)

# Append the target as the last column and label it alongside the features.
column_names = np.concatenate((bc.feature_names, ['target']))
binned_data = pd.DataFrame(
    np.column_stack((kmeans_data, bc.target)),
    columns=column_names,
    dtype=int,
)
display(binned_data.head())

## Prepare to find the frequent itemsets
We have to one-hot encode all of the columns (the binned features and the target).  Assign the one-hot encoded result to a new dataframe named `onehot`.

In [None]:
# One-hot encode every column (binned features and target alike), producing
# one '<column>_<value>' indicator column per distinct value.
# dtype=bool is requested explicitly: older pandas versions default to uint8
# here, and mlxtend's frequent-pattern functions expect boolean input (other
# dtypes are slower and emit a deprecation warning).
onehot = pd.get_dummies(binned_data, columns=binned_data.columns, dtype=bool)
display(onehot.head())

## Get the frequent itemsets
Display the frequent itemsets that contain at least 10 items.  Use a min_support of 0.4.
Warning! Using a very low min_support with a dataset that has many features can be very slow using the apriori method.  Using a lower support will cause the notebook to run for a long time.

This step will take a minute or so to run.

In [None]:
from mlxtend.frequent_patterns import apriori

# Mine every itemset that appears in at least 40% of the rows, keeping the
# one-hot column names (rather than column indices) as the items.
frequent_itemsets = apriori(onehot, use_colnames=True, min_support=0.4)

# Flag the itemsets made up of 10 or more items and show only those.
itemset_sizes = frequent_itemsets['itemsets'].map(len)
mask = itemset_sizes.ge(10).tolist()
frequent_itemsets[mask]

## Get the association rules
Using the frequent itemsets, get the association rules whose conviction is at least 10.  This step will take a minute or two.

In [None]:
from mlxtend.frequent_patterns import association_rules

# Keep only the rules whose conviction reaches the requested threshold.
conviction_filter = {"metric": "conviction", "min_threshold": 10}
rules = association_rules(frequent_itemsets, **conviction_filter)
display(rules)

## Find interesting rules
Find and count all of the rules where the target variable is the only consequent item. 

(Do not display them all, as there will be many rules)

In [None]:
# One-hot columns derived from the 'target' column are named 'target_<value>'.
# Matching on that prefix (instead of the substring 'target') avoids picking up
# a feature whose name merely contains the word.
target_names = [col for col in onehot.columns if col.startswith('target_')]

# Keep a rule only when its consequent is exactly one item and that item is one
# of the target indicator columns.
mask = [len(c) == 1 and c.issubset(target_names) for c in rules['consequents']]
target_rules = rules[mask]

# The task asks for a count; there can be many matching rules, so report how
# many there are and preview only the first few instead of the whole frame.
print(f"Rules with only the target as consequent: {len(target_rules)}")
display(target_rules.head())

## Use the rules for feature selection
Find all the features that appear in the antecedents of the rules whose consequent contains only the target.

In [None]:
# Merge the antecedents of every target-only rule into one set of items.
# Starting from an empty frozenset keeps this working when no rule matched:
# the unbound frozenset.union(*[]) call would raise TypeError in that case.
frozenset().union(*target_rules['antecedents'])
