# Module 8 Practice 1 - Association Rule Mining
We will practice skills related to association rule mining.

In [1]:
# Core libraries used by every later cell.
import pandas as pd
import numpy as np
import sys
# Install mlxtend through the pip of the interpreter that runs this kernel
# (sys.executable), so the import on the next line sees the installed package.
!{sys.executable} -m pip install mlxtend
import mlxtend

# pandas display options for the tables rendered below.
# NOTE(review): the exact meaning of 0 for these options depends on the pandas
# version. In the outputs captured in this notebook, long cell values (the
# itemsets) are printed in full, while very wide frames are still elided with "...".
pd.set_option('display.max_columns', 0)
pd.set_option('max_colwidth', 0)



## Load the breast cancer data set
Load the dataset as a scikit-learn `Bunch`.  There is no need to place the data and target into a DataFrame yet.

In [2]:
import sklearn.datasets as d

# Load the breast cancer dataset; `bc` is kept as returned (no DataFrame yet)
# because later cells read bc.data, bc.target and bc.feature_names from it.
bc = d.load_breast_cancer()

# Show the description text that ships with the dataset.
dataset_description = bc.DESCR
print(dataset_description)


.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

## Bin the features
All of the data is continuous except the target, so bin the features into 3 buckets using the kmeans strategy.  Create a new DataFrame named `binned_data` that contains the binned features plus the target.  Set the type of all columns to int.

In [3]:
from sklearn import preprocessing

# Discretize every continuous feature into 3 ordinal buckets (codes 0, 1, 2),
# with the bucket edges chosen by the 'kmeans' strategy.
discretizer = preprocessing.KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
kmeans_data = discretizer.fit_transform(bc.data)

# Append the target as one extra column next to the binned features, label the
# columns with the feature names plus 'target', and store everything as int.
binned_values = np.column_stack((kmeans_data, bc.target))
binned_columns = np.append(bc.feature_names, ['target'])
binned_data = pd.DataFrame(binned_values, columns=binned_columns, dtype=int)
display(binned_data.head())

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,2,0,2,1,2,2,2,2,2,2,1,0,1,1,0,1,1,1,1,1,2,0,2,1,2,2,2,2,2,2,0
1,2,1,2,1,0,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,2,1,2,1,0,0,1,2,0,1,0
2,2,1,2,1,2,1,1,2,1,0,1,0,1,1,0,1,0,1,1,0,2,1,2,1,1,1,1,2,1,1,0
3,0,1,0,0,2,2,2,1,2,2,0,0,0,0,1,2,1,1,2,1,0,1,0,0,2,2,2,2,2,2,0
4,2,0,2,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,0,0,2,0,2,1,1,0,1,1,0,0,0


## Prepare to find the frequent itemsets
We have to one-hot encode all of the features.  Assign the one-hot encoded data to a new DataFrame named `onehot`.

In [4]:
# One-hot encode every column (binned features and the target): each
# (column, bin) pair becomes its own indicator column named "<column>_<bin>",
# e.g. "mean radius_0" or "target_1".
# dtype=bool pins the indicators to True/False regardless of the pandas version;
# mlxtend's frequent-pattern functions expect boolean frames and warn about
# (and run slower on) other dtypes.
onehot = pd.get_dummies(binned_data, columns = binned_data.columns, dtype=bool)
display(onehot.head())

Unnamed: 0,mean radius_0,mean radius_1,mean radius_2,mean texture_0,mean texture_1,mean texture_2,mean perimeter_0,mean perimeter_1,mean perimeter_2,mean area_0,mean area_1,mean area_2,mean smoothness_0,mean smoothness_1,mean smoothness_2,mean compactness_0,mean compactness_1,mean compactness_2,mean concavity_0,mean concavity_1,mean concavity_2,mean concave points_0,mean concave points_1,mean concave points_2,mean symmetry_0,mean symmetry_1,mean symmetry_2,mean fractal dimension_0,mean fractal dimension_1,mean fractal dimension_2,radius error_0,radius error_1,radius error_2,texture error_0,texture error_1,texture error_2,perimeter error_0,perimeter error_1,perimeter error_2,area error_0,...,concave points error_1,concave points error_2,symmetry error_0,symmetry error_1,symmetry error_2,fractal dimension error_0,fractal dimension error_1,fractal dimension error_2,worst radius_0,worst radius_1,worst radius_2,worst texture_0,worst texture_1,worst texture_2,worst perimeter_0,worst perimeter_1,worst perimeter_2,worst area_0,worst area_1,worst area_2,worst smoothness_0,worst smoothness_1,worst smoothness_2,worst compactness_0,worst compactness_1,worst compactness_2,worst concavity_0,worst concavity_1,worst concavity_2,worst concave points_0,worst concave points_1,worst concave points_2,worst symmetry_0,worst symmetry_1,worst symmetry_2,worst fractal dimension_0,worst fractal dimension_1,worst fractal dimension_2,target_0,target_1
0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0
1,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,...,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0
2,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0
3,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0
4,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0


## Get the frequent itemsets
Display the frequent itemsets that contain at least 10 items.  Use a min_support of 0.4.
Warning! With a dataset that has many features, the apriori method becomes very slow as min_support decreases.  Using a support lower than 0.4 will cause the notebook to run for a long time.

This step will take a minute or so to run.

In [5]:
from mlxtend.frequent_patterns import apriori

# Mine the itemsets whose support is at least 0.4; use_colnames=True makes the
# itemsets list the one-hot column names (e.g. "mean area_0").
frequent_itemsets = apriori(onehot, min_support=0.4, use_colnames=True)

# Keep only the itemsets made up of 10 or more items and show them.
mask = [len(items) >= 10 for items in frequent_itemsets.itemsets.values]
frequent_itemsets[mask]

Unnamed: 0,support,itemsets
19206,0.400703,"(worst radius_0, radius error_0, area error_0, perimeter error_0, worst perimeter_0, worst area_0, target_1, mean perimeter_0, mean area_0, mean radius_0)"
19207,0.418278,"(worst radius_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, target_1, mean perimeter_0, mean area_0)"
19208,0.402460,"(mean compactness_0, compactness error_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, concavity error_0, mean concave points_0, worst area_0, mean area_0)"
19209,0.413005,"(mean compactness_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, concavity error_0, mean concave points_0, worst area_0, mean area_0, fractal dimension error_0)"
19210,0.411248,"(mean compactness_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, concavity error_0, mean concave points_0, worst area_0, worst compactness_0, mean area_0)"
...,...,...
19358,0.402460,"(worst radius_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, target_1, mean area_0, fractal dimension error_0)"
19359,0.402460,"(mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst area_0, target_1, worst compactness_0, mean area_0, worst concavity_0, fractal dimension error_0)"
19360,0.407733,"(worst radius_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, target_1, worst compactness_0, mean area_0)"
19361,0.402460,"(mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, target_1, worst compactness_0, mean area_0, worst concavity_0)"


## Get the association rules
Using the frequent itemsets, get the association rules whose conviction is at least 10.  This step will take a minute or two.

In [6]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the mined itemsets, keeping only those whose conviction
# reaches the threshold of 10.
rule_filter = {"metric": "conviction", "min_threshold": 10}
rules = association_rules(frequent_itemsets, **rule_filter)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(mean radius_0),(mean perimeter_0),0.437610,0.467487,0.434095,0.991968,2.121916,0.229518,66.297891
1,(mean radius_0),(mean area_0),0.437610,0.745167,0.437610,1.000000,1.341981,0.111517,inf
2,(mean radius_0),(area error_0),0.437610,0.827768,0.437610,1.000000,1.208068,0.075370,inf
3,(mean radius_0),(worst radius_0),0.437610,0.534271,0.430580,0.983936,1.841643,0.196778,28.991652
4,(mean radius_0),(worst perimeter_0),0.437610,0.548330,0.427065,0.975904,1.779773,0.187110,18.744288
...,...,...,...,...,...,...,...,...,...
282880,"(compactness error_0, perimeter error_0, concavity error_0, target_1, worst compactness_0, fractal dimension error_0)","(mean concavity_0, radius error_0, area error_0, worst area_0, mean area_0)",0.416520,0.550088,0.400703,0.962025,1.748858,0.171580,11.847686
282881,"(compactness error_0, perimeter error_0, target_1, worst compactness_0, mean area_0, fractal dimension error_0)","(mean concavity_0, radius error_0, area error_0, concavity error_0, worst area_0)",0.414763,0.514938,0.400703,0.966102,1.876150,0.187126,14.309315
282882,"(compactness error_0, concavity error_0, target_1, worst compactness_0, mean area_0, fractal dimension error_0)","(mean concavity_0, radius error_0, area error_0, perimeter error_0, worst area_0)",0.413005,0.557118,0.400703,0.970213,1.741486,0.170610,14.868190
282883,"(compactness error_0, worst area_0, target_1, worst compactness_0, mean area_0, fractal dimension error_0)","(mean concavity_0, radius error_0, area error_0, perimeter error_0, concavity error_0)",0.420035,0.530756,0.400703,0.953975,1.797390,0.177767,10.195399


## Find interesting rules
Find and count all of the rules where the target variable is the only consequent item. 

(Do not display them all, as there will be many rules)

In [7]:
# One-hot columns that come from the target. Match on the "target_" prefix
# rather than on a substring, so a feature whose name merely contains "target"
# could never be mistaken for the class.
target_names = [col for col in onehot.columns if col.startswith('target_')]

# A rule qualifies when its consequent holds exactly one item and that item is
# one of the target columns.
mask = [len(c) == 1 and c.issubset(target_names) for c in rules.consequents.values]
target_rules = rules[mask]

# Report the count explicitly, since the task is to find *and count* the rules.
# The frame itself is long, so its rendering is only a preview (the middle rows
# are elided with "..." in the output).
print(f"Rules whose only consequent is the target: {len(target_rules)}")
display(target_rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,(worst radius_0),(target_1),0.534271,0.627417,0.516696,0.967105,1.541409,0.181486,11.326538
38,(worst perimeter_0),(target_1),0.548330,0.627417,0.534271,0.974359,1.552970,0.190239,14.530756
42,(worst concave points_0),(target_1),0.467487,0.627417,0.453427,0.969925,1.545903,0.160118,12.388401
114,"(worst radius_0, mean radius_0)",(target_1),0.430580,0.627417,0.416520,0.967347,1.541794,0.146367,11.410369
120,"(worst perimeter_0, mean radius_0)",(target_1),0.427065,0.627417,0.414763,0.971193,1.547925,0.146815,12.933969
...,...,...,...,...,...,...,...,...,...
281344,"(worst radius_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, mean area_0, fractal dimension error_0)",(target_1),0.404218,0.627417,0.402460,0.995652,1.586908,0.148847,85.694200
281708,"(mean concavity_0, radius error_0, area error_0, mean concave points_0, perimeter error_0, worst area_0, worst compactness_0, mean area_0, worst concavity_0, fractal dimension error_0)",(target_1),0.409490,0.627417,0.402460,0.982833,1.566476,0.145539,21.702988
281982,"(worst radius_0, mean concavity_0, radius error_0, area error_0, perimeter error_0, mean concave points_0, worst perimeter_0, worst area_0, worst compactness_0, mean area_0)",(target_1),0.409490,0.627417,0.407733,0.995708,1.586997,0.150812,86.811951
282320,"(mean concavity_0, radius error_0, area error_0, mean concave points_0, perimeter error_0, worst perimeter_0, worst area_0, worst compactness_0, mean area_0, worst concavity_0)",(target_1),0.404218,0.627417,0.402460,0.995652,1.586908,0.148847,85.694200


## Use the rules for feature selection
Find all of the features that appear in the antecedent of a rule whose consequent contains only the target.

In [8]:
# Union of every antecedent itemset among the target-only rules, i.e. each
# "<feature>_<bin>" item that appears in at least one of those rules.
# Starting from an empty frozenset yields an empty result when there are no such
# rules; the unbound call frozenset.union(*[]) would raise TypeError instead.
frozenset().union(*target_rules['antecedents'])


frozenset({'area error_0',
           'compactness error_0',
           'concavity error_0',
           'fractal dimension error_0',
           'mean area_0',
           'mean compactness_0',
           'mean concave points_0',
           'mean concavity_0',
           'mean perimeter_0',
           'mean radius_0',
           'perimeter error_0',
           'radius error_0',
           'worst area_0',
           'worst compactness_0',
           'worst concave points_0',
           'worst concavity_0',
           'worst fractal dimension_0',
           'worst perimeter_0',
           'worst radius_0'})