##Mount to Google Drive Account

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import os
#from efficient_apriori import apriori # for association analysis
#!pip install apyori

##Import association rules and apriori packages from mlxtend library

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

##Import the uploaded Dataset from Drive.
Dataset source: https://archive-beta.ics.uci.edu/ml/datasets/breast+cancer

In [None]:
path = "/content/drive/MyDrive/breast-cancer.data"
data = pd.read_csv(path, header=None)

#1. Data Preprocessing

##The values entered the dataset are categorical but have different types such as int and string. Therefore, first step is to change all the column types to 'category' as we will perform hot encoding later. 

In [None]:
# define Dataset columns and change their types to categories (objects)
data.columns = ['Class', 'Age', 'Menopause', 'Tumor_Size', 'Inv_Nodes', 'Node_Caps', 'Malignancy_Degree', 'Breast', 'Breast_Quad', 'Irradiat']
data['Malignancy_Degree'] = data['Malignancy_Degree'].astype(str)

data['Class'] = pd.Categorical(data.Class)
data['Age'] = pd.Categorical(data.Age)
data['Menopause'] = pd.Categorical(data.Menopause)
data['Tumor_Size'] = pd.Categorical(data.Tumor_Size)
data['Inv_Nodes'] = pd.Categorical(data.Inv_Nodes)
data['Node_Caps'] = pd.Categorical(data.Node_Caps)
data['Malignancy_Degree'] = pd.Categorical(data.Malignancy_Degree)
data['Breast'] = pd.Categorical(data.Breast)
data['Breast_Quad'] = pd.Categorical(data.Breast_Quad)
data['Irradiat'] = pd.Categorical(data.Irradiat)


df = data

# change the naming of the values in the radiation column to be more understandable
df["Irradiat"].replace({"yes": "Radiation", "no": "No_Radiation"}, inplace=True)

In [None]:
df

Unnamed: 0,Class,Age,Menopause,Tumor_Size,Inv_Nodes,Node_Caps,Malignancy_Degree,Breast,Breast_Quad,Irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,No_Radiation
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,No_Radiation
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,No_Radiation
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,No_Radiation
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,No_Radiation
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,No_Radiation
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,Radiation
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,No_Radiation
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,No_Radiation


In [None]:
df.dtypes

Class                category
Age                  category
Menopause            category
Tumor_Size           category
Inv_Nodes            category
Node_Caps            category
Malignancy_Degree    category
Breast               category
Breast_Quad          category
Irradiat               object
dtype: object

In [None]:
df.shape

(286, 10)

In [None]:
# create a list of lists (itemsets)
transactions = []
for i in range(0, 286):
 transactions.append([str(df.values[i,j]) for j in range(0,10)])

# disaplay list
transactions

[['no-recurrence-events',
  '30-39',
  'premeno',
  '30-34',
  '0-2',
  'no',
  '3',
  'left',
  'left_low',
  'No_Radiation'],
 ['no-recurrence-events',
  '40-49',
  'premeno',
  '20-24',
  '0-2',
  'no',
  '2',
  'right',
  'right_up',
  'No_Radiation'],
 ['no-recurrence-events',
  '40-49',
  'premeno',
  '20-24',
  '0-2',
  'no',
  '2',
  'left',
  'left_low',
  'No_Radiation'],
 ['no-recurrence-events',
  '60-69',
  'ge40',
  '15-19',
  '0-2',
  'no',
  '2',
  'right',
  'left_up',
  'No_Radiation'],
 ['no-recurrence-events',
  '40-49',
  'premeno',
  '0-4',
  '0-2',
  'no',
  '2',
  'right',
  'right_low',
  'No_Radiation'],
 ['no-recurrence-events',
  '60-69',
  'ge40',
  '15-19',
  '0-2',
  'no',
  '2',
  'left',
  'left_low',
  'No_Radiation'],
 ['no-recurrence-events',
  '50-59',
  'premeno',
  '25-29',
  '0-2',
  'no',
  '2',
  'left',
  'left_low',
  'No_Radiation'],
 ['no-recurrence-events',
  '60-69',
  'ge40',
  '20-24',
  '0-2',
  'no',
  '1',
  'left',
  'left_low',
  '

###In this step, the coloums were modified to include the categorical features of the records within the dataset through hot encoding.

In [None]:
# perform hot encoding on all values in the dataset
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,0-2,0-4,1,10-14,12-14,15-17,15-19,2,20-24,20-29,...,left_up,lt40,no,no-recurrence-events,premeno,recurrence-events,right,right_low,right_up,yes
0,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,True,False,False,False,False,False
1,True,False,False,False,False,False,False,True,True,False,...,False,False,True,True,True,False,True,False,True,False
2,True,False,False,False,False,False,False,True,True,False,...,False,False,True,True,True,False,False,False,False,False
3,True,False,False,False,False,False,True,True,False,False,...,True,False,True,True,False,False,True,False,False,False
4,True,True,False,False,False,False,False,True,False,False,...,False,False,True,True,True,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,True,False,False,False,False,False,False,True,False,False,...,True,False,True,False,True,True,False,False,False,False
282,True,False,False,False,False,False,False,False,True,False,...,True,False,True,False,True,True,False,False,False,False
283,True,False,True,False,False,False,False,False,True,False,...,True,False,True,False,False,True,True,False,False,False
284,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False




In [None]:
df.columns

Index(['0-2', '0-4', '1', '10-14', '12-14', '15-17', '15-19', '2', '20-24',
       '20-29', '24-26', '25-29', '3', '3-5', '30-34', '30-39', '35-39',
       '40-44', '40-49', '45-49', '5-9', '50-54', '50-59', '6-8', '60-69',
       '70-79', '9-11', '?', 'No_Radiation', 'Radiation', 'central', 'ge40',
       'left', 'left_low', 'left_up', 'lt40', 'no', 'no-recurrence-events',
       'premeno', 'recurrence-events', 'right', 'right_low', 'right_up',
       'yes'],
      dtype='object')

In [None]:

# rename columns to include decsription of values
df.rename(columns = {'0-2':'inv_nodes:0-2', '3-5':'inv_nodes:3-5', '6-8':'inv_nodes:6-8', '9-11':'inv_nodes:9-11','12-14':'inv_nodes:12-14', '15-17':'inv_nodes:15-17', '18-20':'inv_nodes:18-20', '21-23':'inv_nodes:21-23','24-26':'inv_nodes:24-26', 
                     '0-4':'tumor_size:0-4', '10-14':'tumor_size:10-14', '15-19':'tumor_size:15-19', '20-24':'tumor_size:20-24', '25-29':'tumor_size:25-29', '30-34':'tumor_size:30-34', '35-39':'tumor_size:35-39', '40-44':'tumor_size:40-44', '45-49':'tumor_size:45-49', '50-54':'tumor_size:50-54',
                     '20-29':'age:20-29', '30-39':'age:30-39', '40-49':'age:40-49', '50-59':'age:50-59', '60-69':'age:60-69', '70-79':'age:70-79',
                     'left':'breast:left', 'right':'breast:right',
                     'yes':'node_caps:yes', 'no':'node_caps:no',
                     '1':'deg_malig:1', '2': 'deg_malig:2', '3':'deg_malig:3',
                     'left_up':'breast_quad:left_up', 'left_low':'breast_quad:left_low', 'right_low':'breast_quad:right_low', 'central':'breast_quad:central',
                     'lt40':'menopause:lt40', 'ge40':'menopause:ge40', 'premeno':'menopause:premeno'}, inplace = True)


#remove missing values
df.drop('?', axis=1, inplace=True)

In [None]:
df

Unnamed: 0,inv_nodes:0-2,tumor_size:0-4,deg_malig:1,tumor_size:10-14,inv_nodes:12-14,inv_nodes:15-17,tumor_size:15-19,deg_malig:2,tumor_size:20-24,age:20-29,...,breast_quad:left_up,menopause:lt40,node_caps:no,no-recurrence-events,menopause:premeno,recurrence-events,breast:right,breast_quad:right_low,right_up,node_caps:yes
0,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,True,False,False,False,False,False
1,True,False,False,False,False,False,False,True,True,False,...,False,False,True,True,True,False,True,False,True,False
2,True,False,False,False,False,False,False,True,True,False,...,False,False,True,True,True,False,False,False,False,False
3,True,False,False,False,False,False,True,True,False,False,...,True,False,True,True,False,False,True,False,False,False
4,True,True,False,False,False,False,False,True,False,False,...,False,False,True,True,True,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,True,False,False,False,False,False,False,True,False,False,...,True,False,True,False,True,True,False,False,False,False
282,True,False,False,False,False,False,False,False,True,False,...,True,False,True,False,True,True,False,False,False,False
283,True,False,True,False,False,False,False,False,True,False,...,True,False,True,False,False,True,True,False,False,False
284,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False


#2. Apriori Implementation

### 1. This is a general implementation of the association rules.

In [None]:
#Apriori min support
min_support = 0.05

#Max lenght of apriori itemsets
max_len = 3

frequent_items = apriori(df, use_colnames=True, min_support=min_support, max_len=max_len + 1)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
rules['antecedents'] = rules['antecedents'].astype('string')
rules['consequents'] = rules['consequents'].astype('string')


print(frequent_items.head(10))
print(rules.dtypes)

# print first 10 rules
rules.head(10).sort_values(by='confidence', ascending=False)

    support            itemsets
0  0.744755     (inv_nodes:0-2)
1  0.248252       (deg_malig:1)
2  0.097902  (tumor_size:10-14)
3  0.104895  (tumor_size:15-19)
4  0.454545       (deg_malig:2)
5  0.174825  (tumor_size:20-24)
6  0.188811  (tumor_size:25-29)
7  0.297203       (deg_malig:3)
8  0.125874     (inv_nodes:3-5)
9  0.209790  (tumor_size:30-34)
antecedents            string
consequents            string
antecedent support    float64
consequent support    float64
support               float64
confidence            float64
lift                  float64
leverage              float64
conviction            float64
dtype: object


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,frozenset({'deg_malig:1'}),frozenset({'inv_nodes:0-2'}),0.248252,0.744755,0.234266,0.943662,1.267077,0.049379,4.530594
2,frozenset({'tumor_size:10-14'}),frozenset({'inv_nodes:0-2'}),0.097902,0.744755,0.090909,0.928571,1.246814,0.017996,3.573427
4,frozenset({'tumor_size:15-19'}),frozenset({'inv_nodes:0-2'}),0.104895,0.744755,0.090909,0.866667,1.163693,0.012788,1.914336
8,frozenset({'age:40-49'}),frozenset({'inv_nodes:0-2'}),0.314685,0.744755,0.237762,0.755556,1.014502,0.003399,1.044183
6,frozenset({'deg_malig:2'}),frozenset({'inv_nodes:0-2'}),0.454545,0.744755,0.342657,0.753846,1.012207,0.004132,1.036932
7,frozenset({'inv_nodes:0-2'}),frozenset({'deg_malig:2'}),0.744755,0.454545,0.342657,0.460094,1.012207,0.004132,1.010277
9,frozenset({'inv_nodes:0-2'}),frozenset({'age:40-49'}),0.744755,0.314685,0.237762,0.319249,1.014502,0.003399,1.006704
1,frozenset({'inv_nodes:0-2'}),frozenset({'deg_malig:1'}),0.744755,0.248252,0.234266,0.314554,1.267077,0.049379,1.096729
3,frozenset({'inv_nodes:0-2'}),frozenset({'tumor_size:10-14'}),0.744755,0.097902,0.090909,0.122066,1.246814,0.017996,1.027523
5,frozenset({'inv_nodes:0-2'}),frozenset({'tumor_size:15-19'}),0.744755,0.104895,0.090909,0.122066,1.163693,0.012788,1.019558


In [None]:
df['recurrence-events'].value_counts()

False    201
True      85
Name: recurrence-events, dtype: int64

### 2. This is a filtered implementation of the association rules as the main target is to identify the pattern of frequent cancer characteristics in patients with recurrent- and non-recurrent incidents.

In [None]:
'''
Filtering only consequents with recurrence-events
'''

#Apriori min support
min_support = 0.1

#Max lenght of apriori itemsets
max_len = 3

frequent_items = apriori(df, use_colnames=True, min_support=min_support, max_len=max_len + 1)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)

target = '{\'recurrence-events\'}'

results_filter = rules[rules['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)

results_filter.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
141,(node_caps:yes),(recurrence-events),0.195804,0.297203,0.108392,0.553571,1.862605,0.050198,1.574266
61,(deg_malig:3),(recurrence-events),0.297203,0.297203,0.157343,0.529412,1.781315,0.069013,1.493444
102,(Radiation),(recurrence-events),0.237762,0.297203,0.108392,0.455882,1.53391,0.037728,1.291627
113,(breast:left),(recurrence-events),0.531469,0.297203,0.171329,0.322368,1.084675,0.013375,1.037138
135,(menopause:premeno),(recurrence-events),0.524476,0.297203,0.167832,0.32,1.076706,0.011957,1.033525
119,(breast_quad:left_low),(recurrence-events),0.384615,0.297203,0.122378,0.318182,1.070588,0.008069,1.030769
973,"(No_Radiation, menopause:premeno)",(recurrence-events),0.388112,0.297203,0.115385,0.297297,1.000318,3.7e-05,1.000134


In [None]:
'''
Filtering only consequents no-recurrence-events
'''

#Apriori min support
min_support = 0.1

#Max lenght of apriori itemsets
max_len = 42

frequent_items = apriori(df, use_colnames=True, min_support=min_support, max_len=max_len + 1)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)

target = '{\'no-recurrence-events\'}'

results_filter = rules[rules['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)

results_filter.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5522,"(node_caps:no, No_Radiation, deg_malig:1, meno...",(no-recurrence-events),0.104895,0.702797,0.101399,0.966667,1.375456,0.027679,8.916084
6006,"(No_Radiation, menopause:ge40, inv_nodes:0-2, ...",(no-recurrence-events),0.104895,0.702797,0.101399,0.966667,1.375456,0.027679,8.916084
3583,"(inv_nodes:0-2, No_Radiation, deg_malig:1, men...",(no-recurrence-events),0.104895,0.702797,0.101399,0.966667,1.375456,0.027679,8.916084
2484,"(No_Radiation, deg_malig:1, menopause:ge40)",(no-recurrence-events),0.104895,0.702797,0.101399,0.966667,1.375456,0.027679,8.916084
6069,"(No_Radiation, inv_nodes:0-2, deg_malig:1, bre...",(no-recurrence-events),0.111888,0.702797,0.104895,0.9375,1.333955,0.02626,4.755245
5553,"(breast:left, No_Radiation, deg_malig:1, node_...",(no-recurrence-events),0.111888,0.702797,0.104895,0.9375,1.333955,0.02626,4.755245
1185,"(inv_nodes:0-2, deg_malig:1, menopause:ge40)",(no-recurrence-events),0.108392,0.702797,0.101399,0.935484,1.331087,0.025221,4.606643
3760,"(inv_nodes:0-2, node_caps:no, deg_malig:1, men...",(no-recurrence-events),0.108392,0.702797,0.101399,0.935484,1.331087,0.025221,4.606643
2564,"(node_caps:no, deg_malig:1, menopause:ge40)",(no-recurrence-events),0.108392,0.702797,0.101399,0.935484,1.331087,0.025221,4.606643
519,"(deg_malig:1, menopause:ge40)",(no-recurrence-events),0.118881,0.702797,0.108392,0.911765,1.297337,0.024842,3.368298


##Sources:

https://gist.github.com/eduardoftdo/e3d2b7ca4a06d8d86b144482d0aed5a1

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/