# DA lab 6주차 연관규칙분석: mlxtend 라이브러리의 활용

## Association Rule Mining

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

### Contents
* TransactionEncoder 함수
* One-hot Array 변환
* Apriori 알고리즘 구현
* 빈발집합 도출
* 연관규칙 분석

### Used Library
* mlxtend
* pandas

### 예제 데이터셋

In [2]:
dataset=[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

### TransactionEncoder 함수: 트랜잭션 데이터를 one-hot array로 바꿔줌

In [3]:
te=TransactionEncoder()
te_ary=te.fit(dataset).transform(dataset)

### Boolean 타입으로 encode된 one-hot array

In [4]:
te_ary

array([[False, False, False,  True, False,  True,  True,  True,  True,
        False,  True],
       [False, False,  True,  True, False,  True, False,  True,  True,
        False,  True],
       [ True, False, False,  True, False,  True,  True, False, False,
        False, False],
       [False,  True, False, False, False,  True,  True, False, False,
         True,  True],
       [False,  True, False,  True,  True,  True, False, False,  True,
        False, False]], dtype=bool)

### Int타입으로 encode된 one-hot array

In [5]:
te_ary.astype("int")

array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
       [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
       [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

### one-hot array의 칼럼은 트랜잭션의 아이템들

In [6]:
te.columns_

['Apple',
 'Corn',
 'Dill',
 'Eggs',
 'Ice cream',
 'Kidney Beans',
 'Milk',
 'Nutmeg',
 'Onion',
 'Unicorn',
 'Yogurt']

### 아이템 목록

In [7]:
te.columns_mapping_

{'Apple': 0,
 'Corn': 1,
 'Dill': 2,
 'Eggs': 3,
 'Ice cream': 4,
 'Kidney Beans': 5,
 'Milk': 6,
 'Nutmeg': 7,
 'Onion': 8,
 'Unicorn': 9,
 'Yogurt': 10}

In [8]:
df=pd.DataFrame(te_ary, columns=te.columns_)

In [9]:
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


### Apriori 알고리즘 실행하기

In [10]:
from mlxtend.frequent_patterns import apriori

In [11]:
apriori(df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [12]:
apriori(df, min_support=0.2)

Unnamed: 0,support,itemsets
0,0.2,(0)
1,0.4,(1)
2,0.2,(2)
3,0.8,(3)
4,0.2,(4)
5,1.0,(5)
6,0.6,(6)
7,0.4,(7)
8,0.6,(8)
9,0.2,(9)


### 아이템 셋(아이템 이름 명시)

In [13]:
apriori(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Kidney Beans, Yogurt)"


### 빈발집합의 길이 

In [14]:
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

In [15]:
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.4,(Corn),1
1,0.8,(Eggs),1
2,1.0,(Kidney Beans),1
3,0.6,(Milk),1
4,0.4,(Nutmeg),1
5,0.6,(Onion),1
6,0.6,(Yogurt),1
7,0.4,"(Corn, Kidney Beans)",2
8,0.8,"(Kidney Beans, Eggs)",2
9,0.4,"(Milk, Eggs)",2


### 길이가 2이며, Support 값이 0.8인 빈발집합 찾기

In [16]:
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.8) ]

Unnamed: 0,support,itemsets,length
8,0.8,"(Kidney Beans, Eggs)",2


### 특정 빈발집합 조회
* 집합내 아이템 순서는 상관없음
* frozenset

In [17]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'Onion', 'Eggs'} ]

Unnamed: 0,support,itemsets,length
11,0.6,"(Eggs, Onion)",2


### 특정 아이템을 포함한 빈발집합 조회

In [19]:
array=['Onion','Eggs']
frequent_itemsets.loc[frequent_itemsets['itemsets'].isin(array)]

Unnamed: 0,support,itemsets,length


### 연관규칙 생성 from 빈발집합

In [63]:
from mlxtend.frequent_patterns import association_rules

### confindence 기준으로 cut-off

In [65]:
rules=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Yogurt, Kidney Beans)",(Milk),0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000
1,"(Yogurt, Milk)",(Kidney Beans),0.4,1.0,0.4,1.000000,1.000000,0.00,inf
2,"(Milk, Kidney Beans)",(Yogurt),0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000
3,(Yogurt),"(Milk, Kidney Beans)",0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000
4,(Kidney Beans),"(Yogurt, Milk)",1.0,0.4,0.4,0.400000,1.000000,0.00,1.000000
5,(Milk),"(Yogurt, Kidney Beans)",0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000
6,"(Yogurt, Nutmeg)",(Eggs),0.4,0.8,0.4,1.000000,1.250000,0.08,inf
7,"(Yogurt, Eggs)",(Nutmeg),0.4,0.4,0.4,1.000000,2.500000,0.24,inf
8,"(Nutmeg, Eggs)",(Yogurt),0.4,0.6,0.4,1.000000,1.666667,0.16,inf
9,(Yogurt),"(Nutmeg, Eggs)",0.6,0.4,0.4,0.666667,1.666667,0.16,1.800000


### 조건에 맞는 규칙 조회

In [67]:
rules[ (rules['support'] >= 0.8) &
                   (rules['lift'] >= 0.8) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
76,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
77,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf


### 칼럼추가(antecedents 길이)

In [68]:
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))

In [69]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,"(Yogurt, Kidney Beans)",(Milk),0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000,2
1,"(Yogurt, Milk)",(Kidney Beans),0.4,1.0,0.4,1.000000,1.000000,0.00,inf,2
2,"(Milk, Kidney Beans)",(Yogurt),0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000,2
3,(Yogurt),"(Milk, Kidney Beans)",0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000,1
4,(Kidney Beans),"(Yogurt, Milk)",1.0,0.4,0.4,0.400000,1.000000,0.00,1.000000,1
5,(Milk),"(Yogurt, Kidney Beans)",0.6,0.6,0.4,0.666667,1.111111,0.04,1.200000,1
6,"(Yogurt, Nutmeg)",(Eggs),0.4,0.8,0.4,1.000000,1.250000,0.08,inf,2
7,"(Yogurt, Eggs)",(Nutmeg),0.4,0.4,0.4,1.000000,2.500000,0.24,inf,2
8,"(Nutmeg, Eggs)",(Yogurt),0.4,0.6,0.4,1.000000,1.666667,0.16,inf,2
9,(Yogurt),"(Nutmeg, Eggs)",0.6,0.4,0.4,0.666667,1.666667,0.16,1.800000,1
