# 연관규칙(Association Rule)

## 평가지표

### 지지도(Support) : 전체 거래 중에서 A와 B가 함께 포함된 거래의 비율 (= A와 B를 모두 포함한 거래 수 / 전체 거래 수)

### 신뢰도(Confidence) : A가 구매되었을 때 품목 B가 추가로 구매될 확률. (=조건부확률 P(B|A))

신뢰도가 높다면 유용한 규칙일 가능성이 높다. 집합 간의 연관성 강도 측정
    
    
### 향상도(Lift) : A를 구매할 때, B도 구매하는지 서로간의 연관성을 파악하는 비율

생성된 규칙이 효용가치가 있는지 분석. A와 B가 서로 독립이라고 가정했을 때 기대되는 빈도에 비해 두 사건이 실제로 얼마나 자주 동시에 발생하는지를 비율로 나타냄 (Lift = Confidence(A→B) / Support(B))

- 향상도 > 1 -> 양의 상관관계
- 향상도 = 1 -> 독립적인 관계
- 향상도 < 1 -> 음의 상관관계

## 분석방법

일정 수준 이하 지지도(Support), 신뢰도(Confidence) 자료는 제외시키고 

향상도(Lift) 내림차순으로 sorting을 해서 rule을 평가하는 식으로 이용


그리고 관심이 있는 상품이나 item이 있으면 목적에 맞게 해당 item이 
left-hand side 나 right-hand side 에 있는 rule만을 subset으로 선별해서 보기도 함

위의 지표를 보완하기 위해 IS 측도, 교차지지도(cross support) 지표가 있다.

In [2]:
import pandas as pd
import numpy as np
# 데이터 전처리 : 항목값(집합)에 대한 index생성
from mlxtend.preprocessing import TransactionEncoder
#지지도 계산
from mlxtend.frequent_patterns import apriori
#연관규칙
from mlxtend.frequent_patterns import association_rules 

In [4]:
# Load the purchase log (the file is read with the EUC-KR codec).
df_raw = pd.read_csv("./실습화일/2. Big Data 분석/상품구매.csv", encoding="euc-kr")

# Distinct customer IDs in ascending order.
ID = sorted(set(df_raw["ID"]))

# Build one alphabetically sorted product list (a "transaction") per customer.
# A single groupby pass replaces re-filtering the whole frame once per ID;
# sort=True yields the groups in ascending ID order, matching ID above.
# NOTE: groupby skips rows whose ID is missing (pandas default dropna=True).
list_association = [
    sorted(products)
    for _, products in df_raw.groupby("ID", sort=True)["PRODUCT"]
]

# Print every transaction for a quick visual check.
for row in list_association:
    print(row)

['APPLES', 'ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['APPLES', 'CORNED BEEF', 'HEINEKEN', 'HERRING', 'OLIVES', 'SARDINES', 'STEAK']
['APPLES', 'AVOCADO', 'BAGUETTE', 'ICE CREAM', 'PEPPERS', 'SARDINES', 'STEAK']
['APPLES', 'COKE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'BOURBON', 'COKE', 'HAM', 'ICE CREAM', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'COKE', 'HEINEKEN', 'HERRING', 'TURKEY']
['APPLES', 'CHICKEN', 'COKE', 'CORNED BEEF', 'HEINEKEN', 'ICE CREAM', 'SARDINES']
['BAGUETTE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'PEPPERS', 'SODA']
['BOURBON', 'CRACKERS', 'HEINEKEN', 'HERRING', 'OLIVES', 'SODA', 'STEAK']
['APPLES', 'BAGUETTE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'BOURBON', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['ARTICHOKE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'SODA', 'STEAK']
['BOURBON', 'CORNED BEEF', 'CRACKERS', 'HEINEKEN', 'HERRING', 

In [8]:
# One-hot encode the transactions: one boolean column per item, True when the
# transaction contains that item and False otherwise.
enc = TransactionEncoder()
enc.fit(list_association)
df_raw_enc = enc.transform(list_association)

# Wrap the encoded matrix in a DataFrame whose columns are the item names
# the encoder learned during fit.
df_asso = pd.DataFrame(data=df_raw_enc, columns=enc.columns_)
df_asso.head()

Unnamed: 0,APPLES,ARTICHOKE,AVOCADO,BAGUETTE,BOURBON,CHICKEN,COKE,CORNED BEEF,CRACKERS,HAM,HEINEKEN,HERRING,ICE CREAM,OLIVES,PEPPERS,SARDINES,SODA,STEAK,TURKEY
0,True,True,True,True,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,True,False,False,True,True,False,True,False,True,False,True,False
2,True,False,True,True,False,False,False,False,False,False,False,False,True,False,True,True,False,True,False
3,True,False,False,False,False,False,True,True,False,True,False,True,False,True,False,False,False,False,True
4,False,True,False,False,True,False,True,False,False,True,False,False,True,True,False,False,False,False,True


In [9]:
# Minimum support an itemset must reach to be kept.
v_min_support = 0.4

# Mine the frequent itemsets that meet the support threshold (this step
# produces itemsets, not rules); use_colnames=True reports item names
# instead of column indices.
df_freq = apriori(
    df_asso,
    use_colnames=True,
    min_support=v_min_support,
)
df_freq

Unnamed: 0,support,itemsets
0,0.4,(APPLES)
1,0.4,(BAGUETTE)
2,0.55,(BOURBON)
3,0.5,(COKE)
4,0.4,(CORNED BEEF)
5,0.6,(HEINEKEN)
6,0.4,(HERRING)
7,0.45,(ICE CREAM)
8,0.65,(OLIVES)
9,0.4,(SODA)


In [11]:
# Derive association rules from the frequent itemsets, filtering on
# confidence with a minimum threshold of 0.7.
df_asso_rule = association_rules(
    df_freq,
    min_threshold=0.7,
    metric="confidence",
)
# Display the rules rounded to three decimals for readability.
df_asso_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(OLIVES),(BOURBON),0.65,0.55,0.5,0.769,1.399,0.142,1.95,0.814
1,(BOURBON),(OLIVES),0.55,0.65,0.5,0.909,1.399,0.142,3.85,0.633
2,(BOURBON),(SODA),0.55,0.4,0.4,0.727,1.818,0.18,2.2,1.0
3,(SODA),(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf,0.75
4,(ICE CREAM),(COKE),0.45,0.5,0.4,0.889,1.778,0.175,4.5,0.795
5,(COKE),(ICE CREAM),0.5,0.45,0.4,0.8,1.778,0.175,2.75,0.875
6,(SODA),(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf,0.583
7,"(OLIVES, BOURBON)",(SODA),0.5,0.4,0.4,0.8,2.0,0.2,3.0,1.0
8,"(OLIVES, SODA)",(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf,0.75
9,"(BOURBON, SODA)",(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf,0.583


In [12]:
# Re-derive the rules, this time filtering on lift with a minimum threshold
# of 1.3. This rebinds df_asso_rule, replacing the confidence-based result.
df_asso_rule = association_rules(
    df_freq,
    min_threshold=1.3,
    metric="lift",
)
# Display the rules rounded to three decimals for readability.
df_asso_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(OLIVES),(BOURBON),0.65,0.55,0.5,0.769,1.399,0.142,1.95,0.814
1,(BOURBON),(OLIVES),0.55,0.65,0.5,0.909,1.399,0.142,3.85,0.633
2,(BOURBON),(SODA),0.55,0.4,0.4,0.727,1.818,0.18,2.2,1.0
3,(SODA),(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf,0.75
4,(ICE CREAM),(COKE),0.45,0.5,0.4,0.889,1.778,0.175,4.5,0.795
5,(COKE),(ICE CREAM),0.5,0.45,0.4,0.8,1.778,0.175,2.75,0.875
6,(OLIVES),(SODA),0.65,0.4,0.4,0.615,1.538,0.14,1.56,1.0
7,(SODA),(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf,0.583
8,"(OLIVES, BOURBON)",(SODA),0.5,0.4,0.4,0.8,2.0,0.2,3.0,1.0
9,"(OLIVES, SODA)",(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf,0.75
