In [2]:
import pandas as pd 
import numpy as np 

# 데이터 전처리: 항목 값에 대한 인덱스 생성
from mlxtend.preprocessing import TransactionEncoder
# 지지도 계산
from mlxtend.frequent_patterns import apriori
# 연관규칙
from mlxtend.frequent_patterns import association_rules

In [4]:
# Raw transactions for the association-rule example:
# each inner list holds the item names of one transaction.
ds_raw = [
    ['소주', '콜라', '맥주'],
    ['소주', '콜라', '와인'],
    ['소주', '주스'],
    ['콜라', '맥주'],
    ['소주', '콜라', '맥주', '와인'],
    ['주스'],
]
ds_raw

[['소주', '콜라', '맥주'],
 ['소주', '콜라', '와인'],
 ['소주', '주스'],
 ['콜라', '맥주'],
 ['소주', '콜라', '맥주', '와인'],
 ['주스']]

### 데이터 구성하기

In [6]:
# Preprocessing: encode the transaction lists as a table of item indicators.
enc = TransactionEncoder()
# fit() learns the distinct item labels; transform() then returns the data as an
# array with one row per transaction and one column per item.
ds_raw_enc = enc.fit(ds_raw).transform(ds_raw)
# Wrap the array in a pandas DataFrame, labelling the columns with the
# item names the encoder learned.
df_asso = pd.DataFrame(data=ds_raw_enc, columns=enc.columns_)
df_asso.head()

Unnamed: 0,맥주,소주,와인,주스,콜라
0,True,True,False,False,True
1,False,True,True,False,True
2,False,True,False,True,False
3,True,False,False,False,True
4,True,True,True,False,True


### 연관 규칙 분석

In [7]:
# Minimum support: the threshold handed to apriori() below.
min_support = 0.5

# Compute the support of the frequent itemsets.
# Pass the variable rather than repeating the literal 0.5, so that editing
# `min_support` above actually changes the result.
df_freq = apriori(df_asso, min_support=min_support, use_colnames=True)
df_freq.round(3)

Unnamed: 0,support,itemsets
0,0.5,(맥주)
1,0.667,(소주)
2,0.667,(콜라)
3,0.5,"(콜라, 맥주)"
4,0.5,"(콜라, 소주)"


In [8]:
# Derive the association rules (with their support, confidence and lift)
# from the frequent itemsets, filtering on confidence with a threshold of 0.5.
df_asso_rule = association_rules(
    df_freq,
    metric='confidence',
    min_threshold=0.5,
)
df_asso_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(콜라),(맥주),0.667,0.5,0.5,0.75,1.5,0.167,2.0
1,(맥주),(콜라),0.5,0.667,0.5,1.0,1.5,0.167,inf
2,(콜라),(소주),0.667,0.667,0.5,0.75,1.125,0.056,1.333
3,(소주),(콜라),0.667,0.667,0.5,0.75,1.125,0.056,1.333


# 실습

In [9]:
# Load the purchase records from the CSV file (path is relative to the notebook's
# working directory) and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='data/purchase.csv')
df

Unnamed: 0,ID,PRODUCT
0,C-11,BREAD
1,C-11,COKE
2,C-11,BEER
3,C-11,CHICKEN
4,C-11,APPLES
...,...,...
116,C-30,COKE
117,C-30,CHICKEN
118,C-30,ICE CREAM
119,C-30,SODA


In [23]:
# Cross-tabulate the purchases: one row per ID, one column per PRODUCT,
# each cell counting how often that ID/PRODUCT pair occurs.
df_cross = pd.crosstab(index=df['ID'], columns=df['PRODUCT'])
df_cross.head()

PRODUCT,APPLES,BEEF,BEER,BREAD,CHICKEN,COFFEE,COKE,CRACKERS,HAM,ICE CREAM,OLIVES,PEPPERS,SODA,WINE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C-11,1,1,1,1,1,0,1,0,0,0,0,0,0,0
C-12,1,2,1,0,0,0,0,0,0,0,1,0,0,0
C-13,1,1,0,1,0,1,0,0,0,1,0,1,0,0
C-14,1,1,0,0,1,0,1,0,1,0,1,0,0,0
C-15,0,0,0,0,1,0,1,0,1,1,1,0,0,1


In [24]:
# Move the ID index back into an ordinary column.
# Rebind the returned frame instead of using inplace=True, so the step does not
# mutate in place the object that an earlier cell already displayed.
df_cross = df_cross.reset_index()
df_cross.head()

PRODUCT,ID,APPLES,BEEF,BEER,BREAD,CHICKEN,COFFEE,COKE,CRACKERS,HAM,ICE CREAM,OLIVES,PEPPERS,SODA,WINE
0,C-11,1,1,1,1,1,0,1,0,0,0,0,0,0,0
1,C-12,1,2,1,0,0,0,0,0,0,0,1,0,0,0
2,C-13,1,1,0,1,0,1,0,0,0,1,0,1,0,0
3,C-14,1,1,0,0,1,0,1,0,1,0,1,0,0,0
4,C-15,0,0,0,0,1,0,1,0,1,1,1,0,0,1


In [26]:
# Use the ID column as the row index again.
# Rebind the returned frame instead of using inplace=True, so the step does not
# mutate in place the object that an earlier cell already displayed.
df_cross = df_cross.set_index('ID')
df_cross.head()

PRODUCT,APPLES,BEEF,BEER,BREAD,CHICKEN,COFFEE,COKE,CRACKERS,HAM,ICE CREAM,OLIVES,PEPPERS,SODA,WINE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C-11,1,1,1,1,1,0,1,0,0,0,0,0,0,0
C-12,1,2,1,0,0,0,0,0,0,0,1,0,0,0
C-13,1,1,0,1,0,1,0,0,0,1,0,1,0,0
C-14,1,1,0,0,1,0,1,0,1,0,1,0,0,0
C-15,0,0,0,0,1,0,1,0,1,1,1,0,0,1


In [17]:
# DataFrame.mask replaces every entry where the condition is True with `other`
# (if `other` is omitted, those entries become NaN).
# Here every count greater than 1 is replaced by 1.
df_cross1 = df_cross.mask(df_cross > 1, other=1)
df_cross1.head()

PRODUCT,APPLES,BEEF,BEER,BREAD,CHICKEN,COFFEE,COKE,CRACKERS,HAM,ICE CREAM,OLIVES,PEPPERS,SODA,WINE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C-11,1,1,1,1,1,0,1,0,0,0,0,0,0,0
C-12,1,1,1,0,0,0,0,0,0,0,1,0,0,0
C-13,1,1,0,1,0,1,0,0,0,1,0,1,0,0
C-14,1,1,0,0,1,0,1,0,1,0,1,0,0,0
C-15,0,0,0,0,1,0,1,0,1,1,1,0,0,1


In [27]:
### Generate association rules

# Minimum support: the threshold handed to apriori() below.
min_support = 0.5

# Compute the support of the frequent itemsets.
# Pass the variable rather than repeating the literal 0.5, so that editing
# `min_support` above actually changes the result.
df_freq = apriori(df_cross1, min_support=min_support, use_colnames=True)
df_freq.round(3)

Unnamed: 0,support,itemsets
0,0.55,(BEEF)
1,0.6,(BEER)
2,0.6,(CHICKEN)
3,0.65,(COKE)
4,0.65,(OLIVES)
5,0.6,(WINE)
6,0.55,"(COKE, CHICKEN)"
7,0.5,"(WINE, OLIVES)"


In [28]:
# Derive the association rules (with their support, confidence and lift)
# from the frequent itemsets, filtering on confidence with a threshold of 0.5.
df_cross_rule = association_rules(
    df_freq,
    metric='confidence',
    min_threshold=0.5,
)
df_cross_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(COKE),(CHICKEN),0.65,0.6,0.55,0.846,1.41,0.16,2.6
1,(CHICKEN),(COKE),0.6,0.65,0.55,0.917,1.41,0.16,4.2
2,(WINE),(OLIVES),0.6,0.65,0.5,0.833,1.282,0.11,2.1
3,(OLIVES),(WINE),0.65,0.6,0.5,0.769,1.282,0.11,1.733
