In [1]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df = pd.read_csv('data/clean_retail.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Class,Price
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,high,15.3
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,mid,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,low,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,low,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,low,20.34


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10617 entries, 0 to 10616
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    10617 non-null  int64  
 1   StockCode    10617 non-null  object 
 2   Description  10617 non-null  object 
 3   Quantity     10617 non-null  int64  
 4   InvoiceDate  10617 non-null  object 
 5   UnitPrice    10617 non-null  float64
 6   CustomerID   10617 non-null  float64
 7   Country      10617 non-null  object 
 8   Class        10617 non-null  object 
 9   Price        10617 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 829.6+ KB


In [4]:
df.describe()

Unnamed: 0,InvoiceNo,Quantity,UnitPrice,CustomerID,Price
count,10617.0,10617.0,10617.0,10617.0,10617.0
mean,537002.568993,10.544598,3.070126,15598.064896,19.468702
std,350.949586,45.732462,4.587039,1763.597277,55.401605
min,536365.0,1.0,0.0,12347.0,0.0
25%,536690.0,1.0,1.25,14237.0,3.75
50%,537050.0,4.0,2.1,15708.0,10.0
75%,537254.0,12.0,3.75,17218.0,17.7
max,537636.0,2880.0,295.0,18239.0,1627.2


In [5]:
df.shape

(10617, 10)

In [6]:
df['Description'].value_counts()

HAND WARMER UNION JACK                85
HAND WARMER SCOTTY DOG DESIGN         77
WHITE HANGING HEART T-LIGHT HOLDER    75
PAPER CHAIN KIT 50'S CHRISTMAS        70
HAND WARMER OWL DESIGN                68
                                      ..
SET OF 2 ROUND TINS DUTCH CHEESE       1
BATHROOM SCALES FOOTPRINTS IN SAND     1
GLASS JAR DAISY FRESH COTTON WOOL      1
GLASS JAR KINGS CHOICE                 1
DECORATIVE ROSE BATHROOM BOTTLE        1
Name: Description, Length: 1852, dtype: int64

In [7]:
df['Country'].value_counts()

United Kingdom    9882
Germany            197
France             167
EIRE               132
Norway              73
Lithuania           34
Iceland             31
Italy               24
Japan               16
Australia           14
Portugal            14
Belgium             12
Poland               8
Switzerland          6
Spain                5
Netherlands          2
Name: Country, dtype: int64

In [8]:
# 공백 제거
df['Description'] = df['Description'].str.strip()

# InvoiceNo 형 변환
df['InvoiceNo'] = df['InvoiceNo'].astype('str')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10617 entries, 0 to 10616
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    10617 non-null  object 
 1   StockCode    10617 non-null  object 
 2   Description  10617 non-null  object 
 3   Quantity     10617 non-null  int64  
 4   InvoiceDate  10617 non-null  object 
 5   UnitPrice    10617 non-null  float64
 6   CustomerID   10617 non-null  float64
 7   Country      10617 non-null  object 
 8   Class        10617 non-null  object 
 9   Price        10617 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 829.6+ KB


In [9]:
# 독일 데이터만 추출
german_df = df[df['Country'] == 'Germany']
german_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Class,Price
1098,536527,22809,SET OF 6 T-LIGHTS SANTA,6,12/1/2010 13:04,2.95,12662.0,Germany,mid,17.7
1099,536527,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,6,12/1/2010 13:04,2.55,12662.0,Germany,mid,15.3
1100,536527,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,12,12/1/2010 13:04,0.85,12662.0,Germany,mid,10.2
1101,536527,22242,5 HOOK HANGER MAGIC TOADSTOOL,12,12/1/2010 13:04,1.65,12662.0,Germany,mid,19.8
1102,536527,22244,3 HOOK HANGER MAGIC GARDEN,12,12/1/2010 13:04,1.95,12662.0,Germany,mid,23.4


In [10]:
german_df['Description'].value_counts()

POSTAGE                               11
JAM MAKING SET PRINTED                 4
JAM JAR WITH GREEN LID                 4
JAM JAR WITH PINK LID                  4
IVORY KITCHEN SCALES                   3
                                      ..
3 PIECE SPACEBOY COOKIE CUTTER SET     1
GUMBALL MAGAZINE RACK                  1
DOORMAT RED RETROSPOT                  1
ROUND CAKE TIN VINTAGE GREEN           1
SET OF 6 STRAWBERRY CHOPSTICKS         1
Name: Description, Length: 154, dtype: int64

In [11]:
# InvoiceNo : 행 // Description : 열
mybasket = german_df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().fillna(0)
mybasket

Description,3 HOOK HANGER MAGIC GARDEN,3 HOOK PHOTO SHELF ANTIQUE WHITE,3 PIECE SPACEBOY COOKIE CUTTER SET,5 HOOK HANGER MAGIC TOADSTOOL,5 HOOK HANGER RED MAGIC TOADSTOOL,6 RIBBONS RUSTIC CHARM,60 CAKE CASES VINTAGE CHRISTMAS,60 TEATIME FAIRY CAKE CASES,ANGEL DECORATION PAINTED ZINC,ASSORTED COLOUR LIZARD SUCTION HOOK,...,STARS GIFT TAPE,SWEETHEART CAKESTAND 3 TIER,TEA TIME OVEN GLOVE,TOADSTOOL MONEY BOX,WOODEN HEART CHRISTMAS SCANDINAVIAN,WOODEN STAR CHRISTMAS SCANDINAVIAN,WOODEN TREE CHRISTMAS SCANDINAVIAN,WOODLAND HEIGHT CHART STICKERS,WOODLAND CHARLOTTE BAG,WOODLAND PARTY BAG + STICKER SET
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,12.0,0.0,0.0,12.0,12.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536840,0.0,0.0,0.0,0.0,0.0,12.0,24.0,24.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536861,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
536967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536983,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
537197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537201,12.0,0.0,18.0,12.0,0.0,24.0,0.0,0.0,24.0,0.0,...,0.0,0.0,0.0,0.0,12.0,12.0,24.0,0.0,30.0,0.0
537212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
537250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- unstack : 인덱스 레벨에서 컬럼레벨로 데이터 프레임 변경
- 쌓은 것을 옆으로 늘어 놓는 개념

In [12]:
# 0보다 큰 값들을 1로 변경
mybasket = (mybasket > 0).astype(int)
mybasket

Description,3 HOOK HANGER MAGIC GARDEN,3 HOOK PHOTO SHELF ANTIQUE WHITE,3 PIECE SPACEBOY COOKIE CUTTER SET,5 HOOK HANGER MAGIC TOADSTOOL,5 HOOK HANGER RED MAGIC TOADSTOOL,6 RIBBONS RUSTIC CHARM,60 CAKE CASES VINTAGE CHRISTMAS,60 TEATIME FAIRY CAKE CASES,ANGEL DECORATION PAINTED ZINC,ASSORTED COLOUR LIZARD SUCTION HOOK,...,STARS GIFT TAPE,SWEETHEART CAKESTAND 3 TIER,TEA TIME OVEN GLOVE,TOADSTOOL MONEY BOX,WOODEN HEART CHRISTMAS SCANDINAVIAN,WOODEN STAR CHRISTMAS SCANDINAVIAN,WOODEN TREE CHRISTMAS SCANDINAVIAN,WOODLAND HEIGHT CHART STICKERS,WOODLAND CHARLOTTE BAG,WOODLAND PARTY BAG + STICKER SET
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
536840,0,0,0,0,0,1,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
536861,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
536967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536983,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
537197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537201,1,0,1,1,0,1,0,0,1,0,...,0,0,0,0,1,1,1,0,1,0
537212,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
537250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apriori 알고리즘 적용

In [13]:
my_frequent_itemsets = apriori(mybasket, min_support=0.1, use_colnames=True)
my_frequent_itemsets.head()



Unnamed: 0,support,itemsets
0,0.272727,(3 HOOK HANGER MAGIC GARDEN)
1,0.181818,(5 HOOK HANGER MAGIC TOADSTOOL)
2,0.272727,(6 RIBBONS RUSTIC CHARM)
3,0.181818,(BREAD BIN DINER STYLE IVORY)
4,0.181818,(CHILDREN'S CIRCUS PARADE MUG)


In [14]:
my_rules = association_rules(my_frequent_itemsets, metric='lift', min_threshold=0.1)
my_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(5 HOOK HANGER MAGIC TOADSTOOL),(3 HOOK HANGER MAGIC GARDEN),0.181818,0.272727,0.181818,1.0,3.666667,0.132231,inf
1,(3 HOOK HANGER MAGIC GARDEN),(5 HOOK HANGER MAGIC TOADSTOOL),0.272727,0.181818,0.181818,0.666667,3.666667,0.132231,2.454545
2,(CHILDREN'S CIRCUS PARADE MUG),(3 HOOK HANGER MAGIC GARDEN),0.181818,0.272727,0.181818,1.0,3.666667,0.132231,inf
3,(3 HOOK HANGER MAGIC GARDEN),(CHILDREN'S CIRCUS PARADE MUG),0.272727,0.181818,0.181818,0.666667,3.666667,0.132231,2.454545
4,(HOT WATER BOTTLE BABUSHKA),(3 HOOK HANGER MAGIC GARDEN),0.181818,0.272727,0.181818,1.0,3.666667,0.132231,inf


In [15]:
rules = my_rules[['antecedents' ,'consequents', 'support', 'confidence', 'lift']]
rules.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(5 HOOK HANGER MAGIC TOADSTOOL),(3 HOOK HANGER MAGIC GARDEN),0.181818,1.0,3.666667
1,(3 HOOK HANGER MAGIC GARDEN),(5 HOOK HANGER MAGIC TOADSTOOL),0.181818,0.666667,3.666667
2,(CHILDREN'S CIRCUS PARADE MUG),(3 HOOK HANGER MAGIC GARDEN),0.181818,1.0,3.666667
3,(3 HOOK HANGER MAGIC GARDEN),(CHILDREN'S CIRCUS PARADE MUG),0.181818,0.666667,3.666667
4,(HOT WATER BOTTLE BABUSHKA),(3 HOOK HANGER MAGIC GARDEN),0.181818,1.0,3.666667


In [16]:
rules = rules.sort_values('lift', ascending=False)
rules.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
1711,(WOODLAND CHARLOTTE BAG),"(JAM MAKING SET WITH JARS, JAM JAR WITH PINK L...",0.181818,1.0,5.5
1152,"(HOT WATER BOTTLE BABUSHKA, 5 HOOK HANGER MAGI...","(CHILDREN'S CIRCUS PARADE MUG, 3 HOOK HANGER M...",0.181818,1.0,5.5
519,(ROUND SNACK BOXES SET OF4 WOODLAND),"(ROUND SNACK BOXES SET OF 4 FRUITS, POSTAGE)",0.181818,1.0,5.5
520,(ROUND SNACK BOXES SET OF 4 FRUITS),"(ROUND SNACK BOXES SET OF4 WOODLAND, POSTAGE)",0.181818,1.0,5.5
1130,"(JAM MAKING SET PRINTED, WOODLAND CHARLOTTE BAG)","(JAM MAKING SET WITH JARS, POSTAGE)",0.181818,1.0,5.5


In [17]:
rules['antecedents_len'] = rules['antecedents'].apply(lambda x:len(x))
rules['consequents_len'] = rules['consequents'].apply(lambda x:len(x))
rules.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift,antecedents_len,consequents_len
1711,(WOODLAND CHARLOTTE BAG),"(JAM MAKING SET WITH JARS, JAM JAR WITH PINK L...",0.181818,1.0,5.5,1,5
1152,"(HOT WATER BOTTLE BABUSHKA, 5 HOOK HANGER MAGI...","(CHILDREN'S CIRCUS PARADE MUG, 3 HOOK HANGER M...",0.181818,1.0,5.5,3,2
519,(ROUND SNACK BOXES SET OF4 WOODLAND),"(ROUND SNACK BOXES SET OF 4 FRUITS, POSTAGE)",0.181818,1.0,5.5,1,2
520,(ROUND SNACK BOXES SET OF 4 FRUITS),"(ROUND SNACK BOXES SET OF4 WOODLAND, POSTAGE)",0.181818,1.0,5.5,1,2
1130,"(JAM MAKING SET PRINTED, WOODLAND CHARLOTTE BAG)","(JAM MAKING SET WITH JARS, POSTAGE)",0.181818,1.0,5.5,2,2


In [18]:
rules[(rules['antecedents_len']>=2) & (rules['support'] >= 0.2) & (rules['lift']>=2)].head()

Unnamed: 0,antecedents,consequents,support,confidence,lift,antecedents_len,consequents_len
397,"(JAM MAKING SET PRINTED, JAM JAR WITH PINK LID)",(JAM JAR WITH GREEN LID),0.363636,1.0,2.75,2,1
396,"(JAM MAKING SET PRINTED, JAM JAR WITH GREEN LID)",(JAM JAR WITH PINK LID),0.363636,1.0,2.75,2,1
398,"(JAM JAR WITH GREEN LID, JAM JAR WITH PINK LID)",(JAM MAKING SET PRINTED),0.363636,1.0,2.75,2,1
464,"(POSTAGE, JAM JAR WITH PINK LID)",(JAM MAKING SET PRINTED),0.363636,1.0,2.75,2,1
462,"(JAM MAKING SET PRINTED, POSTAGE)",(JAM JAR WITH PINK LID),0.363636,1.0,2.75,2,1


In [19]:
rules[rules['consequents']=={'WOODLAND CHARLOTTE BAG'}].head()

Unnamed: 0,antecedents,consequents,support,confidence,lift,antecedents_len,consequents_len
1124,"(JAM MAKING SET PRINTED, JAM MAKING SET WITH J...",(WOODLAND CHARLOTTE BAG),0.181818,1.0,5.5,3,1
498,"(JAM MAKING SET PRINTED, JAM MAKING SET WITH J...",(WOODLAND CHARLOTTE BAG),0.181818,1.0,5.5,2,1
510,"(JAM MAKING SET WITH JARS, POSTAGE)",(WOODLAND CHARLOTTE BAG),0.181818,1.0,5.5,2,1
1083,"(JAM MAKING SET PRINTED, JAM MAKING SET WITH J...",(WOODLAND CHARLOTTE BAG),0.181818,1.0,5.5,3,1
1111,"(JAM MAKING SET WITH JARS, POSTAGE, JAM JAR WI...",(WOODLAND CHARLOTTE BAG),0.181818,1.0,5.5,3,1
