<a href="https://colab.research.google.com/github/ToumaTanaka/Data_Science/blob/main/Tabel_Data/Machine_Learning/Unsupervised_Learning/association_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# アソシエーション分析
* アソシエーション分析は大量の購買履歴データからセットで購入されている商品の組み合わせを探すための分析手法である
* 以下ではOnlineRetailというデータを使いアソシエーション分析を行う
* データマイニングなどでよく使われる
* マーケットバスケット分析とも呼ばれる

In [24]:
#以下がアソシエーション分析に必要なコード
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# Load the Online Retail dataset (an Excel workbook) from the UCI archive URL
# into a DataFrame.
import pandas as pd

online_retail_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
df = pd.read_excel(online_retail_url)

In [5]:
# Preview the first rows of the data to see its columns and values.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [14]:
# The first character of InvoiceNo distinguishes the order type (per the
# original note). Store it in a helper column, InvoiceNo2, and keep only the
# invoices whose number starts with '5'.
# (The restriction to Germany is applied in the following cell.)
df2 = df.assign(InvoiceNo2=[str(invoice_no)[0] for invoice_no in df['InvoiceNo']])
df2 = df2.loc[df2['InvoiceNo2'] == '5']

In [15]:
# Restrict the data to orders whose Country is Germany.
df3 = df2.loc[df2['Country'].eq('Germany')]

In [20]:
# Group by order (InvoiceNo) and product (StockCode) and sum Quantity, giving
# the total quantity of each product per order as a Series with a two-level
# (InvoiceNo, StockCode) index.
temp1 = (
    df3
    .groupby(by=['InvoiceNo', 'StockCode'])['Quantity']
    .sum()
)
temp1

InvoiceNo  StockCode
536527     20712        10
           20713        10
           22242        12
           22243        12
           22244        12
                        ..
581578     23550        25
           84997B        8
           84997C        8
           84997D        8
           POST          3
Name: Quantity, Length: 9015, dtype: int64

In [25]:
# A Series with a two-level index can be pivoted into wide format with
# unstack(): one row per InvoiceNo and one column per StockCode.
temp2 = temp1.unstack()
# Order/product pairs that do not occur come out as NaN; replace them with 0.
temp2 = temp2.fillna(0)

# True where the purchased quantity is greater than 0, False otherwise.
association_df = temp2 > 0

## アソシエーション分析を行う
アソシエーション分析は二段階で行う。
* アプリオリ分析  
すべての組み合わせを調べると組み合わせの爆発が起きてしまうため、支持度(support)という閾値を使い、頻出する組み合わせだけに絞り込む。

* ルール抽出  
リフト値の閾値を設定し、リフト値がその閾値以上となる組み合わせをルールとして抽出する。

In [29]:
# Run Apriori on the boolean basket matrix with a minimum support of 0.06;
# use_colnames=True labels the itemsets with the column names (StockCode).
freq_items1 = apriori(association_df, use_colnames=True, min_support=0.06)

# Show the frequent itemsets, highest support first.
freq_items1.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
35,0.818381,(POST)
13,0.245077,(22326)
47,0.225383,"(POST, 22326)"
14,0.157549,(22328)
48,0.150985,"(22328, POST)"
...,...,...
49,0.061269,"(POST, 22331)"
16,0.061269,(22333)
26,0.061269,(22907)
29,0.061269,(23199)


In [30]:
# Look up the rows whose StockCode is 'POST' to see what that code stands for.
df3.loc[df3['StockCode'] == 'POST']

# The Description column shows that POST means POSTAGE (shipping charge).

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,InvoiceNo2
1123,536527,POST,POSTAGE,1,2010-12-01 13:04:00,18.0,12662.0,Germany,5
5073,536840,POST,POSTAGE,1,2010-12-02 18:27:00,18.0,12738.0,Germany,5
5369,536861,POST,POSTAGE,3,2010-12-03 10:44:00,18.0,12427.0,Germany,5
6602,536967,POST,POSTAGE,1,2010-12-03 12:57:00,18.0,12600.0,Germany,5
6973,536983,POST,POSTAGE,1,2010-12-03 14:30:00,18.0,12712.0,Germany,5
...,...,...,...,...,...,...,...,...,...
537459,581266,POST,POSTAGE,5,2011-12-08 11:25:00,18.0,12621.0,Germany,5
541216,581494,POST,POSTAGE,2,2011-12-09 10:13:00,18.0,12518.0,Germany,5
541730,581570,POST,POSTAGE,1,2011-12-09 11:59:00,18.0,12662.0,Germany,5
541767,581574,POST,POSTAGE,2,2011-12-09 12:09:00,18.0,12526.0,Germany,5


In [31]:
# Extract association rules from the frequent itemsets, using lift as the
# evaluation metric with a minimum threshold of 1.
rules = association_rules(freq_items1, min_threshold=1, metric='lift')

In [32]:
# Order the rules from highest to lowest lift, then renumber the rows from 0
# so the index follows the new ordering (the old index is discarded).
rules = rules.sort_values(by='lift', ascending=False).reset_index(drop=True)

In [33]:
# In the table below, the top-ranked rule pairs 22554 and 22556 (the animal-themed
# and circus-themed plasters). The rules are sorted by lift, so this is the pair
# with the strongest association, not the pair bought together most often.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(22554),(22556),0.137856,0.115974,0.067834,0.492063,4.242887,0.051846,1.740427
1,(22556),(22554),0.115974,0.137856,0.067834,0.584906,4.242887,0.051846,2.076984
2,(22554),(22551),0.137856,0.107221,0.061269,0.444444,4.145125,0.046488,1.607002
3,(22551),(22554),0.107221,0.137856,0.061269,0.571429,4.145125,0.046488,2.011670
4,(22328),"(POST, 22326)",0.157549,0.225383,0.124726,0.791667,3.512540,0.089218,3.718162
...,...,...,...,...,...,...,...,...,...
61,(POST),(22554),0.818381,0.137856,0.118162,0.144385,1.047364,0.005344,1.007631
62,(POST),(21731),0.818381,0.096280,0.080963,0.098930,1.027528,0.002169,1.002941
63,(21731),(POST),0.096280,0.818381,0.080963,0.840909,1.027528,0.002169,1.141607
64,(POST),(23240),0.818381,0.076586,0.063457,0.077540,1.012452,0.000780,1.001034


In [35]:
# Look up the rows for StockCode 22554 to see which product it is.
df3.loc[df3['StockCode'] == 22554]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,InvoiceNo2
9707,537212,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2010-12-05 15:21:00,1.65,12720.0,Germany,5
22548,538174,22554,PLASTERS IN TIN WOODLAND ANIMALS,24,2010-12-10 09:35:00,1.65,12471.0,Germany,5
34489,539395,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2010-12-17 11:52:00,1.65,12471.0,Germany,5
37466,539466,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2010-12-19 12:46:00,1.65,12668.0,Germany,5
52365,540769,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2011-01-11 10:38:00,1.65,12601.0,Germany,5
...,...,...,...,...,...,...,...,...,...
509644,579393,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2011-11-29 12:07:00,1.65,12627.0,Germany,5
523414,580512,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2011-12-04 13:55:00,1.65,12673.0,Germany,5
532904,581000,22554,PLASTERS IN TIN WOODLAND ANIMALS,12,2011-12-07 08:03:00,1.65,12720.0,Germany,5
534969,581179,22554,PLASTERS IN TIN WOODLAND ANIMALS,24,2011-12-07 15:43:00,1.65,12471.0,Germany,5


In [36]:
# Look up the rows for StockCode 22556 to see which product it is.
df3.loc[df3['StockCode'] == 22556]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,InvoiceNo2
9407,537201,22556,PLASTERS IN TIN CIRCUS PARADE,12,2010-12-05 14:19:00,1.65,12472.0,Germany,5
19798,537892,22556,PLASTERS IN TIN CIRCUS PARADE,12,2010-12-09 10:13:00,1.65,12481.0,Germany,5
22549,538174,22556,PLASTERS IN TIN CIRCUS PARADE,24,2010-12-10 09:35:00,1.65,12471.0,Germany,5
34491,539395,22556,PLASTERS IN TIN CIRCUS PARADE,12,2010-12-17 11:52:00,1.65,12471.0,Germany,5
47410,540414,22556,PLASTERS IN TIN CIRCUS PARADE,36,2011-01-07 10:32:00,1.65,12481.0,Germany,5
69182,541965,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-01-24 13:27:00,1.65,12625.0,Germany,5
72066,542229,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-01-26 13:38:00,1.65,12474.0,Germany,5
73619,542369,22556,PLASTERS IN TIN CIRCUS PARADE,24,2011-01-27 13:08:00,1.65,12645.0,Germany,5
73655,542371,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-01-27 13:29:00,1.65,12468.0,Germany,5
101827,544933,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-02-25 08:44:00,1.65,12633.0,Germany,5
