# アソシエーション分析

アソシエーション分析は、データ内のアイテム間の関連を発見する手法です。主にマーケットバスケット分析に用いられ、購買データから一緒に購入されやすい商品の組み合わせ(共起パターン)を見つけます。この分析は特定の開発元によるものではなく、データマイニングの一般的な手法として広く用いられています。ルールの発見や商品推薦など、顧客の行動パターンに基づいた意思決定支援に役立つ点が特徴です。他にも在庫管理、さらには医療や生物学の研究においても使用されます。購買データを対象に「どの商品が一緒に買われるか」に焦点を当てたものをバスケット分析と言います。
- 参考文献:https://qiita.com/makaishi2/items/c5f06f844cdb8454b6c3

<a href="https://colab.research.google.com/github/fuyu-quant/data-science-wiki/blob/main/tabledata/marketing/association_analysis.ipynb" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%%capture
!pip install mlxtend==0.23.0

In [5]:
#以下がアソシエーション分析に必要なコード
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import pandas as pd

#### データセットの用意

In [13]:
# Fetch the French retail transaction sample from the project's GitHub
# repository and load it into a DataFrame.
url = 'https://raw.githubusercontent.com/fuyu-quant/data-science-wiki/develop/datasets/retail-france.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Keep the preview as the cell's final expression so the notebook renders it.
df.head(5)

Unnamed: 0,発注番号,商品番号,商品説明,商品個数,明細書発行日,商品単価,顧客番号,国名,発注種別
0,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France,5
1,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France,5
2,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France,5
3,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France,5
4,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France,5


### 前処理
* 商品番号...商品に対してユニークな値
* 発注番号...1回の発注(複数の商品をまとめて注文したもの)に付与される番号。1つの発注番号に複数の商品番号が含まれる

In [15]:
# Add up the ordered quantity of each product within each order; the result
# is a Series indexed by (order number, product code).
w1 = (
    df
    .groupby(by=['発注番号', '商品番号'])['商品個数']
    .sum()
)

# Preview the first few per-order, per-product totals.
print(w1.head())

発注番号    商品番号 
536370  10002    48
        21035    18
        21724    12
        21731    24
        21791    24
Name: 商品個数, dtype: int64


In [9]:
# Pivot the product codes into columns: one row per order, one column per
# product. (order, product) pairs that never occur come out of unstack() as
# NaN and are replaced by 0.
# unstack() already leaves the order number as the index, so the former
# reset_index()/set_index('発注番号') round trip was redundant and is dropped.
# fillna(0) is kept (instead of unstack(fill_value=0)) so the quantities stay
# floats, matching the output shown below the cell.
w2 = w1.unstack(level='商品番号').fillna(0)

# Check the size, then preview the first rows.
print(w2.shape)
display(w2.head())

(392, 1542)


商品番号,10002,10120,10125,10135,11001,15036,15039,15044C,15056BL,15056N,...,90030C,90031,90099,90184B,90184C,90201B,90201C,C2,M,POST
発注番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [10]:
# Turn quantities into a boolean purchase flag: True when the order contains
# a positive quantity of the product, False otherwise.
# A single vectorised comparison on the whole frame gives the same result as
# applying a Python lambda column by column, without the per-column overhead.
basket_df = w2 > 0

# Check the result.
display(basket_df.head())


商品番号,10002,10120,10125,10135,11001,15036,15039,15044C,15056BL,15056N,...,90030C,90031,90099,90184B,90184C,90201B,90201C,C2,M,POST
発注番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
536852,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
536974,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
537065,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
537463,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### アプリオリ分析の実行

In [11]:
# Mine the itemsets whose support is at least 0.06 (6% of the orders).
# use_colnames=True reports itemsets by column label (product code) instead
# of by column position.
freq_items1 = apriori(basket_df, min_support=0.06, use_colnames=True)

# Show the ten itemsets with the highest support.
display(
    freq_items1
    .sort_values(by='support', ascending=False)
    .head(10)
)

# Number of frequent itemsets found.
print(len(freq_items1))


Unnamed: 0,support,itemsets
61,0.765306,(POST)
52,0.188776,(23084)
14,0.181122,(21731)
37,0.170918,(22554)
39,0.168367,(22556)
114,0.165816,"(23084, POST)"
24,0.158163,(22326)
82,0.158163,"(POST, 21731)"
4,0.153061,(20725)
89,0.147959,"(POST, 22326)"


134


### アソシエーション分析
* サポート(支持度)...アイテムセットを含むトランザクション(購買や取引など)の数を、全トランザクション数で割った値
* 信頼度...アイテムAとアイテムBが同時に出現するトランザクション数を、Aが出現するトランザクション数で割った値
* リフト...信頼度(A→B)をBのサポートで割った値。1より大きい場合、AとBは互いに独立な場合よりも同時に出現しやすい(正の関連がある)ことを示す。

In [14]:
# Derive association rules from the frequent itemsets, keeping the rules whose
# lift reaches the threshold of 1, and order them from the highest lift down
# with a fresh 0..n-1 index.
a_rules1 = (
    association_rules(freq_items1, metric='lift', min_threshold=1)
    .sort_values(by='lift', ascending=False)
    .reset_index(drop=True)
)

# Show the ten rules with the highest lift.
display(a_rules1.head(10))

# Report how many rules were extracted.
print('ルール数の確認')
print(len(a_rules1))


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(23256),(23254),0.068878,0.071429,0.063776,0.925926,12.962963,0.058856,12.535714,0.991123
1,(23254),(23256),0.071429,0.068878,0.063776,0.892857,12.962963,0.058856,8.690476,0.993846
2,"(22726, 22728)",(22727),0.07398,0.094388,0.063776,0.862069,9.133271,0.056793,6.565689,0.961653
3,(22727),"(22726, 22728)",0.094388,0.07398,0.063776,0.675676,9.133271,0.056793,2.85523,0.983324
4,(22727),"(22726, POST)",0.094388,0.084184,0.071429,0.756757,8.989353,0.063483,3.765023,0.981388
5,"(22726, POST)",(22727),0.084184,0.094388,0.071429,0.848485,8.989353,0.063483,5.977041,0.970454
6,"(22728, 22727)",(22726),0.07398,0.096939,0.063776,0.862069,8.892922,0.056604,6.547194,0.958457
7,(22726),"(22728, 22727)",0.096939,0.07398,0.063776,0.657895,8.892922,0.056604,2.706829,0.982825
8,(22727),(22726),0.094388,0.096939,0.079082,0.837838,8.642959,0.069932,5.568878,0.976465
9,(22726),(22727),0.096939,0.094388,0.079082,0.815789,8.642959,0.069932,4.916181,0.979224


ルール数の確認
206
