## 關聯性分析 - 購物籃分析

#### Support 支持度: 表示這個購買組合，在整個樣本中出現的機率
#### Confidence 信心度: 買A產品又買B產品的機率
#### Lift 提升度: 兩項商品之間的關聯性
+ 正相關: lift > 1 - 兩項商品是有關聯性的
+ 負相關: lift < 1 - 兩項商品呈現負相關性
+ 不相關: lift = 1 - 兩項商品沒有相關性

Lift與Confidence要一起看，要先檢視兩項商品是正相關，才會用Confidence去找買下一個產品的機率

In [1]:
import pandas as pd
from apyori import apriori
from tqdm import tqdm
import platform

In [2]:
theOS = list(platform.uname())[0]
if theOS == 'Windows':
    theOS = '\\'
    theEncode = 'utf-8-sig'
else:
    theOS = '/'
    theEncode = 'utf-8'

df = pd.read_csv('../data/ta_feng_all_months_merged.csv')
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96


## 將當天同一個顧客購買的商品視為一個訂單 並創建訂單編號

In [3]:
## 日期 + Customer ID
df['orderID'] = df['CUSTOMER_ID'].astype(str) + df['TRANSACTION_DT']
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE,orderID
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30,110490511/1/2000
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46,41868311/1/2000
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166,105733111/1/2000
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38,184933211/1/2000
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18,198199511/1/2000
...,...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118,3127902/28/2001
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55,574862/28/2001
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115,7335262/28/2001
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96,1737042/28/2001


In [None]:
record=[]
for i in tqdm(df['orderID'].value_counts().index):
    member = df[df['orderID']==i]
    record.append(member['PRODUCT_SUBCLASS'].values.tolist()) 

  2%|▊                                  | 2803/119578 [01:53<1:19:39, 24.43it/s]

In [None]:
record

## 計算出所有最小信心水準有2%且最小正相關要大於1的商品組合

In [None]:
association_rules = apriori(record, min_support = 0.02, min_lift = 1.00000001)
association_results = list(association_rules)

In [None]:
association_results

In [None]:
## 支持度
association_results[0][1]

## 計算出所有最小信心水準有1%且最小正相關要大於1的商品組合

In [None]:
association_rules = apriori(record, min_support = 0.01, min_lift = 1.00000001)
association_results = list(association_rules)

In [None]:
association_results

## 針對我們的常貴客進行關聯性分析

In [None]:
## 找出所有常貴客的CUSTOMER ID
target_date = pd.read_csv('purchase_list.csv')
ta_list = target_date[target_date['customer'] == '常貴客']['CUSTOMER_ID'].unique()

In [None]:
## 過濾出所有常貴客的資料
new_df = df[df['CUSTOMER_ID'].isin(ta_list)]
new_df

## 日期 + Customer ID
new_df['orderID'] = new_df['CUSTOMER_ID'].astype(str) + new_df['TRANSACTION_DT']
new_df

In [None]:
record=[]
for i in tqdm(new_df['orderID'].value_counts().index):
    member = new_df[new_df['orderID']==i]
    record.append(member['PRODUCT_SUBCLASS'].values.tolist()) 

In [None]:
association_rules = apriori(record, min_support = 0.02, min_lift = 1.00000001)
association_results = list(association_rules)

In [None]:
association_results