## 關聯性分析 - 購物籃分析

#### Support 支持度: 表示這個購買組合，在整個樣本中出現的機率
#### Confidence 信心度: 買A產品又買B產品的機率
#### Lift 提升度: 兩項商品之間的關聯性
+ 正相關: lift > 1 - 兩項商品是有關聯性的
+ 負相關: lift < 1 - 兩項商品呈現負相關性
+ 不相關: lift = 1 - 兩項商品沒有相關性

Lift與Confidence要一起看，要先檢視兩項商品是正相關，才會用Confidence去找買下一個產品的機率

In [1]:
import pandas as pd
from apyori import apriori
from tqdm import tqdm
import platform

In [2]:
theOS = list(platform.uname())[0]
if theOS == 'Windows':
    theOS = '\\'
    theEncode = 'utf-8-sig'
else:
    theOS = '/'
    theEncode = 'utf-8'

df = pd.read_csv('../data/ta_feng_all_months_merged.csv')
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96


## 將當天同一個顧客購買的商品視為一個訂單 並創建訂單編號

In [3]:
## 日期 + Customer ID
df['orderID'] = df['CUSTOMER_ID'].astype(str) + df['TRANSACTION_DT']
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE,orderID
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30,110490511/1/2000
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46,41868311/1/2000
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166,105733111/1/2000
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38,184933211/1/2000
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18,198199511/1/2000
...,...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118,3127902/28/2001
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55,574862/28/2001
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115,7335262/28/2001
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96,1737042/28/2001


In [4]:
record=[]
for i in tqdm(df['orderID'].value_counts().index):
    member = df[df['orderID']==i]
    record.append(member['PRODUCT_SUBCLASS'].values.tolist()) 

100%|█████████████████████████████████| 119578/119578 [1:20:23<00:00, 24.79it/s]


In [5]:
record

[[100110,
  110407,
  570404,
  500206,
  110203,
  530104,
  100217,
  110209,
  100106,
  500201,
  520314,
  530502,
  110508,
  130317,
  100312,
  110208,
  100106,
  100308,
  110411,
  100308,
  100205,
  100323,
  130206,
  590531,
  110508,
  100201,
  100310,
  100304,
  120204,
  110105,
  100324,
  500103,
  110814,
  100324,
  510101,
  100324,
  570308,
  100301,
  110508,
  100102,
  100306,
  100205,
  110121,
  100205,
  100324,
  110207,
  100102,
  100603,
  100324,
  530108,
  110207,
  560201,
  100303,
  100201,
  100310,
  110108,
  110112,
  100320,
  110505,
  110217,
  110102,
  501002,
  110117,
  510302,
  100205,
  100322,
  500106,
  100109,
  110136,
  530302,
  501129,
  100322,
  110106,
  100312,
  100217,
  500201,
  130106,
  500203,
  130205,
  530403,
  100307,
  100110,
  100303,
  100309,
  100303,
  110137,
  130317,
  110103,
  500201,
  110507,
  530110,
  100309,
  501001,
  100301,
  100310,
  110508,
  500205,
  100212,
  501001,
  100304,


## 計算出所有最小信心水準有1%且最小正相關要大於1的商品組合

In [6]:
association_rules = apriori(record, min_support = 0.01, min_lift = 1.00000001)
association_results = list(association_rules)

In [7]:
association_results

[RelationRecord(items=frozenset({100205, 100102}), support=0.016533141547776346, ordered_statistics=[OrderedStatistic(items_base=frozenset({100102}), items_add=frozenset({100205}), confidence=0.2960910588587689, lift=2.4120155757349866), OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100102}), confidence=0.13468219905988146, lift=2.4120155757349866)]),
 RelationRecord(items=frozenset({100201, 100205}), support=0.01992005218351202, ordered_statistics=[OrderedStatistic(items_base=frozenset({100201}), items_add=frozenset({100205}), confidence=0.3576039633688636, lift=2.913111705955581), OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100201}), confidence=0.16227263437563869, lift=2.9131117059555804)]),
 RelationRecord(items=frozenset({100312, 100205}), support=0.025305658231447255, ordered_statistics=[OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100312}), confidence=0.2061448327542748, lift=3.6320004141875164), OrderedS

In [8]:
## 支持度
association_results[0][1]

0.016533141547776346

In [9]:
product = []
support = []
for i in range(len(association_results)):
    product.append(association_results[i][0])
    support.append(association_results[i][1])
product_df = pd.DataFrame()
product_df['商品組合'] = product
product_df['支持度'] = support
product_df.to_csv('整體顧客-推薦商品組合1.csv')

## 計算出所有最小信心水準有2%且最小正相關要大於1的商品組合

In [10]:
association_rules = apriori(record, min_support = 0.02, min_lift = 1.00000001)
association_results = list(association_rules)

In [11]:
product = []
support = []
for i in range(len(association_results)):
    product.append(association_results[i][0])
    support.append(association_results[i][1])
product_df = pd.DataFrame()
product_df['商品組合'] = product
product_df['支持度'] = support
product_df.to_csv('整體顧客-推薦商品組合2.csv')

## 計算出所有最小信心水準有1%且最小正相關要大於1的商品組合

In [9]:
association_rules = apriori(record, min_support = 0.01, min_lift = 1.00000001)
association_results = list(association_rules)

In [10]:
association_results

[RelationRecord(items=frozenset({100205, 100102}), support=0.016533141547776346, ordered_statistics=[OrderedStatistic(items_base=frozenset({100102}), items_add=frozenset({100205}), confidence=0.2960910588587689, lift=2.4120155757349866), OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100102}), confidence=0.13468219905988146, lift=2.4120155757349866)]),
 RelationRecord(items=frozenset({100201, 100205}), support=0.01992005218351202, ordered_statistics=[OrderedStatistic(items_base=frozenset({100201}), items_add=frozenset({100205}), confidence=0.3576039633688636, lift=2.913111705955581), OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100201}), confidence=0.16227263437563869, lift=2.9131117059555804)]),
 RelationRecord(items=frozenset({100312, 100205}), support=0.025305658231447255, ordered_statistics=[OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100312}), confidence=0.2061448327542748, lift=3.6320004141875164), OrderedS

## 針對我們的常貴客進行關聯性分析

In [11]:
## 找出所有常貴客的CUSTOMER ID
target_date = pd.read_csv('purchase_list.csv')
ta_list = target_date[target_date['customer'] == '常貴客']['CUSTOMER_ID'].unique()

In [12]:
## 過濾出所有常貴客的資料
new_df = df[df['CUSTOMER_ID'].isin(ta_list)]
new_df

## 日期 + Customer ID
new_df['orderID'] = new_df['CUSTOMER_ID'].astype(str) + new_df['TRANSACTION_DT']
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['orderID'] = new_df['CUSTOMER_ID'].astype(str) + new_df['TRANSACTION_DT']


Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE,orderID
118,11/1/2000,439725,55-59,115,130315,4714981010038,2,56,44,43972511/1/2000
350,11/1/2000,439725,55-59,115,110411,4710088414106,2,56,66,43972511/1/2000
378,11/1/2000,439725,55-59,115,110411,4710085120628,3,60,57,43972511/1/2000
472,11/1/2000,439725,55-59,115,110411,4710085172900,2,46,54,43972511/1/2000
487,11/1/2000,439725,55-59,115,110504,4710626111610,1,23,26,43972511/1/2000
...,...,...,...,...,...,...,...,...,...,...
816314,2/28/2001,2112596,35-39,115,100208,4710144202227,1,33,45,21125962/28/2001
816541,2/28/2001,2112596,35-39,115,100213,4710304111147,1,44,49,21125962/28/2001
816760,2/28/2001,2112596,35-39,115,100315,4711202220016,1,30,39,21125962/28/2001
816945,2/28/2001,2112596,35-39,115,731302,4717536400292,1,62,85,21125962/28/2001


In [13]:
record=[]
for i in tqdm(new_df['orderID'].value_counts().index):
    member = new_df[new_df['orderID']==i]
    record.append(member['PRODUCT_SUBCLASS'].values.tolist()) 

100%|█████████████████████████████████████| 1330/1330 [00:01<00:00, 1224.01it/s]


In [14]:
association_rules = apriori(record, min_support = 0.02, min_lift = 1.00000001)
association_results = list(association_rules)

In [15]:
association_results

[RelationRecord(items=frozenset({100205, 100102}), support=0.03684210526315789, ordered_statistics=[OrderedStatistic(items_base=frozenset({100102}), items_add=frozenset({100205}), confidence=0.4537037037037036, lift=2.4331690561529267), OrderedStatistic(items_base=frozenset({100205}), items_add=frozenset({100102}), confidence=0.1975806451612903, lift=2.4331690561529267)]),
 RelationRecord(items=frozenset({130204, 100102}), support=0.02180451127819549, ordered_statistics=[OrderedStatistic(items_base=frozenset({100102}), items_add=frozenset({130204}), confidence=0.2685185185185185, lift=1.6457586618876938), OrderedStatistic(items_base=frozenset({130204}), items_add=frozenset({100102}), confidence=0.1336405529953917, lift=1.645758661887694)]),
 RelationRecord(items=frozenset({130206, 100102}), support=0.02330827067669173, ordered_statistics=[OrderedStatistic(items_base=frozenset({100102}), items_add=frozenset({130206}), confidence=0.28703703703703703, lift=1.5518669075579643), OrderedStat

In [29]:
str(association_results[0][0])
product = product.append(association_results[i][0])

'frozenset({100205, 100102})'

In [37]:
product = []
support = []
for i in range(len(association_results)):
    product.append(association_results[i][0])
    support.append(association_results[i][1])
product_df = pd.DataFrame()
product_df['商品組合'] = product
product_df['支持度'] = support
product_df.to_csv('常規客-推薦商品組合.csv')