In [1]:
import json
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyfpgrowth
import pylab as pl

import fp_growth_py3 as fpg

## 1. 数据预处理
从文件中读取数据集，对related_same_month_brand属性进行关联规则挖掘。
数据预处理主要是丢弃缺失值：读取时将空的JSON值（如空列表、空字典）转换为NaN，再用dropna去除。

In [2]:
def prefunction(x):
    """Decode one JSON-encoded CSV field, mapping empty values to NaN.

    Intended for use as a ``pandas.read_csv`` converter, which passes each
    raw cell to this function.

    Parameters
    ----------
    x : str
        Raw cell text expected to contain a JSON document.

    Returns
    -------
    object
        The decoded JSON value, or ``np.nan`` when the cell is blank or the
        decoded value is falsy (``[]``, ``{}``, ``""``, ``0``, ``null``), so
        that a later ``dropna()`` removes it.

    Raises
    ------
    json.JSONDecodeError
        If the cell is non-blank but is not valid JSON.
    """
    # A blank cell is not valid JSON: json.loads("") would raise and abort the
    # whole read_csv call, so treat it as a missing value instead.
    if isinstance(x, str) and not x.strip():
        return np.nan
    decoded = json.loads(x)
    return decoded if decoded else np.nan

# Every JSON-encoded column is decoded with the same converter while the CSV
# is being parsed.
converts = {
    column: prefunction
    for column in (
        "visitor_home_cbgs",
        "visitor_work_cbgs",
        "related_same_day_brand",
        "related_same_month_brand",
        "top_brands",
        "popularity_by_hour",
        "popularity_by_day",
    )
}

data = pd.read_csv("cbg_patterns.csv", converters=converts)

# One transaction per row: the brand list of that row, with rows whose value
# was converted to NaN dropped.
dataSet = data["related_same_month_brand"].dropna().tolist()
dataSet

[['walmart',
  'mcdonalds',
  'Dollar General',
  'Chick-fil-A',
  'Marathon Petroleum',
  'Shell Oil',
  'Waffle House',
  'SUBWAY',
  'Publix Super Markets',
  'Circle K Stores'],
 ['walmart',
  'mcdonalds',
  'Shell Oil',
  'Chick-fil-A',
  'Dollar General',
  'SUBWAY',
  'Chevron',
  'Taco Bell',
  'Cracker Barrel',
  "Jack's Family Restaurants"],
 ['walmart',
  'Dollar General',
  'mcdonalds',
  'Chevron',
  'Shell Oil',
  'Sonic',
  'SUBWAY',
  "Wendy's",
  'Marathon Petroleum',
  "Jack's Family Restaurants"],
 ['walmart',
  'Dollar General',
  'mcdonalds',
  'Marathon Petroleum',
  'Chick-fil-A',
  'Waffle House',
  'Sonic',
  "Hardee's",
  'Publix Super Markets',
  "America's Thrift Store"],
 ['walmart',
  'Chevron',
  'Dollar General',
  'Shell Oil',
  "Jack's Family Restaurants",
  'mcdonalds',
  'Chick-fil-A',
  "America's Thrift Store",
  'Taco Bell',
  'SUBWAY'],
 ['walmart',
  'Shell Oil',
  'Dollar General',
  'mcdonalds',
  'Chevron',
  'Exxon Mobil',
  'SUBWAY',
  'Cir

## 2.找出频繁项集
利用FP-Growth算法，构造FP-tree，从FP-tree中找到频繁项集。设置最小支持度为0.2

输出所有频繁项集，及其支持度。

In [3]:
frequent_itemsets = fpg.find_frequent_itemsets(
    dataSet,
    minimum_support=0.2 * len(dataSet),
    include_support=True,
)
print(type(frequent_itemsets))   # print type

# Drain the generator into a list, turning each support count into a
# fraction of the number of transactions.
result = [(itemset, support / len(dataSet)) for itemset, support in frequent_itemsets]

result_patterns = [pair[0] for pair in result]
result_support = [pair[1] for pair in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})

# Lookup table keyed by frozenset so an itemset can be found regardless of
# the order in which its items are listed.
patterns = {frozenset(sorted(pair[0])): pair[1] for pair in result}
patterns_df

<class 'generator'>


Unnamed: 0,fluent_patterns,support
0,[mcdonalds],0.948042
1,[walmart],0.754744
2,"[mcdonalds, walmart]",0.740714
3,[SUBWAY],0.48163
4,"[mcdonalds, SUBWAY]",0.474283
5,"[walmart, SUBWAY]",0.360681
6,"[mcdonalds, walmart, SUBWAY]",0.358426
7,[Shell Oil],0.358114
8,"[mcdonalds, Shell Oil]",0.346091
9,"[walmart, Shell Oil]",0.303474


## 3.导出关联规则
从FP-tree和频繁项集中导出关联规则，并计算关联规则的置信度。最小置信度阈值设为0.7。

In [4]:
def generate_rules(patterns, min_confidence):
    """Derive association rules from a table of frequent itemsets.

    For every itemset with at least two items, each non-empty proper subset
    is tried as the antecedent (``base_set``) and the remaining items as the
    consequent (``predict_set``). The rule confidence is
    ``support(itemset) / support(base_set)``.

    Parameters
    ----------
    patterns : dict
        Maps each frequent itemset (a ``frozenset``) to its support.
    min_confidence : float
        A rule is kept only if its confidence is strictly greater than this.

    Returns
    -------
    collections.defaultdict
        Maps each antecedent ``frozenset`` to a set of
        ``(predict_set, confidence)`` tuples.

    Notes
    -----
    The number of candidate antecedents grows as ``2 ** len(itemset)``.
    Antecedents whose support is missing from ``patterns`` (or is zero) are
    skipped, because no confidence can be computed for them.
    """
    raw_rules = defaultdict(set)
    for pattern, support in patterns.items():
        # A single item cannot be split into antecedent and consequent.
        if len(pattern) < 2:
            continue
        item_list = list(pattern)
        # Enumerate every non-empty proper subset, not only contiguous runs
        # of item_list; contiguous slices miss rules such as {a, c} -> {b, d}.
        for size in range(1, len(item_list)):
            for antecedent_items in combinations(item_list, size):
                base_set = frozenset(antecedent_items)
                base_support = patterns.get(base_set)
                if not base_support:
                    continue
                confidence = support / base_support
                if confidence > min_confidence:
                    predict_set = frozenset(pattern - base_set)
                    raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    """Bucket ``(pattern, support)`` pairs by the number of items in the pattern.

    Parameters
    ----------
    patterns : dict
        Maps each itemset to its support.

    Returns
    -------
    collections.defaultdict
        Maps an itemset length to the list of ``(pattern, support)`` pairs
        having that length, in the iteration order of ``patterns``.
    """
    buckets = defaultdict(list)
    for itemset in patterns:
        buckets[len(itemset)].append((itemset, patterns[itemset]))
    return buckets

def transform(raw_rules):
    """Flatten a rule mapping into a list of triples.

    Parameters
    ----------
    raw_rules : dict
        Maps an antecedent to an iterable of ``(predict_set, confidence)``
        pairs.

    Returns
    -------
    list
        One ``(base_set, predict_set, confidence)`` tuple per rule.
    """
    return [
        (antecedent, consequent, rule_confidence)
        for antecedent, consequents in raw_rules.items()
        for consequent, rule_confidence in consequents
    ]

# Keep only rules whose confidence exceeds 0.7, then list the most
# confident rules first.
raw_rules = generate_rules(patterns, 0.7)
rules = sorted(transform(raw_rules), key=lambda rule: rule[2], reverse=True)

# Split the (antecedent, consequent, confidence) triples into columns.
rules_a = [antecedent for antecedent, _, _ in rules]
rules_b = [consequent for _, consequent, _ in rules]
confidence = [score for _, _, score in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
rules_df

Unnamed: 0,rules_a,rules_b,confidence
0,"(SUBWAY, walmart)",(mcdonalds),0.993748
1,(BP),(mcdonalds),0.985789
2,(SUBWAY),(mcdonalds),0.984744
3,"(Chick-fil-A, walmart)",(mcdonalds),0.984686
4,"(starbucks, walmart)",(mcdonalds),0.984399
5,"(Shell Oil, walmart)",(mcdonalds),0.983475
6,"(Dollar General, walmart)",(mcdonalds),0.983036
7,(walmart),(mcdonalds),0.981411
8,(Dollar General),(mcdonalds),0.978254
9,(Chick-fil-A),(mcdonalds),0.978045


## 4.对关联规则进行评价
使用Lift（提升度）和Kulczynski（Kulc）指标进行评价。

In [None]:
# Lift: support(A ∪ B) / (support(A) * support(B)).
lift = [
    patterns[a | b] / patterns[a] / patterns[b]
    for a, b in zip(rules_df["rules_a"], rules_df["rules_b"])
]
rules_df["lift"] = lift

# Kulc: the mean of confidence(A -> B) and confidence(B -> A), where the
# latter is support(A ∪ B) / support(B).
fsupport = [
    (rule_confidence + patterns[a | b] / patterns[b]) / 2
    for a, b, rule_confidence in zip(
        rules_df["rules_a"], rules_df["rules_b"], rules_df["confidence"]
    )
]
rules_df["kulc"] = fsupport
rules_df

## 5.可视化展示挖掘结果 

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 5))

# Left panel: confidence vs. lift for every rule, coloured by Kulc.
rules_df.plot(kind='hexbin', x="confidence", y="lift", C="kulc", gridsize=10, ax=axes[0], title='Picture 1: evaluation')

# Right panel: one bar per frequent itemset, labelled with the itemset's
# string form.
axes[1].bar([str(i) for i in patterns_df["fluent_patterns"]], patterns_df["support"])
# Rotate the tick labels the bar chart already created. Overwriting them with
# set_xticklabels() without first fixing the tick positions is documented as
# unsafe, because the labels can end up on the wrong ticks.
axes[1].tick_params(axis="x", labelrotation=90)
axes[1].set_xlabel("frequent item set")
axes[1].set_ylabel("support")
axes[1].set_title("Picture2: support of frequent item set")
plt.show()

Picture1展示了每条关联规则的置信度、提升度和Kulc。一般情况下，我们认为提升度大于3的关联规则是一个比较强的关联规则，当提升度为1时两者没有关联，但本数据集中关联规则的提升度大多在1-1.3，所以是比较弱的关联规则。
Picture2展示了所有频繁项集的支持度，可见最受欢迎的品牌是麦当劳，紧随其后的是沃尔玛，同时去麦当劳和沃尔玛的人也很多。

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# One horizontal bar per rule, labelled "antecedent->consequent".
rule_labels = [f"{a}->{b}" for a, b in zip(rules_df["rules_a"], rules_df["rules_b"])]
ax.barh(rule_labels, list(rules_df["confidence"]), height=0.7, align='center', color='#AAAAAA')
# ax.set_yticklabels(rules_df["confidence"])      # the bar labels could also be set here
ax.set_xlabel('confidence')
ax.set_ylabel('association rules')
# Put the first (most confident) rule at the top of the chart.
ax.invert_yaxis()
ax.set_title('Picture3: confidence of association rules')

plt.show()

Picture3展示了所有的关联规则及其对应的置信度，从图中可以看出，所有关联规则的置信度都大于70%。尤其是第一条，去过SUBWAY和walmart的人几乎肯定会去mcdonalds。