## 方法一: 使用apyori套件

In [1]:
## Import package
from apyori import apriori
## Data 自行定義數據
market_data = [['T-Shirt','Pants','Jeans','Jersy','Socks','Basketball','Bottle','Shorts'],
 ['T-Shirt','Jeans'],
 ['Jersy','Basketball','Socks','Bottle'],
 ['Jeans','Pants','Bottle'],
 ['Shorts','Basketball'],
 ['Shorts','Jersy'],
 ['T-Shirt'],
 ['Basketball','Jersy'],
 ]
association_rules = apriori(market_data, min_support=0.2, min_confidence=0.2, min_lift=2, max_length=2)
association_results = list(association_rules)
##print(association_results )
for product in association_results:
 #print(product) # ex. RelationRecord(items=frozenset({'Basketball', 'Socks'}), support=0.25, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Basketball'}), items_add=frozenset({'Socks'}), confidence=0.5, lift=2.0), OrderedStatistic(items_base=frozenset({'Socks'}), items_add=frozenset({'Basketball'}), confidence=1.0, lift=2.0)])
 pair = product[0] 
 ##print(pair) ## ex. frozenset({'Basketball', 'Socks'})
 products = [x for x in pair]
 print(products) # ex. ['Basketball', 'Socks']
 print("Rule: " + products[0] + " →" + products[1])
 print("Support: " + str(product[1]))
 print("Lift: " + str(product[2][0][3]))
 print("==================================")

['Basketball', 'Socks']
Rule: Basketball →Socks
Support: 0.25
Lift: 2.0
['Pants', 'Bottle']
Rule: Pants →Bottle
Support: 0.25
Lift: 2.6666666666666665
['Bottle', 'Socks']
Rule: Bottle →Socks
Support: 0.25
Lift: 2.6666666666666665
['Pants', 'Jeans']
Rule: Pants →Jeans
Support: 0.25
Lift: 2.6666666666666665
['Jersy', 'Socks']
Rule: Jersy →Socks
Support: 0.25
Lift: 2.0


## 方法二: 使用mlxtend套件，將數據轉換成one-hot編碼

In [3]:
## Import Package
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
## Data 自行定義數據
market_data = {
 'Transaction ID': [1,2,3,4,5,6,7,8],
 'Items':[['T-Shirt','Pants','Jeans','Jersy','Socks','Basketball','Bottle','Shorts'],
 ['T-Shirt','Jeans'],
 ['Jersy','Basketball','Socks','Bottle'],
 ['Jeans','Pants','Bottle'],
 ['Shorts','Basketball'],
 ['Shorts','Jersy'],
 ['T-Shirt'],
 ['Basketball','Jersy'],
 ]}
## 轉成DataFrame
data = pd.DataFrame(market_data)
## 讓DataFrame 能呈現的寬度大一點
pd.options.display.max_colwidth = 100
## 轉成數值編碼，目前都是字串的組合
data_id = data.drop('Items', 1)
data_items = data.Items.str.join(',')
## 轉成數值
data_items = data_items.str.get_dummies(',')
## 接上Transaction ID
data = data_id.join(data_items)
## 計算支持度 Support
Support_items = apriori(data[['T-Shirt','Pants','Jeans','Jersy','Socks','Basketball','Bottle','Shorts']], min_support=0.20, use_colnames = True)
## 計算關聯規則 Association Rule
Association_Rules = association_rules(Support_items, metric = 'lift', min_threshold=1)

Association_Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Jeans),(T-Shirt),0.375,0.375,0.25,0.666667,1.777778,0.109375,1.875
1,(T-Shirt),(Jeans),0.375,0.375,0.25,0.666667,1.777778,0.109375,1.875
2,(Pants),(Jeans),0.250,0.375,0.25,1.000000,2.666667,0.156250,inf
3,(Jeans),(Pants),0.375,0.250,0.25,0.666667,2.666667,0.156250,2.250
4,(Pants),(Bottle),0.250,0.375,0.25,1.000000,2.666667,0.156250,inf
...,...,...,...,...,...,...,...,...,...
63,"(Socks, Basketball)","(Jersy, Bottle)",0.250,0.250,0.25,1.000000,4.000000,0.187500,inf
64,(Jersy),"(Bottle, Socks, Basketball)",0.500,0.250,0.25,0.500000,2.000000,0.125000,1.500
65,(Bottle),"(Jersy, Socks, Basketball)",0.375,0.250,0.25,0.666667,2.666667,0.156250,2.250
66,(Socks),"(Jersy, Bottle, Basketball)",0.250,0.250,0.25,1.000000,4.000000,0.187500,inf


## 方法三: 基本分析方法

In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [5]:
data = {'ID': [1,2,3,4,5,6],
        'Onion':[1,0,0,1,1,1],
        'Potato': [1,1,0,1,1,1],
        'Burger':[1,1,0,0,1,1],
        'Milk':[0,1,1,1,0,1],
        'Beer': [0,0,1,0,1,0]}

In [6]:
df = pd.DataFrame(data)

In [7]:
df = df[['ID', 'Onion', 'Potato', 'Burger', 'Milk', 'Beer']]

In [8]:
df

Unnamed: 0,ID,Onion,Potato,Burger,Milk,Beer
0,1,1,1,1,0,0
1,2,0,1,1,1,0
2,3,0,0,0,1,1
3,4,1,1,0,1,0
4,5,1,1,1,0,1
5,6,1,1,1,1,0


In [9]:
frequent_itemsets = apriori(df[['Onion', 'Potato', 'Burger','Milk','Beer']], min_support =0.50, use_colnames=True)

In [10]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Onion)
1,0.833333,(Potato)
2,0.666667,(Burger)
3,0.666667,(Milk)
4,0.666667,"(Onion, Potato)"
5,0.5,"(Burger, Onion)"
6,0.666667,"(Burger, Potato)"
7,0.5,"(Potato, Milk)"
8,0.5,"(Burger, Onion, Potato)"


In [11]:
# min_threshlod = 1 最小的lift值須等於1 不然沒有意義
rules = association_rules(frequent_itemsets, metric = 'lift', min_threshold=1)

In [12]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
1,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667
2,(Burger),(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
3,(Onion),(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
5,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf
7,"(Burger, Potato)",(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
8,"(Onion, Potato)",(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
9,(Burger),"(Onion, Potato)",0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333


In [13]:
rules [( rules['lift']>1.125) & (rules['confidence']>0.8)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf


ONE HOT ENCODER

## 方法二: 如何把項目集資料轉化為可以分析的格式(ONE HOT ENCODER)

In [11]:
retail_shopping_basket = {'ID':[1,2,3,4,5,6],
                         'Basket':[['Beer','Diaper', 'Pretzels', 'Chips', 'Aspirin'],
                         ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'Babyfood', 'Milk'],
                         ['Soda', 'Chips', 'Milk'],
                         ['Soup', 'Beer', 'Diaper', 'Milk', 'IceCream'],
                         ['Soda', 'Coffee', 'Milk', 'Bread'],
                         ['Beer', 'Chips']
                                  ]
                         }

retail = pd.DataFrame(retail_shopping_basket)

In [12]:
retail

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, Babyfood,..."
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Coffee, Milk, Bread]"
5,6,"[Beer, Chips]"


In [13]:
retail = retail[['ID', 'Basket']]

In [14]:
pd.options.display.max_colwidth = 100

In [15]:
retail

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, Babyfood, Milk]"
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Coffee, Milk, Bread]"
5,6,"[Beer, Chips]"


In [16]:
retail_id = retail.drop('Basket', axis =1)
retail_id

Unnamed: 0,ID
0,1
1,2
2,3
3,4
4,5
5,6


In [17]:
retail_Basket = retail.Basket.str.join(',')
retail_Basket

0              Beer,Diaper,Pretzels,Chips,Aspirin
1    Diaper,Beer,Chips,Lotion,Juice,Babyfood,Milk
2                                 Soda,Chips,Milk
3                  Soup,Beer,Diaper,Milk,IceCream
4                          Soda,Coffee,Milk,Bread
5                                      Beer,Chips
Name: Basket, dtype: object

In [18]:
retail_Basket = retail_Basket.str.get_dummies(',')
retail_Basket

Unnamed: 0,Aspirin,Babyfood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,0,1,1,0,1,0,1,0,1,1,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [19]:
retail = retail_id.join(retail_Basket)
retail

Unnamed: 0,ID,Aspirin,Babyfood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,2,0,1,1,0,1,0,1,0,1,1,1,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,4,0,0,1,0,0,0,1,1,0,0,1,0,0,1
4,5,0,0,0,1,0,1,0,0,0,0,1,0,1,0
5,6,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [20]:
frequent_itemsets_2 = apriori(retail.drop('ID',1), use_colnames=True)

In [21]:
frequent_itemsets_2

Unnamed: 0,support,itemsets
0,0.666667,(Beer)
1,0.666667,(Chips)
2,0.5,(Diaper)
3,0.666667,(Milk)
4,0.5,"(Beer, Chips)"
5,0.5,"(Beer, Diaper)"


In [22]:
association_rules(frequent_itemsets_2, metric='lift')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Beer),(Chips),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
1,(Chips),(Beer),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333
2,(Beer),(Diaper),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0
3,(Diaper),(Beer),0.5,0.666667,0.5,1.0,1.5,0.166667,inf


## 方法三: 電影題材關聯 導入數據庫CSV檔案進行分析

In [15]:
movies = pd.read_csv('movies.csv')

In [16]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [17]:
movies_ohe = movies.drop('genres', 1).join(movies.genres.str.get_dummies())

In [18]:
pd.options.display.max_columns=100

In [19]:
movies_ohe.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
movies_ohe.shape

(9742, 20)

In [36]:
movies_ohe.columns
movies_ohe.set_index(['movieId','title'],inplace=True)
## inplace = True : 不創建新的對象，直接對原本的對象進行修改
## inplace = False : 對數據進行修改，創建並返回新的對象承載它的修改結果

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [26]:
movies_ohe.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
frequent_itemsets_movies = apriori(movies_ohe,use_colnames=True, min_support=0.025)

In [28]:
frequent_itemsets_movies

Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.045165,(Documentary)
7,0.447649,(Drama)
8,0.079963,(Fantasy)
9,0.10039,(Horror)


In [29]:
rules_movies = association_rules(frequent_itemsets_movies, metric='lift', min_threshold=1.25)

In [30]:
rules_movies

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,0.038289,1.306247
1,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,0.038289,1.571224
2,(Action),(Crime),0.187641,0.123075,0.042907,0.228665,1.857929,0.019813,1.136892
3,(Crime),(Action),0.123075,0.187641,0.042907,0.348624,1.857929,0.019813,1.247142
4,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,0.027419,1.504937
5,(Action),(Sci-Fi),0.187641,0.100595,0.046294,0.246718,2.452576,0.027419,1.193981
6,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,0.030754,1.241814
7,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,0.030754,1.25542
8,(Adventure),(Animation),0.129645,0.062718,0.025354,0.195566,3.118175,0.017223,1.165145
9,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,0.017223,1.460953


In [31]:
rules_movies[(rules_movies.lift>4)].sort_values(by=['lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,(Animation),(Children),0.062718,0.068158,0.031,0.494272,7.251799,0.026725,1.842573
17,(Children),(Animation),0.068158,0.062718,0.031,0.454819,7.251799,0.026725,1.719213


In [32]:
movies[(movies.genres.str.contains('Children')) & (movies.genres.str.contains('Animation'))]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12,13,Balto (1995),Adventure|Animation|Children
44,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
205,239,"Goofy Movie, A (1995)",Animation|Children|Comedy|Romance
272,313,"Swan Princess, The (1994)",Animation|Children
...,...,...,...
9629,178827,Paddington 2 (2017),Adventure|Animation|Children|Comedy
9657,180987,Ferdinand (2017),Animation|Children|Comedy
9664,182293,Hare-um Scare-um (1939),Animation|Children|Comedy
9666,182299,Porky's Hare Hunt (1938),Animation|Children|Comedy


In [68]:
movies[(movies.genres.str.contains('Children')) & (~movies.genres.str.contains('Animation'))]

Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
7,8,Tom and Huck (1995),Adventure|Children
26,27,Now and Then (1995),Children|Drama
32,34,Babe (1995),Children|Drama
34,38,It Takes Two (1995),Children|Comedy
...,...,...,...
9636,179401,Jumanji: Welcome to the Jungle (2017),Action|Adventure|Children
9670,182731,Pixel Perfect (2004),Children|Comedy|Sci-Fi
9679,183301,The Tale of the Bunny Picnic (1986),Children
9697,184987,A Wrinkle in Time (2018),Adventure|Children|Fantasy|Sci-Fi
