# Association rule mining
Can you bundle items that go together based on historical transactions?
###### This is not recommendations (netflix, amazon, etc)

In [1]:
import pandas as pd

In [6]:
dfRet = pd.read_csv('retail_transactions.csv', encoding='ISO-8859-1')

In [7]:
dfRet.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


## Prepare the data for basket analysis

In [14]:
# Replace spaces/special characters in 'Description'
dfRet['clean_description'] = dfRet['Description'].str.replace(' ', '_')
dfRet['clean_description'].str.replace('\W', '')

0           WHITE_HANGING_HEART_TLIGHT_HOLDER
1                         WHITE_METAL_LANTERN
2              CREAM_CUPID_HEARTS_COAT_HANGER
3         KNITTED_UNION_FLAG_HOT_WATER_BOTTLE
4               RED_WOOLLY_HOTTIE_WHITE_HEART
                         ...                 
541904            PACK_OF_20_SPACEBOY_NAPKINS
541905            CHILDRENS_APRON_DOLLY_GIRL_
541906          CHILDRENS_CUTLERY_DOLLY_GIRL_
541907        CHILDRENS_CUTLERY_CIRCUS_PARADE
541908          BAKING_SET_9_PIECE_RETROSPOT_
Name: clean_description, Length: 541909, dtype: object

In [19]:
# Drop null values
dfRet.dropna(inplace=True)

In [20]:
dfRet.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,clean_description
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,WHITE_HANGING_HEART_T-LIGHT_HOLDER
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,WHITE_METAL_LANTERN
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,CREAM_CUPID_HEARTS_COAT_HANGER
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,KNITTED_UNION_FLAG_HOT_WATER_BOTTLE
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,RED_WOOLLY_HOTTIE_WHITE_HEART.


In [21]:
# Convert to list format

dfRetList = dfRet.groupby('InvoiceNo').clean_description.apply(list)

In [22]:
dfRetList

InvoiceNo
536365     [WHITE_HANGING_HEART_T-LIGHT_HOLDER, WHITE_MET...
536366     [HAND_WARMER_UNION_JACK, HAND_WARMER_RED_POLKA...
536367     [ASSORTED_COLOUR_BIRD_ORNAMENT, POPPY'S_PLAYHO...
536368     [JAM_MAKING_SET_WITH_JARS, RED_COAT_RACK_PARIS...
536369                            [BATH_BUILDING_BLOCK_WORD]
                                 ...                        
C581484                        [PAPER_CRAFT_,_LITTLE_BIRDIE]
C581490    [VICTORIAN_GLASS_HANGING_T-LIGHT, ZINC_T-LIGHT...
C581499                                             [Manual]
C581568                         [VICTORIAN_SEWING_BOX_LARGE]
C581569    [HANGING_HEART_JAR_T-LIGHT_HOLDER, 36_PENCILS_...
Name: clean_description, Length: 22190, dtype: object

## Build Model

In [23]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.17.2-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 1.9 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.2


### Association Rule Mining (Mkt. Bsk. Analysis)

In [25]:
# Encode data as transaction matrix
from mlxtend.preprocessing import TransactionEncoder

mdlRetTe = TransactionEncoder()
mdlRetTe_array = mdlRetTe.fit(dfRetList).transform(dfRetList)
dfRtb = pd.DataFrame(mdlRetTe_array, columns=mdlRetTe.columns_)

Unnamed: 0,10_COLOUR_SPACEBOY_PEN,12_COLOURED_PARTY_BALLOONS,12_DAISY_PEGS_IN_WOOD_BOX,12_EGG_HOUSE_PAINTED_WOOD,12_HANGING_EGGS_HAND_PAINTED,12_IVORY_ROSE_PEG_PLACE_SETTINGS,12_MESSAGE_CARDS_WITH_ENVELOPES,12_PENCILS_SMALL_TUBE_RED_RETROSPOT,12_PENCILS_SMALL_TUBE_SKULL,12_PENCILS_TALL_TUBE_POSY,...,_DOLLY_GIRL_BEAKER,_I_LOVE_LONDON_MINI_BACKPACK,_I_LOVE_LONDON_MINI_RUCKSACK,_NINE_DRAWER_OFFICE_TIDY,_OVAL_WALL_MIRROR_DIAMANTE_,_RED_SPOT_GIFT_BAG_LARGE,_SET_2_TEA_TOWELS_I_LOVE_LONDON_,_SPACEBOY_BABY_GIFT_SET,_TOADSTOOL_BEDSIDE_LIGHT_,_TRELLIS_COAT_RACK
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22185,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22186,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22187,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22188,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [26]:
#frequent_itemsetsthe items and itemset with at least 1% support

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(dfRtb, min_support=0.01, use_colnames=True)

In [29]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.011221,(10_COLOUR_SPACEBOY_PEN)
1,0.014015,(12_PENCILS_SMALL_TUBE_RED_RETROSPOT)
2,0.013249,(12_PENCILS_SMALL_TUBE_SKULL)
3,0.010680,(12_PENCILS_TALL_TUBE_RED_RETROSPOT)
4,0.012528,(12_PENCIL_SMALL_TUBE_WOODLAND)
...,...,...
734,0.010140,"(LUNCH_BAG_RED_RETROSPOT, LUNCH_BAG_WOODLAND, ..."
735,0.011447,"(LUNCH_BAG_RED_RETROSPOT, LUNCH_BAG__BLACK_SKU..."
736,0.010455,"(LUNCH_BAG__BLACK_SKULL., LUNCH_BAG_RED_RETROS..."
737,0.012213,"(PINK_REGENCY_TEACUP_AND_SAUCER, ROSES_REGENCY..."


In [30]:
# Evaluate metrics and filter items/itemsets that have at least 70% confidence

from mlxtend.frequent_patterns import association_rules

a_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

In [32]:
a_rules.sort_values(by=['confidence'], ascending=False, inplace=True)
a_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
34,"(ROSES_REGENCY_TEACUP_AND_SAUCER_, PINK_REGENC...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.012213,0.033033,0.010861,0.889299,26.921613,0.010457,8.734936
23,"(ROSES_REGENCY_TEACUP_AND_SAUCER_, PINK_REGENC...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.020324,0.033033,0.017891,0.880266,26.648164,0.01722,8.075966
33,"(GREEN_REGENCY_TEACUP_AND_SAUCER, PINK_REGENCY...",(ROSES_REGENCY_TEACUP_AND_SAUCER_),0.012348,0.037675,0.010861,0.879562,23.34627,0.010396,7.990217
21,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.014376,0.033033,0.012348,0.858934,26.002386,0.011873,6.854722
32,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES...",(ROSES_REGENCY_TEACUP_AND_SAUCER_),0.014376,0.037675,0.012213,0.84953,22.549122,0.011671,6.395454
