# Unsupervised Learning 2 

## Lesson 3: Market Basket Analysis

### Part 1: Identifying the 'Small' Customers

In [25]:
import numpy as np
import pandas as pd

In [26]:
# Load in the datasets identified in the previous lesson as small customers
df = pd.read_csv("small_cust.csv")

In [27]:
# Display some simple information to get a big picture of the data
df.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12821,215,6,92.72
1,12823,75,5,1759.5
2,12829,337,11,293.0
3,12831,263,9,215.05
4,12833,146,24,417.38


In [28]:
df.describe()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
count,1327.0,1327.0,1327.0,1327.0
mean,15579.752826,163.141673,14.232856,284.25517
std,1610.318755,101.54073,10.731013,261.069377
min,12821.0,2.0,1.0,3.75
25%,14175.0,68.0,7.0,142.13
50%,15565.0,160.0,12.0,223.85
75%,16989.0,250.0,20.0,344.52
max,18282.0,363.0,77.0,4055.72


In [29]:
df.shape

(1327, 4)

In [30]:
# Get the total number of transactions for these customers
sum(df.Frequency)

18887

### Part 2: Getting the transactions

In [31]:
# Read in the filtered transaction data since the RFM was generated from this
tr = pd.read_csv("OnlineRetail_Filtered.csv")
print(tr.shape)

(339702, 8)


In [32]:
tr[tr.Description == 'POSTAGE'].shape

(27, 8)

In [33]:
# Filter dataframe to keep only transactions from the small customers
tf_f = tr[tr['CustomerID'].isin(df.CustomerID)]

In [34]:
tf_f.shape # Validate that transaction count is the same

(18887, 8)

In [35]:
# These 2 features will be used to group the data.
# Description is used instead of StockID as it is human readable when we analyze the assoc rules later
tf_f['Description'] = tf_f['Description'].str.strip()
tf_f['InvoiceNo'] = tf_f['InvoiceNo'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [36]:
# The result should be 1 row per transaction. Each column is the quantiy purchased for that item
basket = (tf_f.groupby(['InvoiceNo','Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

#drop postage entries
basket.drop('POSTAGE', inplace=True, axis=1)

In [37]:
'POSTAGE' in (basket.columns)

False

In [38]:
basket.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,...,ZINC HEARTS PLANT POT HOLDER,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
538368,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Part 3: Generating the Association Rules

In [39]:
# import libraries
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [40]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

# We only need binary encoding for the assoc rules
basket_sets = basket.applymap(encode_units)

In [41]:
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)

In [42]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PAINTED METAL PEARS ASSORTED),(ASSORTED COLOUR BIRD ORNAMENT),0.023729,0.050282,0.019209,0.809524,16.099518,0.018016,4.986017
1,(ASSORTED COLOUR BIRD ORNAMENT),(PAINTED METAL PEARS ASSORTED),0.050282,0.023729,0.019209,0.382022,16.099518,0.018016,1.579784
2,(BAKING SET 9 PIECE RETROSPOT),(BAKING SET SPACEBOY DESIGN),0.042373,0.019774,0.015819,0.373333,18.88,0.014981,1.56419
3,(BAKING SET SPACEBOY DESIGN),(BAKING SET 9 PIECE RETROSPOT),0.019774,0.042373,0.015819,0.8,18.88,0.014981,4.788136
4,(BLUE HAPPY BIRTHDAY BUNTING),(PINK HAPPY BIRTHDAY BUNTING),0.015254,0.014124,0.011299,0.740741,52.444444,0.011084,3.802663


In [43]:
rules[ (rules['lift'] >= 6) &
       (rules['confidence'] >= 0.8) ].sort_values(['lift','confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5,(PINK HAPPY BIRTHDAY BUNTING),(BLUE HAPPY BIRTHDAY BUNTING),0.014124,0.015254,0.011299,0.8,52.444444,0.011084,4.923729
38,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.016384,0.031073,0.014689,0.896552,28.852665,0.01418,9.36629
8,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.024294,0.031073,0.019774,0.813953,26.194503,0.019019,5.20798
32,"(REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP...",(GREEN REGENCY TEACUP AND SAUCER),0.014689,0.031073,0.011864,0.807692,25.993007,0.011408,5.038418
0,(PAINTED METAL PEARS ASSORTED),(ASSORTED COLOUR BIRD ORNAMENT),0.023729,0.050282,0.019209,0.809524,16.099518,0.018016,4.986017
