## Objective
##### recommend product using association rule

## Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from apyori import apriori

In [2]:
data = pd.read_csv('Groceries_dataset.csv')
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [3]:
all_products = data['itemDescription'].unique()

## Data Understanding

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


In [5]:
data.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


## Data Pre-processing

#### Null & Missing Values

In [11]:
data_null = data.isnull().sum().reset_index()
data_null.columns = ['feature','missing_value']
#data_null['percentage'] = round((data_null['missing_value']/len(data))*100,2)
#data_null = data_null.sort_values('percentage',ascending=False).reset_index(drop=True)
#data_null = data_null[data_null['percentage']>0]
data_null

Unnamed: 0,feature,missing_value
0,Member_number,0
1,Date,0
2,itemDescription,0


#### Duplicate Values

In [12]:
data.duplicated().sum()

759

In [13]:
data = data.drop_duplicates()

In [15]:
data.duplicated().sum()

0

In [16]:
product = data['itemDescription'].value_counts().reset_index()
product.columns = ['product','frequency']
product

Unnamed: 0,product,frequency
0,whole milk,2363
1,other vegetables,1827
2,rolls/buns,1646
3,soda,1453
4,yogurt,1285
...,...,...
162,frozen chicken,5
163,bags,4
164,baby cosmetics,3
165,kitchen utensil,1


#### Transform Dataset

In [17]:
one_hot = pd.get_dummies(data['itemDescription'])
data.drop('itemDescription', inplace=True, axis=1)
data = data.join(one_hot)
data.head()

Unnamed: 0,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
records = data.groupby(['Member_number','Date'])[all_products[:]].apply(sum)
records.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
Member_number,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000,15-03-2015,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,24-06-2014,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,24-07-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,25-11-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,27-05-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
records = records.reset_index()[all_products]
records.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Note : if a customer bought multiple products on same day, we will consider it one transaction

In [20]:
records = data.groupby(['Member_number', 'Date'])[all_products[:]].apply(sum)
records = records.reset_index()[all_products]
records.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
records.describe()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
count,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,...,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0,14963.0
mean,0.067767,0.157923,0.049054,0.122101,0.110005,0.007819,0.053131,0.03395,0.03776,0.027869,...,0.004611,0.003275,0.001804,0.000401,0.000735,0.001136,0.001002,0.000334,0.000334,6.7e-05
std,0.251354,0.364681,0.215989,0.327414,0.312906,0.088083,0.224302,0.181108,0.190621,0.164602,...,0.067753,0.057134,0.042442,0.020021,0.027105,0.033689,0.031647,0.018278,0.018278,0.008175
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
def get_Pnames(x):
    for product in all_products:
        if x[product] > 0:
            x[product] = product
    return x

records = records.apply(get_Pnames, axis=1)
records.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
print('total transactions: {}'.format(len(records)))

total transactions: 14963


In [25]:
x = records.values
x = [sub[~(sub==0)].tolist() for sub in x if sub[sub !=0].tolist()]
transactions = x

In [27]:
# example of transactions:
transactions = [[x for x in y if x != ''] for y in transactions]

In [28]:
transactions[0:5]

[['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables']]

In [29]:
transactions[0:10]

[['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['whole milk', 'rolls/buns', 'sausage'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream']]

## Modelling

In [30]:
rules = apriori(transactions,min_support=0.00030,min_confidance=0.05,min_lift=3,min_length=2)
association_results = list(rules)

In [32]:
association_results[0]

RelationRecord(items=frozenset({'specialty chocolate', 'frozen fish'}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({'frozen fish'}), items_add=frozenset({'specialty chocolate'}), confidence=0.049019607843137254, lift=3.0689556157190907), OrderedStatistic(items_base=frozenset({'specialty chocolate'}), items_add=frozenset({'frozen fish'}), confidence=0.02092050209205021, lift=3.0689556157190907)])

In [34]:
association_results[1]

RelationRecord(items=frozenset({'fruit/vegetable juice', 'liver loaf'}), support=0.00040098910646260775, ordered_statistics=[OrderedStatistic(items_base=frozenset({'fruit/vegetable juice'}), items_add=frozenset({'liver loaf'}), confidence=0.011787819253438114, lift=3.52762278978389), OrderedStatistic(items_base=frozenset({'liver loaf'}), items_add=frozenset({'fruit/vegetable juice'}), confidence=0.12, lift=3.5276227897838903)])

## Insight

In [35]:
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    
    return list(zip(lhs, rhs, supports, confidences, lifts))

resultsinDF = pd.DataFrame(inspect(association_results), columns = ['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'])

In [37]:
resultsinDF

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
0,frozen fish,specialty chocolate,0.000334,0.04902,3.068956
1,fruit/vegetable juice,liver loaf,0.000401,0.011788,3.527623
2,ham,pickled vegetables,0.000535,0.03125,3.489506
3,meat,roll products,0.000334,0.019841,3.620548
4,misc. beverages,salt,0.000334,0.021186,3.561941
5,misc. beverages,spread cheese,0.000334,0.021186,3.170127
6,seasonal products,soups,0.000334,0.04717,14.704206
7,spread cheese,sugar,0.000401,0.06,3.387849
8,bottled beer,butter,0.000334,0.007375,3.805055
9,bottled beer,whole milk,0.000334,0.007375,3.94095
