<a href="https://colab.research.google.com/github/chunter3/Data_Analytics_Projects/blob/master/Apriori_Algorithm%2C_Association_Rules%2C_%26_F1_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
%matplotlib inline

In [None]:
# Problem 1 (Start)

In [None]:
# Read the Online Retail workbook into a pandas DataFrame (parsing the Excel file takes a while)

ONLINE_RETAIL_XLSX = 'Online Retail.xlsx'
online_ret_ds = pd.read_excel(ONLINE_RETAIL_XLSX)
online_ret_ds.head()

In [None]:
# Cleaning up the dataset
#
# Every step returns a new frame rather than mutating the loaded one in place,
# so re-running this cell never writes into a filtered slice of its own
# previous result.

invoices_normalised = (
    online_ret_ds
    .dropna(axis=0, subset=['InvoiceNo'])  # Removing invalid rows based on 'InvoiceNo'
    .assign(
        # Removing whitespace of elements in 'Description' column
        Description=lambda df: df['Description'].str.strip(),
        # Cast the 'InvoiceNo' column to a string so the .str accessor below works
        InvoiceNo=lambda df: df['InvoiceNo'].astype('str'),
    )
)
# Removing rows that have 'C' in their InvoiceNo
online_ret_ds = invoices_normalised[~invoices_normalised['InvoiceNo'].str.contains('C')]
online_ret_ds

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [None]:
# Build the basket table the Apriori algorithm will use:
# one row per InvoiceNo, one column per Description, cells hold the summed Quantity
# (combinations that never occur are filled with 0).

is_france = online_ret_ds['Country'] == "France"
france_basket = (
    online_ret_ds[is_france]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
france_basket

In [None]:
# Encoding the dataset to a format readable by the Apriori algorithm (following code courtesy of edureka (https://www.youtube.com/channel/UCkw4JCwteGrDHIsyIIKo4tQ))

def encode_units(x):
  """Map a quantity to a 0/1 flag.

  Returns 1 when ``x >= 1`` and 0 for every other value. Anything that is
  not ``>= 1`` (zero, negatives, fractions below 1, NaN) maps to 0, so the
  function never falls through and returns ``None``.
  """
  return 1 if x >= 1 else 0
# Encode every cell with encode_units, then leave out the 'POSTAGE' column
basket_sets = france_basket.applymap(encode_units).drop(columns='POSTAGE')
basket_sets

In [None]:
# Determining the itemset w/ the largest support

france_isets = apriori(basket_sets, min_support=0.05, use_colnames=True)
# DataFrame.max() reduces each column independently, so it would pair the
# largest support value with an unrelated "largest" itemset. Select the row(s)
# whose support equals the maximum instead, so support and itemset belong together.
france_isets[france_isets['support'] == france_isets['support'].max()]

support                                              0.188776
itemsets    (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT...
dtype: object

In [None]:
# Derive association rules from the frequent itemsets.
# metric / min_threshold are written out with the values mlxtend applies by default.

asso_rules = association_rules(france_isets, metric='confidence', min_threshold=0.8)
asso_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.094388,0.096939,0.079082,0.837838,8.642959,0.069932,5.568878
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.096939,0.094388,0.079082,0.815789,8.642959,0.069932,4.916181
2,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY SPACEBOY),0.071429,0.068878,0.063776,0.892857,12.962963,0.058856,8.690476
3,(CHILDRENS CUTLERY SPACEBOY),(CHILDRENS CUTLERY DOLLY GIRL),0.068878,0.071429,0.063776,0.925926,12.962963,0.058856,12.535714
4,(PACK OF 6 SKULL PAPER CUPS),(PACK OF 6 SKULL PAPER PLATES),0.063776,0.056122,0.05102,0.8,14.254545,0.047441,4.719388


In [None]:
# Rule(s) in asso_rules whose confidence equals the maximum confidence (ties are all shown)

top_confidence = asso_rules['confidence'].max()
asso_rules.loc[asso_rules['confidence'] == top_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
13,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796
14,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959


In [None]:
# Rule(s) in asso_rules whose lift equals the maximum lift (ties are all shown)

top_lift = asso_rules['lift'].max()
asso_rules.loc[asso_rules['lift'] == top_lift]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(PACK OF 6 SKULL PAPER CUPS),(PACK OF 6 SKULL PAPER PLATES),0.063776,0.056122,0.05102,0.8,14.254545,0.047441,4.719388


In [None]:
# Problem 1 (End)

In [None]:
# Problem 2 (Start)

In [None]:
# Read the Extended Bakery CSV into a pandas DataFrame

BAKERY_CSV = '75000-out2-binary.csv'
bakery_ds = pd.read_csv(BAKERY_CSV)
bakery_ds.head()

In [None]:
# Computing f11: number of rows where 'Chocolate Cake' == 1 and 'Chocolate Coffee' == 1

# Count the matching rows directly instead of displaying the whole filtered
# frame and reading the row count off its footer.
mask_f11 = (bakery_ds['Chocolate Cake'] == 1) & (bakery_ds['Chocolate Coffee'] == 1)
f11 = len(bakery_ds[mask_f11])
f11  # the original notebook recorded f11 = 3303

In [None]:
# Computing f00: number of rows where 'Chocolate Cake' == 0 and 'Chocolate Coffee' == 0

# Count the matching rows directly instead of displaying the whole filtered
# frame and reading the row count off its footer.
mask_f00 = (bakery_ds['Chocolate Cake'] == 0) & (bakery_ds['Chocolate Coffee'] == 0)
f00 = len(bakery_ds[mask_f00])
f00  # the original notebook recorded f00 = 65802

In [None]:
# Computing f01: number of rows where 'Chocolate Cake' == 0 and 'Chocolate Coffee' == 1

# Count the matching rows directly instead of displaying the whole filtered
# frame and reading the row count off its footer.
mask_f01 = (bakery_ds['Chocolate Cake'] == 0) & (bakery_ds['Chocolate Coffee'] == 1)
f01 = len(bakery_ds[mask_f01])
f01  # the original notebook recorded f01 = 2933

In [None]:
# Computing f10: number of rows where 'Chocolate Cake' == 1 and 'Chocolate Coffee' == 0

# Count the matching rows directly instead of displaying the whole filtered
# frame and reading the row count off its footer.
mask_f10 = (bakery_ds['Chocolate Cake'] == 1) & (bakery_ds['Chocolate Coffee'] == 0)
f10 = len(bakery_ds[mask_f10])
f10  # the original notebook recorded f10 = 2962

In [None]:
# Problem 2 (End)