## Libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import permutations

## Dataset

In [2]:
df = pd.read_csv("Market_Basket_Optimisation.csv", header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [4]:
products = []

for i in tqdm(range(df.shape[0])):
    products.append([product.strip() for product in df.iloc[i, :].values if type(product)!=float])

100%|██████████| 7501/7501 [00:00<00:00, 7814.32it/s]


In [5]:
unique_products = []

for lista in tqdm(products):
    for product in lista:
        if product not in unique_products:
            unique_products.append(product)
unique_products = sorted(unique_products)

100%|██████████| 7501/7501 [00:00<00:00, 306525.53it/s]


In [6]:
possibilities = [{
    "current": product,
    "next": next_product} for product, next_product in permutations(unique_products, 2)]

##### 1. Support

In [7]:
for pair in tqdm(possibilities):
    pair["support_current"] = sum([1 for p in products if pair["current"] in p])/len(products)
    pair["support_next"] = sum([1 for p in products if pair["current"] in p])/len(products)
    pair["support_pair"] = sum([1 for p in products if pair["current"] in p and pair["next"] in p])/len(products)

100%|██████████| 14042/14042 [00:34<00:00, 409.74it/s]


##### 2. Confidence & Lift

In [8]:
for pair in tqdm(possibilities):
    new_proba = sum([1 for p in products if pair["current"] in p and pair["next"] in p])/len(products)
    pair["confidence"]= new_proba/pair["support_current"]
    pair["lift"] = pair["confidence"]/pair["support_next"]

100%|██████████| 14042/14042 [00:12<00:00, 1108.56it/s]


## Developing class

In [9]:
class Rule():
    
    def __init__(self, product, bought_with, support, confidence, lift):
        self.__product = product.title()
        self.__bought_with = bought_with.title()
        self.__support = support
        self.__confidence = confidence
        self.__lift = lift

    def __str__(self):
        text = f">>> {self.__bought_with} and {self.__product} are both present in "
        text += f"{100*self.__support:.2f}% of the transactions.\nAmong people who bought {self.__product}, "
        text += f"{100*self.__confidence:.2f}% also bought {self.__bought_with}.\nThis means that people are "
        text += f"{self.__lift:.2f} times more prone to buying {self.__bought_with} when they are "
        text += f"buying {self.__product} as well."
        return text
    
class APriori():
    
    def __init__(self, min_support, min_confidence, min_lift, min_length):
        self.__min_support = min_support
        self.__min_confidence = min_confidence
        self.__min_lift = min_lift
        self.__min_length = min_length
        self.rules = []
        
    def _generate_list(self, df):
        assert isinstance(df, pd.DataFrame), "Insert a pandas dataframe"
        products = []
        for i in range(df.shape[0]):
            products.append([product.strip() for product in df.iloc[i, :].values if type(product)!=float])
            
        return products
    
    def _generate_combinations(self, products_list):
        unique_products = []
        for lista in products_list:
            for product in lista:
                if product not in unique_products:
                    unique_products.append(product)
        unique_products = sorted(unique_products)
        
        possibilities = [{
                "current": product,
                "next": next_product} for product, next_product in permutations(unique_products, 2)]
        
        return possibilities
    
    def _calculate_support_current(self, pair):
        return sum([1 for p in products if pair["current"] in p])/len(products)
    
    def _calculate_support_next(self, pair):
        return sum([1 for p in products if pair["next"] in p])/len(products)
    
    def _calculate_support_pair(self, pair):
        return sum([1 for p in products if pair["current"] in p and pair["next"] in p])/len(products)

    def _calculate_confidence(self, pair):
        total = pair["support_current"]*len(products)
        return sum([1 for p in products if pair["current"] in p and pair["next"] in p])/total
    
    def _calculate_lift(self, pair):
        return pair["confidence"]/pair["support_next"]
    
    def get_rules(self, df):
        products_list = self._generate_list(df)
        possibilities = self._generate_combinations(products_list)
        
        output = []

        for pair in possibilities:
            pair["support_current"] = self._calculate_support_current(pair)
            pair["support_next"] = self._calculate_support_next(pair)
            pair["support"] = self._calculate_support_pair(pair)
            if pair["support"] < self.__min_support:
                continue
            pair["confidence"] = self._calculate_confidence(pair)
            if pair["confidence"] < self.__min_confidence:
                continue
            pair["lift"] = self._calculate_lift(pair)
            if pair["lift"] < self.__min_lift:
                continue
            output.append(pair)
            
        output = sorted(output, key=lambda x: -x["lift"])
        for o in output:
            rule = Rule(product=o["current"],
                        bought_with=o["next"],
                        support=o["support"],
                        confidence=o["confidence"],
                        lift=o["lift"]
                    )
            self.rules.append(rule)
            
        return output
    
    def get_highest_lifts(self, n_rules=10):
        print(f"########## {n_rules} Highest Lifts ##########")
        print()
        for rule in self.rules:
            print(rule)
            print()

## Analyzing results

In [10]:
apriori = APriori(min_support = 0.003, min_confidence = 0.2, min_lift = 3, min_length = 2)

In [11]:
%%time
result = apriori.get_rules(df)

CPU times: user 39.7 s, sys: 181 ms, total: 39.9 s
Wall time: 43.2 s


In [12]:
result[:5]

[{'current': 'fromage blanc',
  'next': 'honey',
  'support_current': 0.013598186908412212,
  'support_next': 0.047460338621517134,
  'support': 0.003332888948140248,
  'confidence': 0.24509803921568626,
  'lift': 5.164270764485569},
 {'current': 'light cream',
  'next': 'chicken',
  'support_current': 0.01559792027729636,
  'support_next': 0.05999200106652446,
  'support': 0.004532728969470737,
  'confidence': 0.2905982905982906,
  'lift': 4.8439506172839515},
 {'current': 'pasta',
  'next': 'escalope',
  'support_current': 0.01573123583522197,
  'support_next': 0.0793227569657379,
  'support': 0.005865884548726837,
  'confidence': 0.3728813559322034,
  'lift': 4.700811850163794},
 {'current': 'pasta',
  'next': 'shrimp',
  'support_current': 0.01573123583522197,
  'support_next': 0.07145713904812692,
  'support': 0.005065991201173177,
  'confidence': 0.3220338983050847,
  'lift': 4.506672147735896},
 {'current': 'whole wheat pasta',
  'next': 'olive oil',
  'support_current': 0.02946

In [13]:
apriori.get_highest_lifts(n_rules=10)

########## 10 Highest Lifts ##########

>>> Honey and Fromage Blanc are both present in 0.33% of the transactions.
Among people who bought Fromage Blanc, 24.51% also bought Honey.
This means that people are 5.16 times more prone to buying Honey when they are buying Fromage Blanc as well.

>>> Chicken and Light Cream are both present in 0.45% of the transactions.
Among people who bought Light Cream, 29.06% also bought Chicken.
This means that people are 4.84 times more prone to buying Chicken when they are buying Light Cream as well.

>>> Escalope and Pasta are both present in 0.59% of the transactions.
Among people who bought Pasta, 37.29% also bought Escalope.
This means that people are 4.70 times more prone to buying Escalope when they are buying Pasta as well.

>>> Shrimp and Pasta are both present in 0.51% of the transactions.
Among people who bought Pasta, 32.20% also bought Shrimp.
This means that people are 4.51 times more prone to buying Shrimp when they are buying Pasta as wel