In [9]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   --------------- ------------------------ 0.5/1.4 MB 1.9 MB/s eta 0:00:01
   ------------------------------- -------- 1.0/1.4 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 1.6 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4
Note: you may need to restart the kernel to use updated packages.


In [98]:
import pandas as pd
import mlxtend
import plotly.express as px
import plotly.graph_objects as go

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [99]:
toy_dataset = [['Skirt', 'Sneakers', 'Scarf', 'Pants', 'Hat'],

        ['Sunglasses', 'Skirt', 'Sneakers', 'Pants', 'Hat'],

        ['Dress', 'Sandals', 'Scarf', 'Pants', 'Heels'],

        ['Dress', 'Necklace', 'Earrings', 'Scarf', 'Hat', 'Heels', 'Hat'],

      ['Earrings', 'Skirt', 'Skirt', 'Scarf', 'Shirt', 'Pants']]

toy_dataset

[['Skirt', 'Sneakers', 'Scarf', 'Pants', 'Hat'],
 ['Sunglasses', 'Skirt', 'Sneakers', 'Pants', 'Hat'],
 ['Dress', 'Sandals', 'Scarf', 'Pants', 'Heels'],
 ['Dress', 'Necklace', 'Earrings', 'Scarf', 'Hat', 'Heels', 'Hat'],
 ['Earrings', 'Skirt', 'Skirt', 'Scarf', 'Shirt', 'Pants']]

In [100]:
te=TransactionEncoder()

In [104]:
#Apply one-hot-encoding on our dataset
te_ary=te.fit(toy_dataset).transform(toy_dataset)    

#Creating a new DataFrame from our Numpy array
df=pd.DataFrame(te_ary, columns=te.columns_)

df

Unnamed: 0,Dress,Earrings,Hat,Heels,Necklace,Pants,Sandals,Scarf,Shirt,Skirt,Sneakers,Sunglasses
0,False,False,True,False,False,True,False,True,False,True,True,False
1,False,False,True,False,False,True,False,False,False,True,True,True
2,True,False,False,True,False,True,True,True,False,False,False,False
3,True,True,True,True,True,False,False,True,False,False,False,False
4,False,True,False,False,False,True,False,True,True,True,False,False


In [106]:
frequent_itemsets = apriori(df, min_support = 0.6, use_colnames=True)

In [108]:
association_rules(frequent_itemsets, metric = "confidence", min_threshold = 0.7, num_itemsets = 10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Pants),(Scarf),0.8,0.8,0.6,0.75,0.9375,1.0,-0.04,0.8,-0.25,0.6,-0.25,0.75
1,(Scarf),(Pants),0.8,0.8,0.6,0.75,0.9375,1.0,-0.04,0.8,-0.25,0.6,-0.25,0.75
2,(Skirt),(Pants),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
3,(Pants),(Skirt),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875


### Interpretation.

### Association Rules Interpretation

| Antecedents | Consequents | Confidence | Lift |
|-------------|-------------|------------|------|
| (Scarf)     | (Pants)     | 0.75     | 0.9375  |
| (Pants)     | (Scarf)     | 0.75     | 0.9375  |
| (Pants)     | (Skirt)     | 0.75      | 1.2500  |
| (Skirt)     | (Pants)     | 1.00     | 1.2500  |

1. **Antecedents and Consequents:** These columns show the relationships between items. For example, if a customer buys a Scarf (antecedent), they are likely to also buy Pants (consequent).

2. **Confidence:** This metric indicates the likelihood of the consequent being purchased when the antecedent is purchased. A confidence of 0.75 means that in 75% of the transactions where a Scarf was purchased, Pants were also purchased.

3. **Lift:** This metric measures how much more likely the consequent is to be purchased when the antecedent is purchased, compared to the likelihood of purchasing the consequent alone. A lift of 1.0 means there is no association between the items; values above 1 indicate a positive association, and values below 1 indicate a negative association.

### Additional Metrics
The table also contains other metrics such as:
- **Support:** The proportion of transactions that contain both the antecedent and the consequent.
- **Leverage:** The difference between the observed frequency of a rule and the expected frequency if the items were independent.
- **Conviction:** A measure of the strength of implication of the rule.

### Analysis
For the rule (Scarf) → (Pants):
- **Support:** 0.6, meaning 60% of all transactions contain both items.
- **Confidence:** 0.75, indicating high reliability of this rule.
- **Lift:** 0.9375, suggesting no strong association, but the high confidence indicates it's a common pairing.

For the rule (Scarf) → (Pants):
- **Support:** 0.6, meaning 60% of all transactions contain both items.
- **Confidence:** 0.75, indicating high reliability of this rule.
- **Lift:** 0.9375, suggesting no strong association, but the high confidence indicates it's a common pairing.

For the rule (Scarf) → (Pants):
- **Support:** 0.6, meaning 60% of all transactions contain both items.
- **Confidence:** 0.75, indicating high reliability of this rule.
- **Lift:** 1.2500, suggesting a positive association, but the high confidence indicates it's a common pairing.

For the rule (Scarf) → (Pants):
- **Support:** 0.6, meaning 60% of all transactions contain both items.
- **Confidence:** 1.0, indicating high reliability of this rule.
- **Lift:** 1.2500, suggesting a positive association, and the high confidence indicates it's a common pairing.

In [112]:
data = pd.read_csv("Market_Basket_Optimisation.csv", dtype = str)

data

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7496,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7497,chicken,,,,,,,,,,,,,,,,,,,
7498,escalope,green tea,,,,,,,,,,,,,,,,,,


In [114]:
# Display general information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   shrimp             7500 non-null   object
 1   almonds            5746 non-null   object
 2   avocado            4388 non-null   object
 3   vegetables mix     3344 non-null   object
 4   green grapes       2528 non-null   object
 5   whole weat flour   1863 non-null   object
 6   yams               1368 non-null   object
 7   cottage cheese     980 non-null    object
 8   energy drink       653 non-null    object
 9   tomato juice       394 non-null    object
 10  low fat yogurt     255 non-null    object
 11  green tea          153 non-null    object
 12  honey              86 non-null     object
 13  salad              46 non-null     object
 14  mineral water      24 non-null     object
 15  salmon             7 non-null      object
 16  antioxydant juice  3 non-null      object


### Visualization cannot be performed on this dataset!

In [121]:
# Encode the transactions
te2 = TransactionEncoder()
te_ary2 = te2.fit(data).transform(data)
dff = pd.DataFrame(te_ary2, columns = te2.columns_)

dff

Unnamed: 0,Unnamed: 1,a,b,c,d,e,f,g,h,i,...,p,r,s,t,u,v,w,x,y,z
0,False,False,False,False,False,False,False,False,True,True,...,True,True,True,False,False,False,False,False,False,False
1,False,True,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,False,True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,True,True,True,False,False,True,False,True,False,True,...,False,False,True,True,False,True,False,True,False,False
4,True,True,False,False,False,True,False,True,False,False,...,True,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Using the Apriori algorithm on the dataset iss not possible due to the enormous amount of null and Nan values.