### Load preprocessed basket list

We load the `basket_list.pkl` saved from Step 2. 
Each transaction represents a customer's shopping basket. 
This makes Step 4 fully independent.




In [13]:
import pickle

# Location of the basket list that the earlier preprocessing step pickled.
file_path = r"C:\coding5final\coding5\data\processed\basket_list.pkl"

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load files that this project produced itself.
with open(file_path, "rb") as f:
    basket_list = pickle.load(f)

print(f"Number of transactions: {len(basket_list)}")

# Guard the preview: indexing [0] on an empty basket list would raise IndexError.
if len(basket_list) > 0:
    print("Example basket:", basket_list[0])
else:
    print("Example basket: <none - basket list is empty>")




Number of transactions: 14964
Example basket: ['sausage', 'whole milk', 'semi-finished bread', 'yogurt']


### Import libraries

- `mlxtend` provides tools for market basket analysis
- `TransactionEncoder` converts baskets to one-hot encoding
- `apriori` finds frequent itemsets
- `association_rules` generates recommendation rules



In [24]:
# !pip install mlxtend

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules



### One-hot encode baskets

- Each column = an item
- Each row = a transaction
- `True` indicates the item is in that basket

This format is required by the Apriori algorithm.



In [25]:
# Learn the item vocabulary from the baskets, then encode every basket
# as one boolean row over that vocabulary.
te = TransactionEncoder()
te.fit(basket_list)
te_ary = te.transform(basket_list)

# Wrap the boolean array in a DataFrame whose columns are the item names.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

df_encoded.head()


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Find frequent itemsets

- `min_support=0.001` keeps itemsets that appear in at least 0.1% of transactions (about 15 of the 14,964 baskets), so rare items are included
- `use_colnames=True` ensures we see item names instead of indices



In [26]:
# Minimum fraction of transactions an itemset must appear in to be kept.
MIN_SUPPORT = 0.001

# use_colnames=True reports itemsets by item name instead of column index.
frequent_itemsets = apriori(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Preview the ten itemsets with the highest support.
itemsets_by_support = frequent_itemsets.sort_values(by='support', ascending=False)
itemsets_by_support.head(10)




Unnamed: 0,support,itemsets
146,0.157912,(whole milk)
90,0.122093,(other vegetables)
109,0.109997,(rolls/buns)
123,0.0971,(soda)
147,0.085873,(yogurt)
110,0.069567,(root vegetables)
139,0.067763,(tropical fruit)
10,0.060679,(bottled water)
115,0.060345,(sausage)
28,0.053128,(citrus fruit)


### Generate association rules

- `antecedents`: items already in basket
- `consequents`: recommended items
- `confidence`: probability of consequent given antecedent
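- `lift`: confidence divided by the consequent's overall support; values above 1 mean the items appear together more often than expected if they were independent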

We use `min_threshold=0.05` for confidence to capture more potential rules.


In [28]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.05.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.05)

# Convert both sides of every rule to plain sets so they can be compared
# against a basket with ordinary set operations.
for side in ('antecedents', 'consequents'):
    rules[side] = rules[side].apply(set)

# Preview the ten most confident rules.
rules.sort_values(by='confidence', ascending=False).head(10)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
449,"{yogurt, sausage}",{whole milk},0.005747,0.157912,0.00147,0.255814,1.619975,1.0,0.000563,1.131555,0.384919,0.009065,0.116261,0.132562
436,"{rolls/buns, sausage}",{whole milk},0.005346,0.157912,0.001136,0.2125,1.345683,1.0,0.000292,1.069318,0.258264,0.007007,0.064824,0.109847
446,"{soda, sausage}",{whole milk},0.005948,0.157912,0.001069,0.179775,1.13845,1.0,0.00013,1.026655,0.12234,0.006568,0.025963,0.093273
383,{semi-finished bread},{whole milk},0.009489,0.157912,0.001671,0.176056,1.114899,1.0,0.000172,1.022021,0.104045,0.010081,0.021546,0.093318
442,"{rolls/buns, yogurt}",{whole milk},0.007819,0.157912,0.001337,0.17094,1.082501,1.0,0.000102,1.015714,0.076813,0.00813,0.015471,0.089702
448,"{whole milk, sausage}",{yogurt},0.008955,0.085873,0.00147,0.164179,1.911888,1.0,0.000701,1.093688,0.481266,0.015748,0.085662,0.09065
156,{detergent},{whole milk},0.008621,0.157912,0.001403,0.162791,1.030893,1.0,4.2e-05,1.005827,0.030228,0.008499,0.005793,0.085839
205,{ham},{whole milk},0.017108,0.157912,0.00274,0.160156,1.01421,1.0,3.8e-05,1.002672,0.014255,0.015904,0.002665,0.088754
35,{bottled beer},{whole milk},0.045309,0.157912,0.00715,0.157817,0.999397,1.0,-4e-06,0.999887,-0.000632,0.036469,-0.000113,0.101549
173,{frozen fish},{whole milk},0.006816,0.157912,0.001069,0.156863,0.993353,1.0,-7e-06,0.998755,-0.006692,0.006533,-0.001246,0.081817


### Build dynamic recommendation function

- Accepts any basket (even rare combinations)
- Returns top recommended items
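- If no rule's antecedents are fully contained in the basket, it falls back to rules whose antecedents partially overlap the basket, and finally to the items that most often co-occur with the basket's items in past transactions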
- Filters out items already in the basket



In [29]:
def recommend_items_full(basket, rules_df, basket_list, top_n=5):
    """Recommend up to ``top_n`` items to add to ``basket``.

    Three tiers are tried in order; a later tier only runs when every
    earlier tier produced no candidate:

    1. Exact match: rules whose antecedents are all contained in the
       basket. A candidate's score is the highest confidence among the
       matching rules that recommend it.
    2. Partial match: rules sharing at least one antecedent item with the
       basket. A candidate's score is the highest
       ``overlap * confidence`` among those rules, where ``overlap`` is
       the number of shared antecedent items.
    3. Co-occurrence fallback: a candidate's score is the number of
       transactions in ``basket_list`` that contain it and share at least
       one item with the basket.

    Items already in the basket are never recommended.

    Args:
        basket: Iterable of items currently in the basket.
        rules_df: DataFrame with ``antecedents`` and ``consequents``
            columns holding sets (or frozensets) of items, and a numeric
            ``confidence`` column.
        basket_list: Iterable of past transactions, each an iterable of
            items. Only consulted by the fallback tier.
        top_n: Maximum number of items to return.

    Returns:
        A list of items, best first. Equal scores are ordered by item
        name so the result is reproducible from run to run. Returns
        ``["No recommendation found"]`` when no tier yields a candidate.
    """
    basket = set(basket)

    # Pull out just the three columns that are needed, once. This avoids
    # building a Series per row (iterrows) in each of the two rule passes.
    if rules_df.empty:
        rule_rows = []
    else:
        rule_rows = list(zip(rules_df['antecedents'],
                             rules_df['consequents'],
                             rules_df['confidence']))

    recommendations = {}

    # --- Tier 1: every antecedent item is already in the basket ---
    for antecedents, consequents, confidence in rule_rows:
        if antecedents.issubset(basket):
            for item in consequents:
                if item not in basket:
                    recommendations[item] = max(recommendations.get(item, 0), confidence)

    # --- Tier 2: at least one antecedent item is in the basket ---
    if not recommendations:
        for antecedents, consequents, confidence in rule_rows:
            overlap = len(basket & antecedents)
            if overlap > 0:
                # Rules that share more items with the basket weigh more.
                score = overlap * confidence
                for item in consequents:
                    if item not in basket:
                        recommendations[item] = max(recommendations.get(item, 0), score)

    # --- Tier 3: raw co-occurrence counts from past transactions ---
    if not recommendations:
        for trans in basket_list:
            trans_set = set(trans)
            if basket & trans_set:
                for item in trans_set - basket:
                    recommendations[item] = recommendations.get(item, 0) + 1

    if not recommendations:
        return ["No recommendation found"]

    # Highest score first; break ties by item name. Without the tie-break
    # the order of equal scores follows set iteration order, which for
    # strings can change between interpreter sessions.
    ranked = sorted(recommendations.items(), key=lambda pair: (-pair[1], str(pair[0])))
    return [item for item, _ in ranked[:top_n]]



### Test dynamic recommendations

- Basket from dataset → guaranteed recommendations
- Rare basket → still receives partial-match recommendations


In [30]:
# Three demo baskets: one taken from the dataset, one rare combination,
# and one small hand-picked example.
basket1 = set(basket_list[0])
basket2 = {'pasta', 'olive oil'}
basket3 = {'whole milk', 'yogurt'}

for position, demo_basket in enumerate((basket1, basket2, basket3)):
    # Every basket after the first is preceded by a blank line.
    label = "Basket:" if position == 0 else "\nBasket:"
    print(label, demo_basket)
    print("Recommended items:", recommend_items_full(demo_basket, rules, basket_list, top_n=5))



Basket: {'whole milk', 'yogurt', 'sausage', 'semi-finished bread'}
Recommended items: ['rolls/buns', 'soda', 'other vegetables', 'tropical fruit', 'bottled beer']

Basket: {'pasta', 'olive oil'}
Recommended items: ['whole milk']

Basket: {'whole milk', 'yogurt'}
Recommended items: ['sausage', 'rolls/buns', 'other vegetables', 'soda', 'tropical fruit']


In [23]:
# Write the rule table next to the other processed data, without the row index.
rules_csv_path = r"C:\coding5final\coding5\data\processed\association_rules.csv"
rules.to_csv(rules_csv_path, index=False)
print("Association rules saved successfully!")


Association rules saved successfully!
