In [1]:
import pandas as pd
from scipy.sparse import coo_matrix
from implicit.nearest_neighbours import ItemItemRecommender
from collections import defaultdict, Counter

# === 1. Load Datasets ===
all_orders = pd.read_csv('all_except_last_orders.csv')
last_orders = pd.read_csv('last_orders_subset.csv')

# Ensure consistent types
all_orders['SKU'] = all_orders['SKU'].astype(str)
last_orders['SKU'] = last_orders['SKU'].astype(str)

# === 2. Create ID Mappings ===
user_ids = all_orders['Member'].unique().tolist()
item_ids = all_orders['SKU'].unique().tolist()

user_id_map = {u: i for i, u in enumerate(user_ids)}
item_id_map = {i: j for j, i in enumerate(item_ids)}
rev_item_id_map = {j: i for i, j in item_id_map.items()}

# Add numeric indices
all_orders['user_idx'] = all_orders['Member'].map(user_id_map)
all_orders['item_idx'] = all_orders['SKU'].map(item_id_map)

# === 3. Build User-Item Matrix ===
data = all_orders.groupby(['user_idx', 'item_idx']).size().reset_index(name='count')
user_item_matrix = coo_matrix((data['count'], (data['item_idx'], data['user_idx'])), dtype='float64')

# === 4. Train Item-Item Collaborative Filtering Model ===
model = ItemItemRecommender(K=20)
model.fit(user_item_matrix)

# === 5. Build Global Popularity Fallback ===
popular_items = all_orders['SKU'].value_counts()
fallback_items = [item_id_map[sku] for sku in popular_items.index if sku in item_id_map]

# === 6. Generate Recommendations ===
recommendations = []
for (member, order), group in last_orders.groupby(['Member', 'Order']):
    skus = group['SKU'].tolist()
    user_idx = user_id_map.get(member)

    # Map known SKUs to idx
    known_item_idxs = [item_id_map.get(sku) for sku in skus if sku in item_id_map]

    score_dict = defaultdict(float)
    for item_idx in known_item_idxs:
        try:
            similar_items = model.similar_items(item_idx, N=20)
            # Add a check to ensure similar_items is a list of tuples with 2 elements
            if isinstance(similar_items, list) and all(isinstance(rec, tuple) and len(rec) == 2 for rec in similar_items):
                for sim_idx, score in similar_items:
                    if sim_idx not in known_item_idxs:
                        score_dict[sim_idx] += score
        except KeyError:
            continue

    # Sort scores and filter top 5
    sorted_items = [item for item, _ in sorted(score_dict.items(), key=lambda x: -x[1])]

    # Fallback if < 5
    i = 0
    while len(sorted_items) < 5 and i < len(fallback_items):
        f_item = fallback_items[i]
        if f_item not in known_item_idxs and f_item not in sorted_items:
            sorted_items.append(f_item)
        i += 1

    # Ensure exactly 5 recommendations
    top5 = sorted_items[:5]

    for rec_item_idx in top5:
        recommendations.append({
            'Member': member,
            'Order': order,
            'SKU': rev_item_id_map[rec_item_idx]
        })

# === 7. Save Output ===
rec_df = pd.DataFrame(recommendations)
assert rec_df.groupby(['Member', 'Order']).size().min() == 5, "Each order must have 5 SKUs"


rec_df.head()



  0%|          | 0/638 [00:00<?, ?it/s]

Unnamed: 0,Member,Order,SKU
0,SSCEHNS,8069966,15668381
1,SSCEHNS,8069966,15668688
2,SSCEHNS,8069966,15668460
3,SSCEHNS,8069966,15668379
4,SSCEHNS,8069966,15669878


In [3]:
rec_df["Order"].value_counts()

Order
8069966    5
8141498    5
8339940    5
7764234    5
8321717    5
          ..
8113176    5
7369866    5
7517068    5
8180759    5
8105205    5
Name: count, Length: 638, dtype: int64

In [4]:
rec_df.drop_duplicates()

Unnamed: 0,Member,Order,SKU
0,SSCEHNS,8069966,15668381
1,SSCEHNS,8069966,15668688
2,SSCEHNS,8069966,15668460
3,SSCEHNS,8069966,15668379
4,SSCEHNS,8069966,15669878
...,...,...,...
3185,SWRNHCS,8105205,15668381
3186,SWRNHCS,8105205,15668688
3187,SWRNHCS,8105205,15668460
3188,SWRNHCS,8105205,15668379


In [5]:
recc_df = rec_df[["Order", "SKU", "Member"]].reset_index().rename(columns = {"index": "ID"})
recc_df["ID"] = recc_df["ID"].apply(lambda x: x+1)
recc_df

Unnamed: 0,ID,Order,SKU,Member
0,1,8069966,15668381,SSCEHNS
1,2,8069966,15668688,SSCEHNS
2,3,8069966,15668460,SSCEHNS
3,4,8069966,15668379,SSCEHNS
4,5,8069966,15669878,SSCEHNS
...,...,...,...,...
3185,3186,8105205,15668381,SWRNHCS
3186,3187,8105205,15668688,SWRNHCS
3187,3188,8105205,15668460,SWRNHCS
3188,3189,8105205,15668379,SWRNHCS


In [6]:
recc_df.to_csv("item_based_collab_filtering_recc2.csv", index = False)