In [1]:
# =========================
# FULL END-TO-END HYBRID RECOMMENDER
# Uses DF_SAMPLE (1000 invoices)
# =========================

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# 0) Load and sample data
# -------------------------
df = pd.read_csv("combined_unique.csv", encoding="latin-1")

# Keep only the columns used downstream; rows without an invoice number,
# description or price are dropped because every model below needs them.
df = df[
    ['Invoice', 'StockCode', 'Description', 'Quantity',
     'InvoiceDate', 'Price', 'Customer ID', 'Country']
].dropna(subset=['Invoice', 'Description', 'Price'])

# Unparseable dates become NaT instead of raising (errors='coerce').
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

# Reproducible random sample of up to 1000 invoices (safe for Colab / local).
# Series.sample raises ValueError when asked for more rows than exist, so the
# request is capped at the number of distinct invoices actually present.
sample_invoices = df['Invoice'].drop_duplicates()
sample_invoices = sample_invoices.sample(n=min(1000, len(sample_invoices)), random_state=42)
df_sample = df[df['Invoice'].isin(sample_invoices)].copy()

# Rows with a missing Customer ID are kept (Apriori / content models can use
# them); the CF and RFM sections drop them via 'CustomerID_numeric'.
# 'Customer ID' is stored as text with the literal string 'nan' turned back
# into a real missing value; 'CustomerID_numeric' is a nullable-integer copy.
df_sample['Customer ID'] = df_sample['Customer ID'].astype(str).replace('nan', np.nan)
df_sample['CustomerID_numeric'] = pd.to_numeric(df_sample['Customer ID'], errors='coerce').astype('Int64')

# Clean Description: lowercase, replace every character outside [a-z0-9 ]
# with a space, then trim leading/trailing whitespace.
df_sample['Description'] = (
    df_sample['Description'].astype(str)
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", " ", regex=True)
    .str.strip()
)

# -------------------------
# 1) APRIORI (transaction → items)
#    Transactions are invoices, items are cleaned descriptions. An item
#    counts as present on an invoice when its summed Price there is positive.
# -------------------------
basket = df_sample.groupby(['Invoice', 'Description'])['Price'].sum()
basket = basket.unstack(fill_value=0)
# Collapse the price totals into a 0/1 incidence matrix.
basket = basket.gt(0).astype(int)

# `rules` is always defined: it stays an empty frame when mining fails.
rules = pd.DataFrame()
try:
    frequent_itemsets = apriori(basket, use_colnames=True, min_support=0.01)
    rules = association_rules(frequent_itemsets, min_threshold=1.0, metric="lift")
except Exception as e:
    print("Apriori failed or found no frequent itemsets:", e)
    rules = pd.DataFrame()

def apriori_recommend(product, top_n=5):
    """Recommend items that the mined association rules link to `product`.

    Rules whose antecedent set contains the (lower-cased, stripped) product
    are visited from highest to lowest lift and their consequents collected
    until `top_n` distinct items are found.

    Args:
        product: Product description to look up; converted with ``str()``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most `top_n` lower-cased item names, best rule first.
        Empty when `top_n` is not positive, when the module-level `rules`
        frame is empty or lacks the expected columns, or when no rule
        mentions the product.
    """
    product_norm = str(product).lower().strip()
    required_columns = {'antecedents', 'consequents', 'lift'}
    if top_n <= 0 or rules.empty or not required_columns.issubset(rules.columns):
        return []

    def _norm(label):
        return str(label).lower().strip()

    mentions_product = rules['antecedents'].apply(
        lambda items: any(_norm(item) == product_norm for item in items)
    )
    related = rules[mentions_product].sort_values('lift', ascending=False)

    # Walk every matching rule rather than truncating to the first `top_n`
    # rules: several high-lift rules often share a consequent, and cutting the
    # rule list first would return fewer than `top_n` distinct items.
    recs = []
    seen = {product_norm}  # never recommend the product itself
    for consequents in related['consequents']:
        # Sets have no defined order; sort so the output is reproducible.
        for name in sorted(_norm(item) for item in consequents):
            if name in seen:
                continue
            seen.add(name)
            recs.append(name)
            if len(recs) == top_n:
                return recs
    return recs

# -------------------------
# 2) COLLABORATIVE FILTERING (Item-item)
#    Built from rows that have a numeric customer id, a description and a
#    price; rows without a customer id are dropped here.
# -------------------------
df_cf = df_sample.dropna(subset=['CustomerID_numeric', 'Description', 'Price']).copy()

# User-item matrix (Customer × Description): total Price per pair, with 0
# where a customer has no rows for an item.
user_item = df_cf.groupby(['CustomerID_numeric', 'Description'])['Price'].sum()
user_item = user_item.unstack(fill_value=0)
# Binarise: 1 when the summed Price is positive, else 0.
user_item_bin = user_item.gt(0).astype(int)

# Cosine similarity between item columns (hence the transpose), labelled by
# description on both axes.
item_similarity = cosine_similarity(user_item_bin.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_bin.columns,
    columns=user_item_bin.columns,
)

def collaborative_recommend(product, top_n=5):
    """Recommend the items most similar to `product` in `item_similarity_df`.

    The product is matched exactly on its lower-cased, stripped description;
    failing that, the first description containing it as a substring is used.

    Args:
        product: Product description to look up; converted with ``str()``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most `top_n` lower-cased item names ordered by
        decreasing similarity, never containing the matched product itself.
        Empty when `top_n` is not positive or nothing matches.
    """
    product_norm = str(product).lower().strip()
    if top_n <= 0:
        return []

    if product_norm in item_similarity_df.index:
        prod = product_norm
    else:
        # Fuzzy fallback: first description that contains the query text.
        prod = next(
            (idx for idx in item_similarity_df.index if product_norm in idx),
            None,
        )
        if prod is None:
            return []

    sim_scores = item_similarity_df[prod]
    # Remove the product by label instead of skipping the first sorted entry:
    # another item can tie with the product's own similarity (or the product's
    # self-similarity can be 0), so the product is not guaranteed to sort first.
    sim_scores = sim_scores[sim_scores.index != prod]
    # mergesort is stable, so tied items keep their matrix order.
    sim_scores = sim_scores.sort_values(ascending=False, kind='mergesort')
    return [str(r).lower().strip() for r in sim_scores.head(top_n).index]

# -------------------------
# 3) CONTENT-BASED (TF-IDF on Description)
#    One row per distinct description, carrying its mean Price.
# -------------------------
df_content = df_sample.groupby('Description', as_index=False)['Price'].mean()

# Re-apply the description cleaning (lowercase, non [a-z0-9 ] -> space, trim).
df_content['Description'] = (
    df_content['Description']
    .astype(str)
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", " ", regex=True)
    .str.strip()
)

# TF-IDF vectors with English stop words removed, then pairwise cosine
# similarity between all descriptions.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_content['Description'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Lookup from description text to its row position in df_content / cosine_sim.
indices = pd.Series(df_content.index, index=df_content['Description'])
indices = indices.drop_duplicates()

def content_recommend(product, top_n=5):
    """Recommend the descriptions most textually similar to `product`.

    The product is matched exactly on its lower-cased, stripped description;
    failing that, the first description containing it as a substring is used.
    Similarity comes from the module-level `cosine_sim` matrix, whose rows
    are addressed through `indices` and labelled by `df_content`.

    Args:
        product: Product description to look up; converted with ``str()``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most `top_n` lower-cased descriptions ordered by
        decreasing similarity, never containing the matched product itself.
        Empty when `top_n` is not positive or nothing matches.
    """
    product_norm = str(product).lower().strip()
    if top_n <= 0:
        return []

    if product_norm not in indices:
        # Fuzzy fallback: first description that contains the query text.
        product_norm = next((p for p in indices.index if product_norm in p), None)
        if product_norm is None:
            return []

    idx = int(indices[product_norm])
    scores = cosine_sim[idx]

    # Exclude the product by position instead of dropping the first sorted
    # entry: descriptions with the same tokens tie at the top score, and a
    # description with no usable tokens scores 0 against itself, so the
    # product is not guaranteed to come first.
    others = [i for i in range(len(scores)) if i != idx]
    # list.sort is stable (also with reverse=True): ties keep row order.
    others.sort(key=lambda i: scores[i], reverse=True)
    similar_indices = others[:top_n]
    if not similar_indices:
        return []
    return df_content.iloc[similar_indices]['Description'].astype(str).str.lower().tolist()

# -------------------------
# 4) RFM Model
#    Built from InvoiceDate, Invoice, Quantity, Price and the numeric
#    customer id; rows missing a customer id or a date are excluded.
# -------------------------
df_rfm = df_sample.dropna(subset=['CustomerID_numeric', 'InvoiceDate']).copy()
# Keep only positive quantities (removes returns/negatives if any).
df_rfm = df_rfm.loc[df_rfm['Quantity'].gt(0)]

df_rfm['TotalAmount'] = df_rfm['Quantity'] * df_rfm['Price']
# Reference date: one day after the latest invoice in the sample.
snapshot_date = df_rfm['InvoiceDate'].max() + pd.Timedelta(days=1)

# Per customer: days since last invoice, number of distinct invoices, and
# total amount.
rfm = df_rfm.groupby('CustomerID_numeric').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'Invoice': 'nunique',
    'TotalAmount': 'sum'
})
rfm = rfm.reset_index()
rfm = rfm.rename(columns={
    'InvoiceDate': 'Recency',
    'Invoice': 'Frequency',
    'TotalAmount': 'Monetary',
})

# Quintile scores 1-5. Ranking with method='first' breaks ties so qcut gets
# distinct values; Recency labels are reversed so a smaller value scores higher.
rfm['R_rank'] = pd.qcut(rfm['Recency'].rank(method='first'), 5, labels=[5, 4, 3, 2, 1]).astype(int)
rfm['F_rank'] = pd.qcut(rfm['Frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5]).astype(int)
rfm['M_rank'] = pd.qcut(rfm['Monetary'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5]).astype(int)
rfm['RFM_Score'] = rfm['R_rank'] + rfm['F_rank'] + rfm['M_rank']

# Top customers: the best-scoring 10% (at least one customer).
top_n_customers = max(1, int(0.1 * len(rfm)))
top_customers = rfm.sort_values('RFM_Score', ascending=False).head(top_n_customers)
top_customers = top_customers['CustomerID_numeric'].tolist()

# The 20 descriptions appearing most often in those customers' rows.
top_products = df_rfm.loc[df_rfm['CustomerID_numeric'].isin(top_customers), 'Description']
top_products = top_products.value_counts().head(20).index.tolist()

def rfm_recommend(product, top_n=5):
    """Return the first `top_n` entries of `top_products`, lower-cased and stripped.

    `product` is not used: the list is the same for every product and the
    parameter only mirrors the signature of the other recommenders.
    """
    shortlist = top_products[:top_n]
    cleaned = []
    for name in shortlist:
        cleaned.append(str(name).lower().strip())
    return cleaned

# -------------------------
# 5) FINAL ENSEMBLE
#    Per-model weights are defined here, ahead of ensemble_recommend.
# -------------------------
# Model weights (edit if needed). ensemble_recommend multiplies each model's
# normalised score by the matching weight; all four models currently count
# equally.
W_APRIORI = 0.25
W_COLLAB  = 0.25
W_CONTENT = 0.25
W_RFM     = 0.25

def _log_model_failure(model):
    """Log (with traceback) that one model failed and is being skipped."""
    import logging  # local import keeps this block self-contained

    logging.getLogger(__name__).warning(
        "%s recommender failed; continuing without it", model, exc_info=True
    )


def _ensemble_apriori_rows(product_norm):
    """Return (product, 'Apriori', lift) rows for rules whose antecedents contain the product."""
    if 'rules' not in globals() or rules.empty:
        return []
    try:
        mentions_product = rules['antecedents'].apply(
            lambda s: product_norm in [str(i).lower().strip() for i in s]
        )
        rows = []
        for _, rule in rules[mentions_product].iterrows():
            lift_score = float(rule.get('lift', 1))
            # Sets have no defined order; sort so equal-lift items rank reproducibly.
            for name in sorted(str(c).lower().strip() for c in rule['consequents']):
                if name != product_norm:
                    rows.append((name, 'Apriori', lift_score))
        return rows
    except Exception:
        # Best effort: one failing model must not take the ensemble down.
        _log_model_failure('Apriori')
        return []


def _ensemble_ranked_rows(model, fetch, exclude, offset=1):
    """Return (product, model, 1 / (rank + offset)) rows from a ranked recommender.

    `fetch` is a zero-argument callable so that even a missing recommender
    function is reported here instead of aborting the ensemble. `exclude`
    (the product being viewed) is removed before ranks are assigned.
    """
    try:
        names = [str(item).lower().strip() for item in fetch()]
    except Exception:
        # Best effort: one failing model must not take the ensemble down.
        _log_model_failure(model)
        return []
    names = [name for name in names if name != exclude]
    return [(name, model, 1 / (rank + offset)) for rank, name in enumerate(names)]


def ensemble_recommend(product, top_n=5):
    """Combine the four recommenders into one ranked table.

    Each model contributes raw scores (Apriori: rule lift; Collaborative and
    Content: 1 / (rank + 1); RFM: 1 / (rank + 2)). Scores are divided by the
    model's own maximum and multiplied by that model's weight (W_APRIORI,
    W_COLLAB, W_CONTENT, W_RFM). A product proposed more than once keeps
    only its single highest weighted score - scores are not added across
    models. The product being viewed is never recommended back.

    Args:
        product: Product description; lower-cased and stripped before use.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ['Product', 'Model', 'weighted_score'],
        sorted by decreasing score (ties keep the order in which the models
        produced them). Empty, with the same columns, when no model returns
        anything or `top_n` is not positive.
    """
    product_norm = str(product).lower().strip()
    columns = ['Product', 'Model', 'weighted_score']

    rows = _ensemble_apriori_rows(product_norm)
    rows += _ensemble_ranked_rows(
        'Collaborative',
        lambda: collaborative_recommend(product_norm, top_n=10),
        product_norm,
    )
    rows += _ensemble_ranked_rows(
        'Content',
        lambda: content_recommend(product_norm, top_n=10),
        product_norm,
    )
    # NOTE: the larger offset flattens RFM's decay; after the per-model
    # normalisation below its top item still scores 1.0 like every other model.
    rows += _ensemble_ranked_rows(
        'RFM',
        lambda: rfm_recommend(product_norm, top_n=10),
        product_norm,
        offset=2,
    )

    if not rows or top_n <= 0:
        return pd.DataFrame(columns=columns)

    # Largest raw score per model, used to normalise within the model.
    model_peak = {}
    for _, model, raw in rows:
        model_peak[model] = max(raw, model_peak.get(model, raw))

    weight_map = {'Apriori': W_APRIORI, 'Collaborative': W_COLLAB, 'Content': W_CONTENT, 'RFM': W_RFM}

    # Best weighted score per product; the row position is kept as a
    # tie-breaker so the final ordering is deterministic.
    best = {}
    for position, (name, model, raw) in enumerate(rows):
        peak = model_peak[model]
        norm_score = raw / peak if peak > 0 else 0
        weighted = norm_score * weight_map.get(model, 0)
        if name not in best or weighted > best[name][0]:
            best[name] = (weighted, position, model)

    ranked = sorted(best.items(), key=lambda kv: (-kv[1][0], kv[1][1]))[:top_n]
    final_df = pd.DataFrame(
        [(name, model, score) for name, (score, _, model) in ranked],
        columns=columns,
    )

    # Map names back to the spelling stored in df_content when it exists;
    # names without a match are left unchanged.
    if 'df_content' in globals():
        mapping = {d.lower(): d for d in df_content['Description'].astype(str)}
        final_df['Product'] = final_df['Product'].map(mapping).fillna(final_df['Product'])

    return final_df[columns]

# -------------------------
# 6) TEST the ensemble with single input (product page)
# -------------------------
test_product = "set 5 red spotty lid glass bowls"  # change as needed
recommendations = ensemble_recommend(test_product, top_n=5)

# The header emoji had been corrupted by an encoding round-trip
# (UTF-8 bytes shown as Mac Roman); restored to the intended character.
print("\n🛍️ Recommended Products:")
print(recommendations.to_string(index=False))



🛍️ Recommended Products:
                           Product         Model  weighted_score
   set 20 red spotty paper napkins       Apriori        0.250000
          tea bag plate red spotty Collaborative        0.250000
 set 5 blue spotty lid glass bowls       Content        0.250000
          regency cakestand 3 tier           RFM        0.250000
white hanging heart t light holder           RFM        0.166667
