In [1]:
import multiprocessing
import random
from collections import Counter
from functools import partial
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import torch
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Pick the best available torch backend instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='mps')

In [3]:
# Load the raw order lines; the file is ';'-separated.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its row index — confirm whether that column is needed.
df_full = pd.read_csv('full_orders_v5.csv', sep=';')

In [4]:
# Preview the first raw order rows.
df_full.head()

Unnamed: 0,anon_id_encrypred,articul_encrypred,color_base,sizeid,size_title,order_date,store,brand,ktt1,ktt2,ktt3,ktt4,title,product_id,product_created_at,base_price,net_price,sale_percentage
0,wyyypqqtprtqppuq,ttwyyup,Бежевый,48,48,2019-01-03,E,Eton,Товары для мужчин,Одежда,"Сорочки, рубашки",Сорочки,Хлопковая сорочка,12025510,2017-05-17,13600.0,13600.0,0.0
1,wyyypqqtptpuuwpy,twvytqp,Чёрный,S,80-85,2019-01-04,E,Coccinelle,Товары для женщин,Аксессуары из кожи,Ремни,Ремень,Кожаный ремень,13084351,2018-12-25,9600.0,8640.0,0.1
2,wyyypqqtpuupxxvx,twtxyxt,Чёрный,NS,NS,2019-01-03,E,Benedetta Bruzziches,Товары для женщин,Сумки,Сумки вечерние,Сумка мини вечерняя,Сумка Best Friend,13050017,2018-12-04,28950.0,28950.0,0.0
3,wyyypqqtputvywvt,twuvvpq,Чёрный,XL,52,2019-01-09,E,Dsquared,Товары для мужчин,Одежда,Одежда джерси,Футболка джерси,Хлопковая футболка,12907421,2018-08-29,13850.0,13850.0,0.0
4,wyyypqqtqvvrwwrv,ttyxryu,Голубой,XS,42,2019-01-05,E,Les Tien,Товары для женщин,Одежда,Одежда джерси,Брюки джерси,Хлопковые джоггеры,12216505,2017-08-11,30750.0,30750.0,0.0


In [5]:
# Customers with more than this many distinct order dates are flagged as
# resellers by the filtering cell below and excluded.
max_purchase_days = 100
# Minimum number of order rows a customer must have to be kept.
min_purchases = 2

In [6]:
# Distinct order dates per customer; anyone active on more than
# `max_purchase_days` different days is treated as a reseller.
purchase_days = (
    df_full.groupby('anon_id_encrypred')['order_date']
    .nunique()
    .reset_index(name='unique_purchase_days')
)
resellers = purchase_days.loc[
    purchase_days['unique_purchase_days'] > max_purchase_days,
    'anon_id_encrypred',
]

# Order rows per customer.
user_purchase_counts = df_full['anon_id_encrypred'].value_counts()

# Keep customers with at least `min_purchases` rows who are not resellers.
df_filtered = df_full[
    df_full['anon_id_encrypred'].isin(
        user_purchase_counts[user_purchase_counts >= min_purchases].index
    )
    & ~df_full['anon_id_encrypred'].isin(resellers)
]

In [7]:
# Preview the filtered orders.
df_filtered.head()

Unnamed: 0,anon_id_encrypred,articul_encrypred,color_base,sizeid,size_title,order_date,store,brand,ktt1,ktt2,ktt3,ktt4,title,product_id,product_created_at,base_price,net_price,sale_percentage
0,wyyypqqtprtqppuq,ttwyyup,Бежевый,48,48,2019-01-03,E,Eton,Товары для мужчин,Одежда,"Сорочки, рубашки",Сорочки,Хлопковая сорочка,12025510,2017-05-17,13600.0,13600.0,0.0
1,wyyypqqtptpuuwpy,twvytqp,Чёрный,S,80-85,2019-01-04,E,Coccinelle,Товары для женщин,Аксессуары из кожи,Ремни,Ремень,Кожаный ремень,13084351,2018-12-25,9600.0,8640.0,0.1
2,wyyypqqtpuupxxvx,twtxyxt,Чёрный,NS,NS,2019-01-03,E,Benedetta Bruzziches,Товары для женщин,Сумки,Сумки вечерние,Сумка мини вечерняя,Сумка Best Friend,13050017,2018-12-04,28950.0,28950.0,0.0
5,wyyysqqtqpwxpqxp,ttxrppx,Белый,28,46,2019-01-03,E,AG Adriano Goldschmied,Товары для женщин,Одежда,Одежда джинсовая,Джинсы,Джинсы Ex-boyfriend Slim,12263976,2017-08-29,26250.0,26250.0,0.0
6,wyyyrqqtqqrspyuv,twrxyqu,Зелёный,M,46,2019-01-04,E,MRZ,Товары для женщин,Одежда,Одежда джерси,Блузы джерси,Шерстяной кардиган,12831366,2018-07-16,54900.0,54900.0,0.0


In [8]:
# Compare (rows, columns) after vs. before filtering.
df_filtered.shape, df_full.shape

((3231102, 18), (3485912, 18))

In [16]:
# Persist the filtered orders (row index omitted) so later runs can reuse them.
df_filtered.to_csv('filtered_orders_2.csv', index=False)

---

In [9]:
# Work on the in-memory filtered frame (a copy was also written to
# 'filtered_orders_2.csv' above) and order each customer's rows chronologically.
df = df_filtered.sort_values(by=['anon_id_encrypred', 'order_date'])

In [10]:
# Per-customer chronological split: the first int(len * split_ratio) rows of each
# customer (in the current row order of `df`) go to train, the rest to test.
# Implemented as one vectorized mask instead of slicing every customer group in a
# Python loop and concatenating the pieces.
split_ratio = 0.8

_rows_by_user = df.groupby('anon_id_encrypred', sort=False)
_position_in_user = _rows_by_user.cumcount()                      # 0-based row rank within customer
_rows_per_user = _rows_by_user['anon_id_encrypred'].transform('size')
_train_rows_per_user = (_rows_per_user * split_ratio).astype(int)  # same truncation as int(len * ratio)
_is_train = _position_in_user < _train_rows_per_user

# `df` is expected to be sorted by customer and order date, so keeping its row
# order yields the same customer-by-customer layout as concatenating groups.
train_df = df[_is_train]
test_df = df[~_is_train]

train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

---

In [13]:
# Reload the persisted split so the cells below can run without redoing the split.
train_df, test_df = (
    pd.read_csv(path) for path in ('train_data.csv', 'test_data.csv')
)

In [11]:
# Ground truth for evaluation: customer id -> set of product ids in the test split.
test_target_items = {
    user: set(products)
    for user, products in test_df.groupby('anon_id_encrypred')['product_id']
}

In [14]:
# Load user features and index them by customer id (the id column is kept too).
user_features_df = pd.read_csv('user_features.csv', index_col=0)
user_features_df.index = user_features_df['anon_id_encrypred']

# Numeric per-customer features used for clustering.
feature_columns = [
    'total_receipts', 'purchase_frequency', 'seasonality_var', 'holiday_ratio',
    'avg_receipt_value', 'avg_discount', 'premium_ratio', 'brand_entropy',
    'collection_freshness', 'favourite_brand_ratio'
]

# Standardize features so every column contributes on the same scale to the
# Euclidean distances used by k-means.
scaler = StandardScaler()
user_features_scaled = scaler.fit_transform(user_features_df[feature_columns])
user_features_scaled_df = pd.DataFrame(
    user_features_scaled,
    index=user_features_df.index,
    columns=feature_columns
)

# Create user clusters.
# n_init is set explicitly: its default changed across scikit-learn releases
# (10 -> 'auto'), so leaving it implicit makes the clusters depend on the
# installed version and triggers a FutureWarning on some of them.
n_clusters = 8  # You can tune this
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
user_features_df['cluster'] = kmeans.fit_predict(user_features_scaled)

In [15]:
# Build the user x item purchase-count matrix as a sparse CSR matrix
# (a dense frame of this size would not fit in memory).
unique_users = train_df['anon_id_encrypred'].unique()
all_items = train_df['product_id'].unique()

# Mappings between raw ids and matrix row/column positions.
user_to_index = {user: i for i, user in enumerate(unique_users)}
item_to_index = {item: i for i, item in enumerate(all_items)}
index_to_user = {i: user for user, i in user_to_index.items()}
index_to_item = {i: item for item, i in item_to_index.items()}

# One (row, column) coordinate per training order row.
user_indices = train_df['anon_id_encrypred'].map(user_to_index).to_numpy()
item_indices = train_df['product_id'].map(item_to_index).to_numpy()

# Each order row contributes 1. The CSR constructor sums duplicate coordinates,
# so repeated purchases of the same product by the same customer end up as an
# integer count without a separate counting pass.
user_item_matrix = sparse.csr_matrix(
    (np.ones(len(train_df), dtype=np.int64), (user_indices, item_indices)),
    shape=(len(unique_users), len(all_items))
)

In [16]:
# Columns of the user-item matrix are the distinct products in the training split.
print(f'Number of items: {user_item_matrix.shape[1]}')

Number of items: 288306


In [17]:
# Transpose so that every row is one item described by the users who bought it.
item_user_matrix = user_item_matrix.transpose().tocsr()

# Don't request more neighbors than there are other items.
n_neighbors = min(50, item_user_matrix.shape[0] - 1)

# Brute-force cosine nearest-neighbour index over the item vectors.
model = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
    n_neighbors=n_neighbors,
)
model.fit(item_user_matrix)

In [18]:
def get_similar_items(item_id, n_recommendations=5):
    """Return up to ``n_recommendations`` item ids closest to ``item_id``.

    Neighbours come from the fitted module-level ``model`` queried with the
    item's row of ``item_user_matrix``.

    Args:
        item_id: Raw product id. Unknown ids yield an empty list.
        n_recommendations: Maximum number of similar items to return.

    Returns:
        List of product ids ordered as returned by the neighbour search,
        never containing ``item_id`` itself.
    """
    if item_id not in item_to_index or n_recommendations <= 0:
        return []

    item_idx = item_to_index[item_id]
    item_vector = item_user_matrix[item_idx:item_idx + 1]

    # Ask for one extra neighbour to leave room for the query item itself, but
    # never for more neighbours than there are fitted items (kneighbors raises).
    n_query = min(n_recommendations + 1, item_user_matrix.shape[0])
    _, indices = model.kneighbors(item_vector, n_neighbors=n_query)

    # The query item is not guaranteed to come back first: items with identical
    # vectors are all at distance 0. Drop it by index rather than by position.
    neighbour_idxs = [idx for idx in indices.flatten() if idx != item_idx]
    return [index_to_item[idx] for idx in neighbour_idxs[:n_recommendations]]

In [19]:
def _purchased_items(user_idx):
    """Return the item ids with a positive count in row ``user_idx``.

    Reads the CSR row directly instead of densifying it, so the cost depends on
    the user's number of purchases rather than on the catalogue size. Ids are
    returned in ascending column order, matching a scan of the dense row.
    """
    start = user_item_matrix.indptr[user_idx]
    end = user_item_matrix.indptr[user_idx + 1]
    columns = user_item_matrix.indices[start:end]
    counts = user_item_matrix.data[start:end]
    return [index_to_item[col] for col in sorted(set(columns[counts > 0].tolist()))]


def get_cf_recommendations(user_id, n_recommendations=10, n_similar_per_item=5):
    """Recommend items for a user based on their purchase history.

    For every item the user bought, its nearest items are collected via
    ``get_similar_items``; candidates are ranked by how many purchased items
    they are similar to.

    Args:
        user_id: Raw customer id. Unknown ids yield an empty list.
        n_recommendations: Maximum number of items to return.
        n_similar_per_item: Neighbours fetched per purchased item.

    Returns:
        List of product ids, most frequently suggested first, excluding items
        the user already bought.
    """
    if user_id not in user_to_index:
        return []

    purchased_items = _purchased_items(user_to_index[user_id])

    # Vote counting: each purchased item votes for its nearest neighbours.
    recommendation_counts = Counter()
    for item in purchased_items:
        recommendation_counts.update(
            get_similar_items(item, n_recommendations=n_similar_per_item)
        )

    # Never recommend something the user already owns.
    for item in purchased_items:
        recommendation_counts.pop(item, None)

    return [item for item, _ in recommendation_counts.most_common(n_recommendations)]


def get_feature_based_recommendations(user_id, n_recommendations=10, max_similar_users=50):
    """Get recommendations based on user features and clusters.

    Counts the items bought by other members of the user's cluster and returns
    the most common ones the user has not bought.

    Args:
        user_id: Raw customer id. Ids missing from ``user_features_df`` yield
            an empty list.
        n_recommendations: Maximum number of items to return.
        max_similar_users: How many cluster members (in ``user_features_df``
            order, target user excluded) contribute purchases.

    Returns:
        List of product ids, most common among the similar users first.
    """
    if user_id not in user_features_df.index:
        return []

    user_cluster = user_features_df.loc[user_id, 'cluster']
    in_cluster = (user_features_df['cluster'] == user_cluster).to_numpy()

    # Only the leading members are ever used, so take one spare entry (in case
    # the target user is among them) instead of listing the whole cluster.
    similar_users = user_features_df.index[in_cluster][:max_similar_users + 1].tolist()
    if user_id in similar_users:
        similar_users.remove(user_id)
    similar_users = similar_users[:max_similar_users]

    # Count how often each item was bought across the similar users.
    item_counts = Counter()
    for similar_user in similar_users:
        if similar_user in user_to_index:
            item_counts.update(_purchased_items(user_to_index[similar_user]))

    # Items the target user already owns (empty if the user has no training rows).
    purchased_items = set()
    if user_id in user_to_index:
        purchased_items = set(_purchased_items(user_to_index[user_id]))

    # Look at twice as many candidates as needed so filtering still leaves enough.
    top_items = [
        item
        for item, _ in item_counts.most_common(n_recommendations * 2)
        if item not in purchased_items
    ]
    return top_items[:n_recommendations]

In [21]:
import multiprocessing
from functools import partial
from tqdm import tqdm

# For Jupyter notebooks, we need to explicitly set the start method for multiprocessing
# This needs to be called at the top level of your notebook
# force=True replaces any start method already chosen in this kernel.
# NOTE(review): with 'spawn', worker processes start from a fresh interpreter and
# must re-import whatever they are asked to run; functions defined in notebook
# cells are presumably not importable there, which can leave a
# multiprocessing.Pool stalled without raising — confirm before relying on it.
multiprocessing.set_start_method('spawn', force=True)

def _evaluate_user(user_id, k, test_targets, known_users, get_recs_func):
    """Score the top-``k`` recommendations of one user against the test purchases.

    Returns ``None`` when ``user_id`` is not in ``known_users`` (the model cannot
    produce recommendations for it); otherwise a dict with the hit count, the
    number of held-out items and the per-user precision.
    """
    if user_id not in known_users:
        return None

    actual_items = test_targets[user_id]
    recommended = get_recs_func(user_id, n_recommendations=k)
    correct_count = len(set(recommended) & actual_items)

    # Precision is relative to the number of items actually recommended; a user
    # with no recommendations counts as precision 0.
    precision = correct_count / len(recommended) if recommended else 0

    return {
        'user_id': user_id,
        'correct_count': correct_count,
        'total_actual': len(actual_items),
        'precision': precision
    }


def evaluate_user_cf(user_id, k, test_targets, user_idx_map, get_recs_func):
    """Evaluate a single user with the CF approach.

    Args:
        user_id: Customer to evaluate.
        k: Number of recommendations requested.
        test_targets: Mapping customer id -> set of held-out item ids.
        user_idx_map: Container of customers known to the CF model.
        get_recs_func: Callable ``(user_id, n_recommendations=k) -> list``.

    Returns:
        Per-user result dict, or ``None`` if the user is unknown to the model.
    """
    return _evaluate_user(user_id, k, test_targets, user_idx_map, get_recs_func)


def evaluate_user_feature(user_id, k, test_targets, feature_idx_map, get_recs_func):
    """Evaluate a single user with the feature-based approach.

    Same contract as ``evaluate_user_cf``; ``feature_idx_map`` is the container
    of customers that have feature rows.
    """
    return _evaluate_user(user_id, k, test_targets, feature_idx_map, get_recs_func)


def _run_evaluation(label, progress_label, evaluate_user, k_list, n_sample, n_workers, seed):
    """Shared driver: sample test users, score them for every ``k``, report metrics.

    ``evaluate_user`` is a callable ``(user_id, k=...) -> dict | None``.
    Precision is the mean of per-user precisions; recall is pooled
    (total hits / total held-out items) over the scored users.
    """
    results = {'precision': {}, 'recall': {}}

    # Without a seed the global `random` state is used, as before; with a seed
    # the sample is reproducible, so different models can be compared on
    # exactly the same users.
    rng = random if seed is None else random.Random(seed)
    candidates = list(test_target_items.keys())
    sample_users = rng.sample(candidates, min(n_sample, len(candidates)))

    for k in k_list:
        print(f"Evaluating {label} @{k}...")
        score_user = partial(evaluate_user, k=k)
        desc = f"{progress_label} @{k}"

        if n_workers is None or n_workers <= 1:
            outcomes = [score_user(user_id) for user_id in tqdm(sample_users, desc=desc)]
        else:
            # Threads instead of processes: they share the in-memory matrices and
            # fitted model, need no pickling of arguments, and work for functions
            # defined in notebook cells. Errors raised by a worker propagate here.
            with ThreadPool(processes=n_workers) as pool:
                outcomes = list(tqdm(
                    pool.imap(score_user, sample_users),
                    total=len(sample_users),
                    desc=desc
                ))

        # Users unknown to the model are skipped.
        user_results = [r for r in outcomes if r is not None]

        correct_predictions = sum(r['correct_count'] for r in user_results)
        total_actual = sum(r['total_actual'] for r in user_results)
        total_precision = sum(r['precision'] for r in user_results)
        processed_users = len(user_results)

        results['precision'][k] = total_precision / processed_users if processed_users > 0 else 0
        results['recall'][k] = correct_predictions / total_actual if total_actual > 0 else 0

        print(f"--- {label} Results for k={k} ---")
        print(f"Precision@{k}: {results['precision'][k]:.4f}")
        print(f"Recall@{k}: {results['recall'][k]:.4f}")
        print()

    return results


def evaluate_cf_recommendations(k_list=(3, 5, 10), n_sample=10_000, n_workers=None, seed=None):
    """Evaluate the item-based CF recommender on a sample of test users.

    Args:
        k_list: Cut-offs to evaluate.
        n_sample: Maximum number of test users to sample.
        n_workers: Number of worker threads; ``None`` or ``<= 1`` runs sequentially.
        seed: Optional seed for a reproducible user sample.

    Returns:
        ``{'precision': {k: value}, 'recall': {k: value}}``.
    """
    evaluate_user = partial(
        evaluate_user_cf,
        test_targets=test_target_items,
        user_idx_map=user_to_index,
        get_recs_func=get_cf_recommendations
    )
    return _run_evaluation('CF', 'CF', evaluate_user, k_list, n_sample, n_workers, seed)


def evaluate_feature_recommendations(k_list=(3, 5, 10), n_sample=10_000, n_workers=None, seed=None):
    """Evaluate the cluster/feature-based recommender on a sample of test users.

    Arguments and return value match ``evaluate_cf_recommendations``.
    """
    evaluate_user = partial(
        evaluate_user_feature,
        test_targets=test_target_items,
        feature_idx_map=user_features_df.index,
        get_recs_func=get_feature_based_recommendations
    )
    return _run_evaluation('Feature-based', 'Feature', evaluate_user, k_list, n_sample, n_workers, seed)

# Example usage:
# results_cf = evaluate_cf_recommendations(k_list=[5], n_sample=10000, n_workers=6)
# results_feature = evaluate_feature_recommendations(k_list=[5], n_sample=10000, n_workers=6)

In [None]:
# Evaluate item-based CF at k=5 on up to 10,000 sampled test users, requesting 10 workers.
results_cf = evaluate_cf_recommendations(k_list=[5], n_sample=10000, n_workers=10)

Evaluating CF @5...


CF @5:   0%|                                                                                 | 0/10000 [00:00<?, ?it/s]

In [37]:
# Evaluate the cluster-based recommender at k=5 on up to 10,000 sampled test users, requesting 10 workers.
results_feature_based = evaluate_feature_recommendations(k_list=[5], n_sample=10000, n_workers=10)

Evaluating Feature-based @5...


Feature @5: 100%|██████████| 10000/10000 [16:48<00:00,  9.91it/s]


--- Feature-based Results for k=5 ---
Precision@5: 0.0001
Recall@5: 0.0002

