In [3]:
import os
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [4]:
transactions=pd.read_parquet('/kaggle/input/transactions-filtered-pqt/transactions_filtered.pqt')

In [5]:
customers=pd.read_parquet('/kaggle/input/customers/customers.parquet')

In [6]:
articles=pd.read_parquet('/kaggle/input/articles/articles.parquet')

In [8]:
class CandidateGenerator:
    def __init__(self, transactions):
        self.transactions = transactions

    def split_data(self):
        df = self.transactions
        self.train1 = df.loc[(df["t_dat"] >= datetime.datetime(2020,9,8)) & (df['t_dat'] < datetime.datetime(2020,9,16))]
        self.train2 = df.loc[(df["t_dat"] >= datetime.datetime(2020,9,1)) & (df['t_dat'] < datetime.datetime(2020,9,8))]
        self.train3 = df.loc[(df["t_dat"] >= datetime.datetime(2020,8,23)) & (df['t_dat'] < datetime.datetime(2020,9,1))]
        self.train4 = df.loc[(df["t_dat"] >= datetime.datetime(2020,8,15)) & (df['t_dat'] < datetime.datetime(2020,8,23))]
        
        

    def predict_next_item(self):
        
        self.train_all = self.train_all.sort_values(['customer_id', 't_dat'])
        self.train_all['next_item'] = self.train_all.groupby('customer_id')['article_id'].shift(-1)
        
        pairs = self.train_all.dropna(subset=['next_item'])
        pair_counts = pairs.groupby(['article_id', 'next_item']).size().reset_index(name='count')
        
        self.pred_next = {}
        for item, group in pair_counts.groupby('article_id'):
            total = group['count'].sum()
            most_common = group.loc[group['count'].idxmax()]
            ratio = most_common['count'] / total
            if total >= 5 and ratio >= 0.1:
                self.pred_next[item] = most_common['next_item']

    def get_popular_items(self):
        df = self.train_all.copy()
        end_date = df['t_dat'].max()
        df['pop_factor'] = (end_date - df['t_dat']).dt.days
        df['pop_factor'] = 1 / df['pop_factor'].replace(0, 1)
        popular_items_group = df.groupby('article_id')['pop_factor'].sum()
        self.popular_items = popular_items_group.sort_values(ascending=False).index.tolist()

In [9]:
class RuleBasedCandidateGenerator(CandidateGenerator):
    def group_data(self):
        self.positive_items_per_user1 = self.train1.groupby(['customer_id'])['article_id'].apply(list)
        self.positive_items_per_user2 = self.train2.groupby(['customer_id'])['article_id'].apply(list)
        self.positive_items_per_user3 = self.train3.groupby(['customer_id'])['article_id'].apply(list)
        self.positive_items_per_user4 = self.train4.groupby(['customer_id'])['article_id'].apply(list)
        self.train_all = pd.concat([self.train1, self.train2, self.train3, self.train4], axis=0)


        

    def predict(self,users):
        
        positive_items_per_user1 = self.positive_items_per_user1
        positive_items_per_user2 = self.positive_items_per_user2
        positive_items_per_user3 = self.positive_items_per_user3
        positive_items_per_user4 = self.positive_items_per_user4
        pred_next = self.pred_next
        popular_items = self.popular_items
        candidate_data = []
        
        for user in tqdm(users, desc="Generating candidates"):
            user_output = []
            if user in positive_items_per_user1:
                user_output += list(Counter(positive_items_per_user1[user]).keys())[:12]
            if user in positive_items_per_user2:
                user_output += list(Counter(positive_items_per_user2[user]).keys())[:12]
            if user in positive_items_per_user3:
                user_output += list(Counter(positive_items_per_user3[user]).keys())[:12]
            if user in positive_items_per_user4:
                user_output += list(Counter(positive_items_per_user4[user]).keys())[:12]
        
            user_output = list(dict.fromkeys(user_output))
        
            next_preds = [pred_next[item] for item in user_output
                          if item in pred_next and pred_next[item] not in user_output]
            user_output += next_preds
        
            user_output += list(popular_items[:12 - len(user_output)])
        
            for article in list(dict.fromkeys(user_output))[:12]:
                candidate_data.append({'customer_id': user, 'article_id': article})
    
        return pd.DataFrame(candidate_data)

In [None]:
class MlBasedCandidateGenerator(CandidateGenerator):
    def group_data(self):
        self.positive_items_per_user1 = self.train1.groupby(['customer_id'])['article_id'].apply(list)
        self.positive_items_per_user2 = self.train2.groupby(['customer_id'])['article_id'].apply(list)
        self.positive_items_per_user3 = self.train3.groupby(['customer_id'])['article_id'].apply(list)
        self.train_all = pd.concat([self.train1, self.train2, self.train3], axis=0)        

    def make_candidate(self,users):
        
        positive_items_per_user1 = self.positive_items_per_user1
        positive_items_per_user2 = self.positive_items_per_user2
        positive_items_per_user3 = self.positive_items_per_user3

        pred_next = self.pred_next
        popular_items = self.popular_items
        candidate_data = []
        
        for user in tqdm(users, desc="Generating candidates"):
            user_output = []
            if user in positive_items_per_user1:
                user_output += list(Counter(positive_items_per_user1[user]).keys())[:12]
            if user in positive_items_per_user2:
                user_output += list(Counter(positive_items_per_user2[user]).keys())[:12]
            if user in positive_items_per_user3:
                user_output += list(Counter(positive_items_per_user3[user]).keys())[:12]
            
        
            user_output = list(dict.fromkeys(user_output))
        
            next_preds = [pred_next[item] for item in user_output
                          if item in pred_next and pred_next[item] not in user_output]
            user_output += next_preds
        
            user_output += list(popular_items[:24 - len(user_output)])
        
            for article in list(dict.fromkeys(user_output))[:24]:
                candidate_data.append({'customer_id': user, 'article_id': article})
    
        return pd.DataFrame(candidate_data)

    def create_supervised_dataset(self):
        
        make_candidate(self.train4)
        actual_purchases = self.train4[['customer_id', 'article_id']].copy()
        actual_purchases['purchased'] = 1
        
        data = pd.merge(
            self.train_all,
            actual_purchases,
            on=['customer_id', 'article_id'],
            how='left'
        )
        
        data['purchased'].fillna(0, inplace=True)
        data['purchased'] = data['purchased'].astype(int)
        
        return data

    def predict(self):
        

In [10]:
rule_based_candidate_generator=RuleBasedCandidateGenerator(transactions)

In [12]:
rule_based_candidate_generator.split_data()
rule_based_candidate_generator.group_data()
rule_based_candidate_generator.predict_next_item()
rule_based_candidate_generator.get_popular_items()

In [10]:


class CandidatorGenerator:
    def __init__(self, data_path='/kaggle/input/h-and-m-personalized-fashion-recommendations'):
        self.transactions = pd.read_csv(
            f'{data_path}/transactions_train.csv',
            dtype={'article_id': str}
        )
        self.transactions["t_dat"] = pd.to_datetime(self.transactions["t_dat"])

    def split_data(self):
        df = self.transactions
        self.train1 = df.loc[(df["t_dat"] >= datetime.datetime(2020,9,8)) & (df['t_dat'] < datetime.datetime(2020,9,16))]
        self.train2 = df.loc[(df["t_dat"] >= datetime.datetime(2020,9,1)) & (df['t_dat'] < datetime.datetime(2020,9,8))]
        self.train3 = df.loc[(df["t_dat"] >= datetime.datetime(2020,8,23)) & (df['t_dat'] < datetime.datetime(2020,9,1))]
        self.train4 = df.loc[(df["t_dat"] >= datetime.datetime(2020,8,15)) & (df['t_dat'] < datetime.datetime(2020,8,23))]
        
        self.test = df.loc[df["t_dat"] >= datetime.datetime(2020,9,16)]
        
        

    def predict_next_item(self):
        
        self.train_all = self.train_all.sort_values(['customer_id', 't_dat'])
        self.train_all['next_item'] = self.train_all.groupby('customer_id')['article_id'].shift(-1)
        
        pairs = self.train_all.dropna(subset=['next_item'])
        pair_counts = pairs.groupby(['article_id', 'next_item']).size().reset_index(name='count')
        
        self.pred_next = {}
        for item, group in pair_counts.groupby('article_id'):
            total = group['count'].sum()
            most_common = group.loc[group['count'].idxmax()]
            ratio = most_common['count'] / total
            if total >= 5 and ratio >= 0.1:
                self.pred_next[item] = most_common['next_item']

In [11]:
g=RuleBaseCandidatorGenerator()

In [12]:
g.split_data()

In [14]:
g.group_data()

In [15]:
g.predict_next_item()

In [16]:
g.get_popular_items()

In [17]:
g.test.

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548013,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,0786022008,0.048441,2
31548014,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,0913272003,0.032288,2
31548015,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,0889669006,0.056508,2
31548016,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,0237347060,0.033881,1
31548017,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,0562245001,0.013542,1
...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,0918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0833459002,0.006763,1


In [19]:
from collections import Counter
import pandas as pd
from tqdm import tqdm

candidate_data = []
users = g.test['customer_id'].unique()

positive_items_per_user1 = g.positive_items_per_user1
positive_items_per_user2 = g.positive_items_per_user2
positive_items_per_user3 = g.positive_items_per_user3
positive_items_per_user4 = g.positive_items_per_user4
pred_next = g.pred_next
popular_items = g.popular_items

for user in tqdm(users, desc="Generating candidates"):
    user_output = []
    if user in positive_items_per_user1:
        user_output += list(Counter(positive_items_per_user1[user]).keys())[:12]
    if user in positive_items_per_user2:
        user_output += list(Counter(positive_items_per_user2[user]).keys())[:12]
    if user in positive_items_per_user3:
        user_output += list(Counter(positive_items_per_user3[user]).keys())[:12]
    if user in positive_items_per_user4:
        user_output += list(Counter(positive_items_per_user4[user]).keys())[:12]

    user_output = list(dict.fromkeys(user_output))

    next_preds = [pred_next[item] for item in user_output
                  if item in pred_next and pred_next[item] not in user_output]
    user_output += next_preds

    user_output += list(popular_items[:24 - len(user_output)])

    for article in list(dict.fromkeys(user_output))[:24]:
        candidate_data.append({'customer_id': user, 'article_id': article})

candidates_df = pd.DataFrame(candidate_data)



Generating candidates: 100%|██████████| 68984/68984 [00:04<00:00, 17199.85it/s]


MAP@24: 0.025960127052400864


In [5]:
class CandidatorGenerator:

    def __init__(self,data_path='/kaggle/input/h-and-m-personalized-fashion-recommendations'):
        
        self.transactions = pd.read_csv(f'{data_path}/transactions_train.csv', dtype={'article_id': str})
        self.transactions["t_dat"] = pd.to_datetime(self.transactions["t_dat"])

    
    def split_data(self):
        self.train1 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,8)) & (transactions['t_dat'] < datetime.datetime(2020,9,16))]
        self.train2 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,1)) & (transactions['t_dat'] < datetime.datetime(2020,9,8))]
        self.train3 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,8,23)) & (transactions['t_dat'] < datetime.datetime(2020,9,1))]
        self.train4 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,8,15)) & (transactions['t_dat'] < datetime.datetime(2020,8,23))]
        
        self.test = transactions.loc[transactions["t_dat"] >= datetime.datetime(2020,9,16)]
        

    def predict_next_item(self):
        
        self.train_all = self.train_all.sort_values(['customer_id', 't_dat'])
        self.user_group = self.train_all.groupby(['customer_id'])['article_id'].apply(list)
        
        next_items = {}
        for user in tqdm(user_group.keys(), desc="Processing next items"):
            items = user_group[user]
            for i,item in enumerate(items[:-1]):
                if item not in next_items:
                    next_items[item] = []
                if item != items[i+1]:
                    next_items[item].append(items[i+1])
    
        self.pred_next = {}
        for item in next_items:
            if len(next_items[item]) >= 5:
                most_common = Counter(next_items[item]).most_common()
                ratio = most_common[0][1]/len(next_items[item])
                if ratio >= 0.1:
                    self.pred_next[item] = most_common[0][0]
    
    
        

In [3]:

    print("Grouping purchases by user for different time windows...")
    positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

    # --- Next item prediction logic ---
    print("Calculating next-item predictions...")
    train_all = pd.concat([train1, train2, train3, train4], axis=0)
    train_all = train_all.sort_values(['customer_id', 't_dat'])
    user_group = train_all.groupby(['customer_id'])['article_id'].apply(list)
    
    next_items = {}
    for user in tqdm(user_group.keys(), desc="Processing next items"):
        items = user_group[user]
        for i,item in enumerate(items[:-1]):
            if item not in next_items:
                next_items[item] = []
            if item != items[i+1]:
                next_items[item].append(items[i+1])

    pred_next = {}
    for item in next_items:
        if len(next_items[item]) >= 5:
            most_common = Counter(next_items[item]).most_common()
            ratio = most_common[0][1]/len(next_items[item])
            if ratio >= 0.1:
                pred_next[item] = most_common[0][0]

    # --- Popular items logic ---
    print("Calculating popular items...")
    train_pop = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,1)) & (transactions['t_dat'] < datetime.datetime(2020,9,16))]
    train_pop['pop_factor'] = train_pop['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)
    popular_items_group = train_pop.groupby(['article_id'])['pop_factor'].sum()
    _, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

    # --- Generate candidate list for each user ---
    print("Generating final candidate lists for validation users...")
    candidate_data = []
    for user in tqdm(val_users, desc="Generating candidates"):
        user_output = []
        if user in positive_items_per_user1:
            user_output += list(Counter(positive_items_per_user1[user]).keys())[:12]
        if user in positive_items_per_user2:
            user_output += list(Counter(positive_items_per_user2[user]).keys())[:12]
        if user in positive_items_per_user3:
            user_output += list(Counter(positive_items_per_user3[user]).keys())[:12]
        if user in positive_items_per_user4:
            user_output += list(Counter(positive_items_per_user4[user]).keys())[:12]
        
        user_output = list(dict.fromkeys(user_output))

        next_preds = [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]
        user_output += next_preds
        
        user_output += list(popular_items[:24 - len(user_output)])
        
        for article in list(dict.fromkeys(user_output))[:24]:
            candidate_data.append({'customer_id': user, 'article_id': article})
            
    candidates_df = pd.DataFrame(candidate_data)
    return candidates_df, val




In [53]:
def create_supervised_dataset(candidates_df, validation_df):
    """Merges candidates with actual purchases to create a labeled dataset."""
    print("Creating supervised dataset with labels...")
    
    actual_purchases = validation_df[['customer_id', 'article_id']].copy()
    actual_purchases['purchased'] = 1
    
    data = pd.merge(
        candidates_df,
        actual_purchases,
        on=['customer_id', 'article_id','t_dat'],
        how='left'
    )
    
    data['purchased'].fillna(0, inplace=True)
    data['purchased'] = data['purchased'].astype(int)
    
    return data


In [54]:

def add_features(df, customers, articles, transactions):
    """Adds customer, article, and popularity features."""
    print("Adding features...")
    
    df = pd.merge(df, customers, on='customer_id', how='left')
    df = pd.merge(df, articles, on='article_id', how='left')
    
    transactions['week'] = (transactions['t_dat'] - transactions['t_dat'].min()).dt.days // 7
    
    sales = transactions.groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    sales = sales.reset_index()
    
    val_week = transactions['week'].max() + 1
    sales.week += 1
    
    bestsellers_for_val = sales[sales.week == val_week]
    
    df = pd.merge(df, bestsellers_for_val[['article_id', 'bestseller_rank']], on='article_id', how='left')
    df['bestseller_rank'].fillna(999, inplace=True)

    # Clean up dtypes and fill NaNs
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['customer_id', 'article_id']:
             df[col].fillna('Unknown', inplace=True)
        elif pd.api.types.is_numeric_dtype(df[col]):
            df[col].fillna(-1, inplace=True)

    return df


In [55]:


print("Starting script to create supervised learning dataset...")

# Load all necessary data
transactions, customers, articles = load_data()


Starting script to create supervised learning dataset...
Loading data...


In [56]:
supervised_df

Unnamed: 0,customer_id,article_id,purchased
0,000fb6e772c5d0023892065e659963da90b1866035558e...,0884319006,0
1,000fb6e772c5d0023892065e659963da90b1866035558e...,0762846026,0
2,000fb6e772c5d0023892065e659963da90b1866035558e...,0919273004,1
3,000fb6e772c5d0023892065e659963da90b1866035558e...,0927922002,0
4,000fb6e772c5d0023892065e659963da90b1866035558e...,0611415034,0
...,...,...,...
1653483,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0448509014,0
1653484,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0918292001,0
1653485,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0923758001,0
1653486,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0915529003,0


In [57]:

# Generate candidates based on notebook logic
candidates, validation_set = generate_candidates(transactions)

Splitting data into time windows...
Grouping purchases by user for different time windows...
Calculating next-item predictions...


Processing next items: 100%|██████████| 254618/254618 [00:01<00:00, 133006.47it/s]


Calculating popular items...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Generating final candidate lists for validation users...


Generating candidates: 100%|██████████| 68984/68984 [00:03<00:00, 18884.29it/s]


In [58]:
candidates

Unnamed: 0,customer_id,article_id
0,000fb6e772c5d0023892065e659963da90b1866035558e...,0884319006
1,000fb6e772c5d0023892065e659963da90b1866035558e...,0762846026
2,000fb6e772c5d0023892065e659963da90b1866035558e...,0919273004
3,000fb6e772c5d0023892065e659963da90b1866035558e...,0927922002
4,000fb6e772c5d0023892065e659963da90b1866035558e...,0611415034
...,...,...
1651218,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0448509014
1651219,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0918292001
1651220,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0923758001
1651221,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0915529003


In [59]:
# Create the base supervised dataset (customer, article, purchased_label)
supervised_df = create_supervised_dataset(candidates, validation_set)

Creating supervised dataset with labels...


KeyError: 't_dat'

In [None]:




# Add features to the supervised dataset
final_df = add_features(supervised_df, customers, articles, transactions)

# Save the result
output_path = 'supervised_dataset.parquet'
print(f"Saving final dataset to {output_path}...")
final_df.to_parquet(output_path, index=False)

print("Script finished successfully!")
print("\nFinal DataFrame head:")
print(final_df.head())
print("\nColumns:", final_df.columns.tolist())
print("\nShape:", final_df.shape)

In [84]:
final_df

Unnamed: 0,customer_id,article_id,purchased,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,product_code,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,bestseller_rank,query_id
0,000fb6e772c5d0023892065e659963da90b1866035558e...,0884319006,0,1.0,1.0,ACTIVE,Regularly,42.0,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,884319,...,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1010,Blouses,Blouse in an airy weave with a stand-up collar...,999.0,000fb6e772c5d0023892065e659963da90b1866035558e...
1,000fb6e772c5d0023892065e659963da90b1866035558e...,0762846026,0,1.0,1.0,ACTIVE,Regularly,42.0,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,762846,...,Ladieswear,1,Ladieswear,11,Womens Tailoring,1010,Blouses,Long-sleeved blouse in woven fabric with a col...,999.0,000fb6e772c5d0023892065e659963da90b1866035558e...
2,000fb6e772c5d0023892065e659963da90b1866035558e...,0919273004,1,1.0,1.0,ACTIVE,Regularly,42.0,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,919273,...,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1010,Blouses,Blouse in soft lace made from a cotton blend w...,999.0,000fb6e772c5d0023892065e659963da90b1866035558e...
3,000fb6e772c5d0023892065e659963da90b1866035558e...,0927922002,0,1.0,1.0,ACTIVE,Regularly,42.0,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,927922,...,Ladieswear,1,Ladieswear,11,Womens Tailoring,1010,Blouses,Straight-cut blouse in satin crêpe with a boat...,999.0,000fb6e772c5d0023892065e659963da90b1866035558e...
4,000fb6e772c5d0023892065e659963da90b1866035558e...,0611415034,0,1.0,1.0,ACTIVE,Regularly,42.0,68ca4d9d6051d9c10b917d36bf9cb4afbadc551f7e4feb...,611415,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Brazilian briefs in lace with a low waist, lin...",999.0,000fb6e772c5d0023892065e659963da90b1866035558e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653483,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0448509014,0,1.0,1.0,ACTIVE,Regularly,29.0,47258851e6f73dd2583ef4775814f9b88e43a9e2741c64...,448509,...,Divided,2,Divided,53,Divided Collection,1009,Trousers,"5-pocket, ankle-length jeans in washed, sturdy...",10.0,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...
1653484,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0918292001,0,1.0,1.0,ACTIVE,Regularly,29.0,47258851e6f73dd2583ef4775814f9b88e43a9e2741c64...,918292,...,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports tights in fast-drying functional fabric...,999.0,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...
1653485,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0923758001,0,1.0,1.0,ACTIVE,Regularly,29.0,47258851e6f73dd2583ef4775814f9b88e43a9e2741c64...,923758,...,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1010,Blouses,"Wide shirt in a cotton weave with a collar, bu...",4.0,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...
1653486,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0915529003,0,1.0,1.0,ACTIVE,Regularly,29.0,47258851e6f73dd2583ef4775814f9b88e43a9e2741c64...,915529,...,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1003,Knitwear,"Jumper in a soft, fine knit containing some wo...",9.0,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...


In [80]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm.sklearn import LGBMRanker
from sklearn.metrics import ndcg_score

X = final_df.drop(columns=['purchased'])
y = final_df['purchased']


CPU times: user 370 ms, sys: 595 ms, total: 965 ms
Wall time: 964 ms


In [81]:

for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [82]:

train_group = X_train.groupby('customer_id').size().tolist()
test_group = X_test.groupby('customer_id').size().tolist()

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=100
)

ranker.fit(
    X_train, y_train,
    group=train_group,
    eval_set=[(X_test, y_test)],
    eval_group=[test_group],
    eval_at=[5, 10]
)


[1]	valid_0's ndcg@5: 0.986183	valid_0's ndcg@10: 0.987114
[2]	valid_0's ndcg@5: 0.986574	valid_0's ndcg@10: 0.987483
[3]	valid_0's ndcg@5: 0.986751	valid_0's ndcg@10: 0.987625
[4]	valid_0's ndcg@5: 0.986709	valid_0's ndcg@10: 0.987576
[5]	valid_0's ndcg@5: 0.986791	valid_0's ndcg@10: 0.987665
[6]	valid_0's ndcg@5: 0.986778	valid_0's ndcg@10: 0.98762
[7]	valid_0's ndcg@5: 0.986841	valid_0's ndcg@10: 0.987709
[8]	valid_0's ndcg@5: 0.986908	valid_0's ndcg@10: 0.987778
[9]	valid_0's ndcg@5: 0.986893	valid_0's ndcg@10: 0.987754
[10]	valid_0's ndcg@5: 0.986932	valid_0's ndcg@10: 0.98777
[11]	valid_0's ndcg@5: 0.986882	valid_0's ndcg@10: 0.987721
[12]	valid_0's ndcg@5: 0.98688	valid_0's ndcg@10: 0.987718
[13]	valid_0's ndcg@5: 0.986903	valid_0's ndcg@10: 0.987736
[14]	valid_0's ndcg@5: 0.986892	valid_0's ndcg@10: 0.987725
[15]	valid_0's ndcg@5: 0.986922	valid_0's ndcg@10: 0.987759
[16]	valid_0's ndcg@5: 0.986946	valid_0's ndcg@10: 0.987779
[17]	valid_0's ndcg@5: 0.986952	valid_0's ndcg@10: 0

LGBMRanker(learning_rate=0.05, metric='ndcg', objective='lambdarank')

In [83]:

y_pred = ranker.predict(X_test)
X_test['score'] = y_pred
X_test['customer_id'] = final_df.loc[X_test.index, 'customer_id']
ranked = X_test.sort_values(by=['customer_id', 'score'], ascending=[True, False])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
%%time
from sklearn.metrics import average_precision_score
import numpy as np

X_test = X_test.copy()
X_test.loc[:, 'score'] = y_pred
X_test.loc[:, 'true_label'] = y_test.values
X_test.loc[:, 'customer_id'] = final_df.loc[X_test.index, 'customer_id'].values

grouped = X_test.groupby('customer_id')
average_precisions = []
K = 12

for _, group in grouped:
    top_k = group.sort_values('score', ascending=False).head(K)
    y_true = top_k['true_label'].values
    y_scores = top_k['score'].values
    if y_true.sum() == 0:
        average_precisions.append(0)
    else:
        ap = average_precision_score(y_true, y_scores)
        average_precisions.append(ap)

map_score = np.mean(average_precisions)
print("MAP@12 Score:", map_score)

In [85]:
%%time
from sklearn.metrics import average_precision_score
import numpy as np

X_test = X_test.copy()
X_test.loc[:, 'true_label'] = y_test.values
X_test.loc[:, 'customer_id'] = final_df.loc[X_test.index, 'customer_id'].values

grouped = X_test.groupby('customer_id')
average_precisions = []
K = 12

for _, group in grouped:
    top_k = group.head(K)
    y_true = top_k['true_label'].values
    y_scores = np.arange(len(y_true), 0, -1)  # 임의 점수: 뒤로 갈수록 낮게
    if y_true.sum() == 0:
        average_precisions.append(0)
    else:
        ap = average_precision_score(y_true, y_scores)
        average_precisions.append(ap)

map_score = np.mean(average_precisions)
print("MAP@12 Score (no model):", map_score)

MAP@12 Score (no model): 0.019156676870812826
CPU times: user 9.88 s, sys: 351 ms, total: 10.2 s
Wall time: 10.2 s


In [89]:
X_test['true_label'].value_counts()

0    327627
1      3071
Name: true_label, dtype: int64

In [93]:
2*(3071/327627)

0.01874692867193485