In [1]:
import random
from typing import Optional

import pandas as pd
import lightgbm
from lightgbm.callback import early_stopping

TARGET = 'target'
REQUEST_ID = 'request_id'

In [2]:
pd.__version__

'1.3.5'

In [3]:
def df_to_dict(df: pd.DataFrame):
    return df.groupby(REQUEST_ID).item_id.apply(list).to_dict()

def mapk_df(df_with_prediction: pd.DataFrame, k: int) -> float:
    df_sorted = df_with_prediction.sort_values(['request_id', 'prediction'], ascending=[True, False])
    df_sorted['rank'] = df_sorted.groupby('request_id').cumcount()
    top = df_sorted[df_sorted['rank'] < k]
    pred_dict = df_to_dict(top)
    ans_dict = df_to_dict(df_sorted.query('target == 1'))
    
    predicted = []
    actual = []
    for k in ans_dict:
        predicted.append(pred_dict[k])
        actual.append(ans_dict[k])
        
    return mapk(actual, predicted, k)


def apk(actual, predicted, k=30):

    if not actual:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=30):
    return sum([apk(a, p, k) for a, p in zip(actual, predicted)]) / len(actual)
    

In [14]:
def get_group_for_lgb(sorted_vals):
    curr_val = sorted_vals[0]
    cnt = 1
    groups = []
    for val in sorted_vals[1:]:
        if val != curr_val:
            groups.append(cnt)
            curr_val = val
            cnt = 1
        else:
            cnt += 1

    groups.append(cnt)

    return groups


def train_val_test_split(
        df: pd.DataFrame,
        split_column: str,
        val_part: float = 0.1,
        test_part: float = 0.1,
        seed: int = 32,
) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    random.seed(seed)
    unique_values = df[split_column].unique().tolist()
    random.shuffle(unique_values)
    val_start = int(len(unique_values) * (1 - val_part - test_part))
    test_start = int(len(unique_values) * (1 - test_part))
    train_ids = unique_values[:val_start]
    val_ids = unique_values[val_start: test_start]
    test_ids = unique_values[test_start:]

    return (
        df[df[split_column].isin(train_ids)],
        df[df[split_column].isin(val_ids)],
        df[df[split_column].isin(test_ids)],
    )



def train_lightgbm(ranking_df: pd.DataFrame, features: list, model = None):
    train, val, test = train_val_test_split(ranking_df, REQUEST_ID, val_part=0.1, test_part=0.1)
    X_train, y_train, train_groups = train[features], train[TARGET], get_group_for_lgb(train[REQUEST_ID].values)
    X_val, y_val, val_groups = val[features], val[TARGET], get_group_for_lgb(val[REQUEST_ID].values)

    lgb_train = lightgbm.Dataset(
        X_train, y_train, categorical_feature=['category_id'],
        group=train_groups, free_raw_data=False
    )
    lgb_eval = lightgbm.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=['category_id'],
        group=val_groups, free_raw_data=False
    )


    model = lightgbm.train(
        {'objective': 'binary', 'metric': 'map', 'eval_at': [8, 30], 'learning_rate': 0.05},
        lgb_train,
        valid_sets=[lgb_eval],
        callbacks=[
            early_stopping(50),
            lightgbm.print_evaluation(10)
        ],
        init_model=model
    )
    
    test['prediction'] = model.predict(test[features])
    print(f'map@30 on holdout test: {mapk_df(test, 30)}')
    

    return model

In [15]:
!pip install pyarrow 

[0m

In [16]:
!ls

__notebook_source__.ipynb  lgbm_baseline.csv


In [17]:
path_to_data = '../input/ml-1-shad-fall-2022/'
train = pd.read_csv("../input/shad-distributed-items/items_train_0.pq")
#test = pd.read_parquet(path_to_data + 'items_test.pq')

In [18]:
test = pd.read_csv("../input/shad-distributed-items/items_test_0.pq")

In [19]:
test.shape

(136236, 46)

In [20]:
train.shape

(96213, 49)

In [21]:
request_ids = train.request_id.unique()
len(request_ids)

321

In [22]:
features = [
       'category_id','model_a_score',
       'model_a_rank', 'model_b_score', 'model_b_rank', 'model_b_f1',
       'model_b_f2', 'model_b_f3', 'model_c_score', 'model_c_rank',
       'model_d_score', 'model_d_rank', 'shows_count', 'microcategory_id',
       'location_id', 'price', 'sort_age', 'start_age', 'latitude',
       'longitude', 'item_stats.days',
       'item_stats.contact_conversion', 'item_stats.hides', 'item_stats.shows',
       'item_stats.shows_last_day', 'item_stats.shows_rec',
       'item_stats.shows_rec_last_day', 'item_stats.contacts',
       'item_stats.contacts_last_day', 'item_stats.contacts_rec_last_day',
       'item_stats.contacts_rec', 'item_stats.last_contacts',
       'item_stats.delayed_contacts', 'item_stats.delayed_contacts_rec',
       'item_stats.clicks', 'item_stats.clicks_rec', 'item_stats.last_clicks',
       'item_stats.long_clicks', 'item_stats.clicks_last_day',
       'item_stats.clicks_rec_last_day',
       'item_stats.segment_contact_conversion',
       'item_stats.seller_contact_conversion',
       'item_stats.segment_click_conversion',
       'item_stats.seller_click_conversion',
]

In [23]:
from tqdm.auto import trange
model = None
for i in trange(100):
    train = pd.read_csv(f"../input/shad-distributed-items/items_train_{i}.pq")
    model = train_lightgbm(train, features, model)
    del train

  0%|          | 0/100 [00:00<?, ?it/s]



[LightGBM] [Info] Number of positive: 395, number of negative: 76318
[LightGBM] [Info] Total groups: 256, total data: 76713
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9665
[LightGBM] [Info] Number of data points in the train set: 76713, number of used features: 44
[LightGBM] [Info] Total groups: 32, total data: 9600
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005149 -> initscore=-5.263778
[LightGBM] [Info] Start training from score -5.263778
Training until validation scores don't improve for 50 rounds
[10]	valid_0's map@8: 0.0640625	valid_0's map@30: 0.0808295




[20]	valid_0's map@8: 0.0679687	valid_0's map@30: 0.0857899
[30]	valid_0's map@8: 0.0771949	valid_0's map@30: 0.0941472
[40]	valid_0's map@8: 0.0729167	valid_0's map@30: 0.0907528
[50]	valid_0's map@8: 0.0746528	valid_0's map@30: 0.0909521
Early stopping, best iteration is:
[2]	valid_0's map@8: 0.0985243	valid_0's map@30: 0.105518
map@30 on holdout test: 0.058542229920397794


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[LightGBM] [Info] Number of positive: 412, number of negative: 76514
[LightGBM] [Info] Total groups: 257, total data: 76926
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9681
[LightGBM] [Info] Number of data points in the train set: 76926, number of used features: 44
[LightGBM] [Info] Total groups: 32, total data: 9600
Training until validation scores don't improve for 50 rounds
[10]	valid_0's map@8: 0.0837674	valid_0's map@30: 0.0954241
[20]	valid_0's map@8: 0.0835193	valid_0's map@30: 0.0903753
[30]	valid_0's map@8: 0.0802083	valid_0's map@30: 0.0890968
[40]	valid_0's map@8: 0.09375	valid_0's map@30: 0.101664
[50]	valid_0's map@8: 0.0755208	valid_0's map@30: 0.0848568
Early stopping, best iteration is:
[3]	valid_0's map@8: 0.105035	valid_0's map@30: 0.117573
map@30 on holdout test: 0.04825171010433159
[LightGBM] [Info] Number of positive: 389, number of negative: 76450
[LightGBM] [Info] Total groups: 257, total data: 76839
You can set `force_c

In [24]:
def a_create_submission(filename, model, test, k: int = 30):
    test['pred'] = model.predict(test[model.feature_name()])
    test_sorted = test.sort_values(['request_id', 'pred'], ascending=[True, False])
    test_sorted['rank'] = test_sorted.groupby('request_id').cumcount()
    top = test_sorted[test_sorted['rank'] < k]
    subm = top[['request_id', 'item_id']].groupby('request_id')['item_id'].apply(lambda x: ' '.join(map(str, x)))
    subm_df = subm.to_frame().reset_index().rename(columns={'item_id': 'item_ids'})
    subm_df.to_csv(filename, index=False)

In [25]:
from tqdm.auto import trange

def create_submission(filename, model, k: int = 30):
    datasets = []
    for i in trange(100):
        test = pd.read_csv(f'../input/shad-distributed-items/items_test_{i}.pq')
        test['pred'] = model.predict(test[model.feature_name()])
        test_sorted = test.sort_values(['request_id', 'pred'], ascending=[True, False])
        test_sorted['rank'] = test_sorted.groupby('request_id').cumcount()
        top = test_sorted[test_sorted['rank'] < k]
        subm = top[['request_id', 'item_id']].groupby('request_id')['item_id'].apply(lambda x: ' '.join(map(str, set(x))))
        subm_df = subm.to_frame().reset_index().rename(columns={'item_id': 'item_ids'})
        datasets.append(subm_df)
    subm_df = pd.concat(datasets)
    subm_df = subm_df.drop_duplicates(subset=['request_id'])
    subm_df.to_csv(filename, index=False)

In [26]:
create_submission('lgbm_baseline.csv', model)

  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
df0 = pd.read_csv('./lgbm_baseline.csv')
df0

Unnamed: 0,request_id,item_ids
0,1,14909188 21154439 10503565 22208527 14905618 1...
1,3,1173762 22573702 21815432 25524497 14834066 24...
2,7,4273024 374785 23010567 20056457 24890387 2232...
3,9,25420929 23960834 16055297 23332229 25037962 2...
4,12,24649856 884098 446470 14637063 672532 1046338...
...,...,...
45407,147890,232321 20750723 12809867 3198093 3287310 24979...
45408,147891,20557571 17501957 1453704 3829513 22497674 253...
45409,147893,24910084 24712842 15972113 12855956 23672730 1...
45410,147897,10543496 20783624 12462736 4835472 6980888 235...


In [40]:
df0 = df0.drop_duplicates(subset='request_id')
df0

Unnamed: 0,request_id,item_ids
0,1,23862020 21206930 14905618 10731542 7171995 20...
1,3,1173762 24940291 20913027 21815432 25524497 23...
2,7,374785 6451201 21166341 12902918 25127558 2232...
3,9,2942981 23332229 19211273 25037962 6466315 176...
4,12,16862852 446470 21992975 672532 16912026 70674...
...,...,...
45503,147890,232321 11067267 25565317 17219849 478220 15911...
45504,147891,7963010 20557571 1453704 12121357 14939665 991...
45505,147893,24712842 22197521 21728919 16091033 13846299 1...
45506,147897,20783624 6781452 24594451 15921182 5660191 194...


In [38]:
len(df0['request_id'].unique())

45412