In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from scipy.sparse import csr_matrix
import os

In [2]:
BASE_PATH = '/content/drive/MyDrive/data/atmaCup#16'

In [3]:
train_log_df = pd.read_csv(os.path.join(BASE_PATH, 'train_log.csv'))
yado_df = pd.read_csv(os.path.join(BASE_PATH, 'yado.csv'))
train_label_df = pd.read_csv(os.path.join(BASE_PATH, 'train_label.csv'))

test_session_df = pd.read_csv(os.path.join(BASE_PATH, 'test_session.csv'))
test_log_df = pd.read_csv(os.path.join(BASE_PATH, 'test_log.csv'))
sample_submission_df = pd.read_csv(os.path.join(BASE_PATH, 'sample_submission.csv'))


In [4]:
train_log_df['yad_idx'] = train_log_df['yad_no'] - 1
test_log_df['yad_idx'] = test_log_df['yad_no'] - 1
yado_df['yad_idx'] = yado_df['yad_no'] - 1
YADO_SIZE = len(yado_df)

In [5]:
sml_count_per_wid = yado_df.groupby('wid_cd')['sml_cd'].nunique()

sorted_wid_cd = sml_count_per_wid.sort_values(ascending=False).index

wid_cd_to_index = {wid_cd: index for index, wid_cd in enumerate(sorted_wid_cd)}

yado_df['wid_cd_index'] = yado_df['wid_cd'].map(wid_cd_to_index)

def hierarchical_indexing_with_wid_index(df, higher_level_column, lower_level_column):

    sorted_higher_level = df.sort_values(by='wid_cd_index')[higher_level_column].unique()

    lower_level_mapping = {}
    offset = 0
    for higher_level in sorted_higher_level:
        lower_levels = df[df[higher_level_column] == higher_level][lower_level_column].unique()
        for i, lower_level in enumerate(lower_levels):
            lower_level_mapping[lower_level] = offset + i
        offset += len(lower_levels)

    return df[lower_level_column].map(lower_level_mapping)

yado_df['ken_cd_index'] = hierarchical_indexing_with_wid_index(yado_df, 'wid_cd_index', 'ken_cd')
yado_df['lrg_cd_index'] = hierarchical_indexing_with_wid_index(yado_df, 'ken_cd', 'lrg_cd')
yado_df['sml_cd_index'] = hierarchical_indexing_with_wid_index(yado_df, 'lrg_cd', 'sml_cd')

yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_idx,wid_cd_index,ken_cd_index,lrg_cd_index,sml_cd_index
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c,0,0,0,0,0
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced,1,8,34,250,451
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,2,1,6,54,86
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9,3,1,7,41,94
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910,4,9,39,278,492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e,13801,4,25,179,294
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06,13802,3,19,114,213
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf,13803,8,37,265,459
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739,13804,6,32,233,406


In [6]:
vc = test_log_df['yad_no'].value_counts().reset_index()
vc.columns = ['yad_no', 'counts']

yado_df = yado_df.merge(vc, on='yad_no', how='left').fillna({'counts': 0})
yado_df['sml_rank'] = yado_df.groupby('sml_cd')['counts'].rank(method='min', ascending=False)
yado_df['lrg_rank'] = yado_df.groupby('lrg_cd')['counts'].rank(method='min', ascending=False)
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,...,lrg_cd,sml_cd,yad_idx,wid_cd_index,ken_cd_index,lrg_cd_index,sml_cd_index,counts,sml_rank,lrg_rank
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,...,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c,0,0,0,0,0,30.0,25.0,61.0
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,...,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced,1,8,34,250,451,5.0,32.0,32.0
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,...,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,2,1,6,54,86,30.0,3.0,46.0
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,...,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9,3,1,7,41,94,17.0,13.0,27.0
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,...,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910,4,9,39,278,492,0.0,14.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,...,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e,13801,4,25,179,294,0.0,8.0,30.0
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,...,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06,13802,3,19,114,213,1.0,23.0,162.0
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,...,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf,13803,8,37,265,459,39.0,7.0,7.0
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,...,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739,13804,6,32,233,406,2.0,5.0,29.0


In [7]:
sml_ranking_list = yado_df.sort_values(by=['sml_cd_index', 'sml_rank']).groupby('sml_cd_index')['yad_idx'].apply(list)

In [8]:
_train_label_df = train_label_df.rename(columns={'yad_no': 'target_yad_no'})
merged_df = train_log_df.merge(_train_label_df, on='session_id', how='left')
target_rows = merged_df[['session_id', 'target_yad_no']].drop_duplicates()
target_rows['seq_no'] = 100
target_rows.rename(columns={'target_yad_no': 'yad_no'}, inplace=True)
extended_train_log_df = pd.concat([train_log_df, target_rows], ignore_index=True).sort_values(by=['session_id', 'seq_no'])
extended_train_log_df = extended_train_log_df.merge(yado_df, on='yad_no', how='left')
extended_train_log_df = extended_train_log_df[['session_id', 'seq_no', 'yad_no', 'yad_type',
       'total_room_cnt', 'wireless_lan_flg', 'onsen_flg', 'kd_stn_5min',
       'kd_bch_5min', 'kd_slp_5min', 'kd_conv_walk_5min', 'wid_cd_index', 'ken_cd_index', 'lrg_cd_index',
       'sml_cd_index']]
extended_train_log_df['yad_idx'] = extended_train_log_df['yad_no'] - 1
extended_train_log_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extended_train_log_df['yad_idx'] = extended_train_log_df['yad_no'] - 1


Unnamed: 0,session_id,seq_no,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd_index,ken_cd_index,lrg_cd_index,sml_cd_index,yad_idx
0,000007603d533d30453cc45d0f3d119f,0,2395,0,113.0,1.0,0,,,,,3,19,117,236,2394
1,000007603d533d30453cc45d0f3d119f,100,4101,0,39.0,,0,,,,1.0,3,19,117,236,4100
2,0000ca043ed437a1472c9d1d154eb49b,0,13535,0,40.0,1.0,0,1.0,,,1.0,7,33,242,424,13534
3,0000ca043ed437a1472c9d1d154eb49b,100,8253,0,26.0,1.0,0,1.0,,,1.0,7,33,242,424,8252
4,0000d4835cf113316fe447e2f80ba1c8,0,123,0,17.0,1.0,0,,,,,1,6,57,98,122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707963,ffffcd5bc19d62cad5a3815c87818d83,2,12230,0,354.0,1.0,0,,,,1.0,10,45,290,506,12229
707964,ffffcd5bc19d62cad5a3815c87818d83,100,10619,0,,1.0,0,,,,1.0,10,45,290,506,10618
707965,fffffa7baf370083ebcdd98f26a7e31a,0,2439,0,81.0,1.0,0,1.0,,,1.0,1,6,54,86,2438
707966,fffffa7baf370083ebcdd98f26a7e31a,1,11822,0,161.0,1.0,0,1.0,,,1.0,1,6,54,86,11821


In [9]:
# 同一 session_id で最後の yad_no と最後から二つ前の yad_no が同じである session_id の取得
extended_train_log_df['prev_yad_no'] = extended_train_log_df.groupby('session_id')['yad_no'].shift(2)
exclude_session_ids = extended_train_log_df[extended_train_log_df['yad_no'] == extended_train_log_df['prev_yad_no']]['session_id'].unique()


In [10]:

# 正解labelの情報も使う
session_yad_pairs_ext = extended_train_log_df[['session_id', 'yad_idx', 'yad_no', 'seq_no']]
session_yad_pairs_ext['is_test'] = False
# test dataの情報も使う
session_yad_pairs_test = test_log_df[['session_id', 'yad_idx', 'yad_no', 'seq_no']]
session_yad_pairs_test['is_test'] = True
session_yad_pairs = pd.concat([session_yad_pairs_ext, session_yad_pairs_test])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session_yad_pairs_ext['is_test'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session_yad_pairs_test['is_test'] = True


In [11]:
from scipy.sparse import coo_matrix
from itertools import combinations

def create_co_visitation_matrix(df):
    row_indices = []
    col_indices = []
    data = []

    for session_id, group in df.groupby('session_id'):
        yad_indices = group['yad_idx'].tolist()
        is_tests = group['is_test'].tolist()
        seq_nos = group['seq_no'].tolist()

        for i in range(len(yad_indices)):
            # test重視
            if is_tests[i]:
                weight = 15  # test dataは重み15倍にする
            else:
                weight = 1

            label_weight = 1
            if i < len(seq_nos)-1:
                if seq_nos[i+1] == 100:
                    # 正解yadoへの遷移は重み1.2倍
                    label_weight = 1.2

            for j in range(i + 1, len(yad_indices)):
                row_indices.append(yad_indices[i])
                col_indices.append(yad_indices[j])
                data.append(weight*label_weight)

                row_indices.append(yad_indices[j])
                col_indices.append(yad_indices[i])
                data.append(weight)

    yad_size = df['yad_idx'].max() + 1
    co_visitation_matrix = coo_matrix((data, (row_indices, col_indices)), shape=(yad_size, yad_size))
    return co_visitation_matrix


In [12]:
def get_top_indices(indices, co_visitation_matrix):
    # 各 yad_idx の共訪問情報を集計
    total_row = np.zeros(YADO_SIZE)
    for idx in indices:
        row = co_visitation_matrix[idx]
        total_row += row

    # 0のデータは含まれないようにする
    indices_over_one = np.where(total_row > 0)[0]
    top_indices = indices_over_one[np.argsort(total_row[indices_over_one])][::-1]
    return top_indices

def get_recommendation_candidates(session_yad_indices, co_visitation_matrix):
    recommendations = []
    for yad_indices in session_yad_indices:
        # 最後の yad_idx
        last_yad_idx = yad_indices[-1] if len(yad_indices) > 0 else None

        # 最後から2つ目の yad_idx を確認
        second_last_idx = yad_indices[-2] if len(yad_indices) > 1 else None

        # セッション内のyadを整理
        session_prioritized = [yad for yad in yad_indices if yad != last_yad_idx]

        if second_last_idx:
            session_prioritized = [second_last_idx] + [yad for yad in session_prioritized if yad != second_last_idx]

        top_indices = get_top_indices(yad_indices, co_visitation_matrix)

        top_indices = [i for i in top_indices if i != last_yad_idx]

        # top_indicesから既に追加されたyadを除外
        top_indices = [i for i in top_indices if i not in session_prioritized]

        # session_prioritizedとtop_indicesを結合
        top_indices = session_prioritized + top_indices

        # 最後から2つ目の yad_idx があれば最優先の候補に含める
        if second_last_idx is not None:
            if second_last_idx in top_indices:
                top_indices.remove(second_last_idx)
            top_indices = [second_last_idx] + top_indices


        # どうやって埋めるか。
        if len(top_indices) < 10:
            total_row = np.zeros(YADO_SIZE)
            for idx in top_indices:
                row = co_visitation_matrix[idx]
                total_row += row
                indices_over_one = np.where(total_row > 0)[0]
                sub_top_indices = indices_over_one[np.argsort(total_row[indices_over_one])][::-1]
                sub_top_indices = [i for i in sub_top_indices if i not in top_indices]
                top_indices = top_indices + sub_top_indices
                if (len(top_indices)) > 9:
                    break

       # 候補が10に満たない場合の補完処理
        if len(top_indices) < 10:
            # top_indicesに含まれる最初のyadのsml_cd_indexを取得
            for i in range(len(top_indices)):
                sml_idx = yado_df[yado_df['yad_idx'] == top_indices[i]]['sml_cd_index'].values[0]

                # 同じsml_cd_indexのランキング上位から候補を選択
                for yad in sml_ranking_list[sml_idx][:5]:
                    if yad not in top_indices and len(top_indices) < 10:
                        top_indices.append(yad)


        if len(top_indices) < 10:
            top_indices += [-2] * (10 - len(top_indices))  # 候補が10個に満たない場合はダミーの値で埋める

        recommendations.append(top_indices[:10])
    return recommendations

In [13]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

# Validation

In [14]:
%%time
session_yad_co_visitation_matrix = create_co_visitation_matrix(session_yad_pairs)
co_visitation_matrix_csr = csr_matrix(session_yad_co_visitation_matrix)

co_visitation_matrix_arr = np.zeros((YADO_SIZE, YADO_SIZE))

for idx in range(YADO_SIZE):
    row = co_visitation_matrix_csr.getrow(idx).toarray()[0]
    co_visitation_matrix_arr[idx] = row

CPU times: user 58.8 s, sys: 3.85 s, total: 1min 2s
Wall time: 1min 6s


In [15]:
%%time

# 各 session_id の yad_idx のリストを取得
session_yad_indices = train_log_df.groupby('session_id')['yad_idx'].apply(list)

# レコメンデーション候補を取得
candidates_lists = get_recommendation_candidates(session_yad_indices, co_visitation_matrix_arr)

# 結果をデータフレームに格納（+1 で yad_no に変換）
predictions_df = pd.DataFrame(candidates_lists, columns=['predict_' + str(i) for i in range(10)], index=session_yad_indices.index)
predictions_df += 1

# 予測結果の表示
predictions_df

CPU times: user 43.5 s, sys: 402 ms, total: 43.9 s
Wall time: 44.5 s


Unnamed: 0_level_0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
000007603d533d30453cc45d0f3d119f,11882,2808,5289,4101,3324,12846,997,2395,5821,12837
0000ca043ed437a1472c9d1d154eb49b,8253,8747,3725,4488,2259,11104,1586,2570,3564,8225
0000d4835cf113316fe447e2f80ba1c8,9039,5238,13642,11724,7509,6722,4863,4355,1967,2957
0000fcda1ae1b2f431e55a7075d1f500,626,2272,755,11715,109,6830,7812,13296,13549,1341
000104bdffaaad1a1e0a9ebacf585f33,96,12491,902,1284,7749,5490,11380,3894,4072,9067
...,...,...,...,...,...,...,...,...,...,...
ffff2262d38abdeb247ebd591835dcc9,8747,13210,13079,2876,5719,10955,10522,3564,1092,10327
ffff2360540745117193ecadcdc06538,2900,8703,3940,6654,399,5299,4767,513,8465,963
ffff7fb4617164b2604aaf51c40bf82d,7308,4040,12240,7820,4398,2087,10364,7057,10421,9558
ffffcd5bc19d62cad5a3815c87818d83,10619,12500,570,3238,11091,12829,11316,9671,7551,10616


In [16]:
train_label = train_label_df.set_index("session_id").loc[predictions_df.index]["yad_no"].values
score = mapk(actual=train_label, predicted=predictions_df.values.tolist(), k=10)
score

0.4150302364627259

# Submission

In [17]:
%%time
# 各 session_id の yad_idx のリストを取得
session_yad_indices = test_log_df.groupby('session_id')['yad_idx'].apply(list)

# レコメンデーション候補を取得
candidates_lists = get_recommendation_candidates(session_yad_indices, co_visitation_matrix_arr)

# 結果をデータフレームに格納（+1 で yad_no に変換）
predictions_df = pd.DataFrame(candidates_lists, columns=['predict_' + str(i) for i in range(10)], index=session_yad_indices.index)
predictions_df += 1

# 予測結果の表示
predictions_df

CPU times: user 27.3 s, sys: 200 ms, total: 27.5 s
Wall time: 27.7 s


Unnamed: 0_level_0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00001149e9c73985425197104712478c,3560,4420,9534,11561,5466,4714,2680,6488,6563,5785
0000e02747d749a52b7736dfa751e258,143,613,8108,4066,11923,6129,7014,6555,11237,10095
0000f17ae2628237d78d3a38b009d3be,757,7710,9190,9910,1774,410,13570,10485,6721,3400
000174a6f7a569b84c5575760d2e9664,12341,6991,3359,13521,1542,10861,4180,10746,9319,2363
00017e2a527901c9c41b1acef525d016,2862,9020,763,10826,13235,1448,5650,607,11480,12029
...,...,...,...,...,...,...,...,...,...,...
fffee3199ef94b92283239cd5e3534fa,1997,7888,1885,11123,8771,5744,12942,7641,831,10997
ffff62c6bb49bc9c0fbcf08494a4869c,4014,1227,12432,899,3802,13220,3644,2232,2164,4962
ffff9a7dcc892875c7a8b821fa436228,13241,11037,13797,13719,2087,7308,12939,8143,5810,844
ffffb1d30300fe17f661941fd085b04b,3100,3002,2373,13672,4976,5513,1687,12281,6034,5515


In [18]:
predictions_df.to_csv('submission.csv', index=False)