In [1]:
# Import the required libraries and load the input data
import pandas as pd
import numpy as np

# Identifier columns get explicit dtypes so they are parsed the same way in every file.
yado = pd.read_csv("in/yado.csv", dtype=dict(yad_no=int))
train_log = pd.read_csv(
    "in/train_log.csv",
    dtype=dict(session_id=str, seq_no=int, yad_no=int),
)
train_label = pd.read_csv(
    "in/train_label.csv",
    dtype=dict(session_id=str, yad_no=int),
)
test_log = pd.read_csv(
    "in/test_log.csv",
    dtype=dict(session_id=str, seq_no=int, yad_no=int),
)
test_session = pd.read_csv("in/test_session.csv", dtype=dict(session_id=str))

### パラメータ

In [2]:
# How much a candidate's score is reduced when its lodging class differs from the
# class being searched for (used by sort_yado_with_type): DECREASE_RATE for the
# near-match cases, DECREASE_RATE2 for every other mismatch.
DECREASE_RATE = 0.95
DECREASE_RATE2 = 0.65

# Weights for scores that come from lodgings viewed before the last one:
# SECOND_WEIGHT scales the D[v][r] contributions of earlier views,
# SECOND_WEIGHT2 scales the sml_cd transition values of the second-to-last view.
SECOND_WEIGHT = 0.5
SECOND_WEIGHT2 = 0.8

# Exponents applied to the rates: POW1 to rate1, POW2 to rate2.
POW1 = 0.56
POW2 = 0.56

# Weight applied to the D2 contributions.
D2_WEIGHT = 1.0

# Exponent used by prod_weight().
PROD_WEIGHT = 1.2

# Constant added to every count stored in each_counts.
BASE_POINT = 2

### train_log・train_label・test_log の間で出現数に大きな差がある宿を見つけたい

- train と test で時間がずれているので、その部分の考慮が必要
  - 例えば、yad_no=3338はtestの時にはほとんど人気がない
  - trainでは上位がディズニー >> USJ なのに対して test ではディズニー < USJ になっている
- log と label での考察
  - log で人気でも label では不人気なものがあるかもしれない
  - 例えばディズニーホテルなど。埋まってて予約できないというケース

これらのケースに当てはまるものは思い切って解答から外してしまう、というアイデアが考えられる。

In [3]:
# Row counts of the two logs; their ratio is used later to normalise rate2.
train_size = len(train_log)
test_size = len(test_log)
print(train_size, test_size)

419270 250305


In [4]:
def _count_with_yado_info(yad_nos):
    """Count how often each yad_no occurs and left-join the yado attributes onto the counts."""
    counted = yad_nos.value_counts().reset_index()
    return counted.merge(yado, how="left", on="yad_no")


# Occurrence counts per lodging in the training log, the training labels and the test log.
train_yado = _count_with_yado_info(train_log["yad_no"])
train_label_yado = _count_with_yado_info(train_label["yad_no"])
test_yado = _count_with_yado_info(test_log["yad_no"])

In [5]:
# yad_no -> occurrence count in each source (the list index is the yad_no).
# BASE_POINT is added to every count so that later ratios never divide by zero.
# (Author's open question: would a larger offset, e.g. +10, work better?)
each_counts = [
    dict(yad_no=yad_no, train_log=BASE_POINT, train_label=BASE_POINT, test_log=BASE_POINT)
    for yad_no in range(len(yado) + 1)
]

count_sources = (
    ("train_log", train_yado),
    ("train_label", train_label_yado),
    ("test_log", test_yado),
)
for field, counted in count_sources:
    # The first two columns of each frame hold (yad_no, count).
    pairs = zip(counted.iloc[:, 0].tolist(), counted.iloc[:, 1].tolist())
    for yad_no, count in pairs:
        each_counts[yad_no][field] = count + BASE_POINT

In [6]:
for record in each_counts:
    denominator = max(1, record['train_log'])

    # rate1: how often the lodging is reserved relative to how often it appears in the log
    record['rate1'] = record['train_label'] / denominator
    # rate2: change in log appearances from train to test, corrected for the log sizes
    record['rate2'] = record['test_log'] / denominator * (train_size / test_size)

In [7]:
# Display the per-yad_no count/rate records built above (the whole list, so the output is long).
each_counts

[{'yad_no': 0,
  'train_log': 2,
  'train_label': 2,
  'test_log': 2,
  'rate1': 1.0,
  'rate2': 1.6750364555242605},
 {'yad_no': 1,
  'train_log': 2,
  'train_label': 3,
  'test_log': 32,
  'rate1': 1.5,
  'rate2': 26.800583288388168},
 {'yad_no': 2,
  'train_log': 26,
  'train_label': 16,
  'test_log': 7,
  'rate1': 0.6153846153846154,
  'rate2': 0.4509713534103778},
 {'yad_no': 3,
  'train_log': 182,
  'train_label': 96,
  'test_log': 32,
  'rate1': 0.5274725274725275,
  'rate2': 0.2945119042680019},
 {'yad_no': 4,
  'train_log': 52,
  'train_label': 36,
  'test_log': 19,
  'rate1': 0.6923076923076923,
  'rate2': 0.6120325510569413},
 {'yad_no': 5,
  'train_log': 32,
  'train_label': 22,
  'test_log': 2,
  'rate1': 0.6875,
  'rate2': 0.10468977847026628},
 {'yad_no': 6,
  'train_log': 4,
  'train_label': 3,
  'test_log': 5,
  'rate1': 0.75,
  'rate2': 2.0937955694053256},
 {'yad_no': 7,
  'train_log': 24,
  'train_label': 4,
  'test_log': 2,
  'rate1': 0.16666666666666666,
  'rate2'

In [8]:
# Lodgings ordered by rate2 ascending, i.e. those whose log appearances
# dropped the most from train to test come first.
decrease_yado = list(each_counts)
decrease_yado.sort(key=lambda record: record['rate2'])

## 宿のタイプ：ビジネスと観光

In [9]:
yad_type = pd.read_csv("data/yado_with_count.csv")

# Attach the lodging class to each_counts.
# Columns are addressed by position (1 = yad_no, 14 = class label) through .iloc:
# integer keys on a label-indexed row (row[1], row[14]) are deprecated in pandas
# and trigger a FutureWarning.
yad_class_pairs = zip(yad_type.iloc[:, 1].tolist(), yad_type.iloc[:, 14].tolist())
for yad_no, yad_class in yad_class_pairs:
    each_counts[yad_no]['yad_class'] = yad_class

each_counts

  yad_no, yad_class = row[1], row[14]


[{'yad_no': 0,
  'train_log': 2,
  'train_label': 2,
  'test_log': 2,
  'rate1': 1.0,
  'rate2': 1.6750364555242605},
 {'yad_no': 1,
  'train_log': 2,
  'train_label': 3,
  'test_log': 32,
  'rate1': 1.5,
  'rate2': 26.800583288388168,
  'yad_class': 'ビジネス'},
 {'yad_no': 2,
  'train_log': 26,
  'train_label': 16,
  'test_log': 7,
  'rate1': 0.6153846153846154,
  'rate2': 0.4509713534103778,
  'yad_class': 'ビジネスミドル'},
 {'yad_no': 3,
  'train_log': 182,
  'train_label': 96,
  'test_log': 32,
  'rate1': 0.5274725274725275,
  'rate2': 0.2945119042680019,
  'yad_class': '観光'},
 {'yad_no': 4,
  'train_log': 52,
  'train_label': 36,
  'test_log': 19,
  'rate1': 0.6923076923076923,
  'rate2': 0.6120325510569413,
  'yad_class': '観光'},
 {'yad_no': 5,
  'train_log': 32,
  'train_label': 22,
  'test_log': 2,
  'rate1': 0.6875,
  'rate2': 0.10468977847026628,
  'yad_class': '観光'},
 {'yad_no': 6,
  'train_log': 4,
  'train_label': 3,
  'test_log': 5,
  'rate1': 0.75,
  'rate2': 2.0937955694053256,
 

# Code

In [10]:
from collections import defaultdict
from heapq import heappush, heappop

### D[v][r] の計算

In [11]:
# データの加工

# Lookup from a training session_id to the yad_no that was actually reserved.
# It stays a defaultdict(int), so an unknown session_id resolves to 0.
map_reserved = defaultdict(int)
# Each row of train_label is a (session_id, yad_no) pair.
map_reserved.update(train_label.itertuples(index=False, name=None))

# Convert a long-format session log into a dict of session_id -> [viewed yad_no, ...]
def Make_session_list(session_log):
  """Group the viewed lodgings of a session log by session.

  Args:
    session_log: DataFrame whose column 0 is the session id and whose
      column 2 is the viewed yad_no (columns are read by position).

  Returns:
    defaultdict(list) mapping each session id to its yad_no values in
    row order. Looking up an unknown session id yields an empty list.
  """
  map_session_yads = defaultdict(list)
  # Read the two columns positionally via .iloc; integer keys on a
  # label-indexed row (row[0], row[2]) are deprecated in pandas.
  session_ids = session_log.iloc[:, 0].tolist()
  yad_nos = session_log.iloc[:, 2].tolist()
  for session_id, yad_no in zip(session_ids, yad_nos):
    map_session_yads[session_id].append(yad_no)
  return map_session_yads

map_session_yads_train = Make_session_list(train_log)
map_session_yads_test = Make_session_list(test_log)

# D[v][r]:  weighted number of sessions whose last viewed lodging is v and whose reservation is r.
# D2[v][r]: the same for the second-to-last viewed lodging v (sessions with a single view are
#           skipped, as are sessions where that lodging is the reserved one).
# Every session contributes rate2(r) ** POW2 instead of a plain count of 1.
D = defaultdict(lambda: defaultdict(float))
D2 = defaultdict(lambda: defaultdict(float))
for session_id, viewed_yad_no in map_session_yads_train.items():
  last_viewed = viewed_yad_no[-1]
  reserved = map_reserved[session_id]
  session_weight = each_counts[reserved]['rate2'] ** POW2
  D[last_viewed][reserved] += session_weight

  if len(viewed_yad_no) != 1:
    last_viewed2 = viewed_yad_no[-2]
    if last_viewed2 != reserved:
      D2[last_viewed2][reserved] += session_weight


  session_id = row[0]
  yad_no = row[2]


### 各エリアでの人気な宿TOP10列挙

`test01.ipynb` では train_log における人気度を用いたが、これよりも test_log における人気度を用いた方が良いと思われる。
しかし、「閲覧されるものの予約されにくいもの」というものもある可能性があるため、ここでは重み付けに rate1 を利用する。

In [12]:
# yado_cnt[yad_no]: popularity in test_log, where every view counts as rate1 ** POW1.
# NOTE(review): the fixed size assumes every yad_no is below 15000.
yado_cnt = [0] * 15000

# Columns are read by position through .iloc; integer keys on a label-indexed row
# (row[2], row[0], row[12]) are deprecated in pandas and raise a FutureWarning.
for yad_no in test_log.iloc[:, 2].tolist():
    yado_cnt[yad_no] += each_counts[yad_no]['rate1'] ** POW1

# Column 0 is the lodging id; column 12 is the area code used as key below
# (presumably sml_cd, since the keys come from yado["sml_cd"]).
sml_yad_nos = yado.iloc[:, 0].tolist()
sml_codes = yado.iloc[:, 12].tolist()

# sml_cd -> lodgings in that area, and lodging -> its sml_cd.
sml_to_yado = {
    sml_cd: []
    for sml_cd in yado["sml_cd"]
}
yado_to_sml = dict(zip(sml_yad_nos, sml_codes))
for yad_no, sml_cd in zip(sml_yad_nos, sml_codes):
    sml_to_yado[sml_cd].append(yad_no)

  yad_no = row[2]
  row[0]: row[12]
  yad_no = row[0]
  sml_cd = row[12]


In [13]:
# Keep only the 50 highest-scoring lodgings of every small area, best first.
for sml_cd, members in sml_to_yado.items():
    ranked_members = sorted(members, key=yado_cnt.__getitem__, reverse=True)
    sml_to_yado[sml_cd] = ranked_members[:50]

In [14]:
# Columns are read by position through .iloc; integer keys on a label-indexed row
# (row[0], row[11]) are deprecated in pandas and raise a FutureWarning.
# Column 0 is the lodging id; column 11 is the area code used as key below
# (presumably lrg_cd, since the keys come from yado["lrg_cd"]).
lrg_yad_nos = yado.iloc[:, 0].tolist()
lrg_codes = yado.iloc[:, 11].tolist()

# lrg_cd -> lodgings in that area, and lodging -> its lrg_cd.
lrg_to_yado = {
    lrg_cd: []
    for lrg_cd in yado["lrg_cd"]
}
yado_to_lrg = dict(zip(lrg_yad_nos, lrg_codes))
for yad_no, lrg_cd in zip(lrg_yad_nos, lrg_codes):
    lrg_to_yado[lrg_cd].append(yad_no)

# Keep only the 20 highest-scoring lodgings of every large area, best first.
for lrg_cd in lrg_to_yado.keys():
    lrg_to_yado[lrg_cd] = sorted(lrg_to_yado[lrg_cd], key=lambda x: yado_cnt[x], reverse=True)[:20]

  row[0]: row[11]
  yad_no = row[0]
  lrg_cd = row[11]


In [15]:
# Hard-coded yad_no values used as the very last fallback when a session still has
# fewer than 10 predictions after all other sources.
# NOTE(review): how these five were chosen is not shown in this notebook.
popular_yados = [
    719, 2201, 13468, 12350, 2797
]

### ビジネス-観光の分類を入れる

In [16]:
def sort_yado_with_type(search_yad_class, yad_no):
    """Sort key for a candidate lodging: its popularity, discounted by class mismatch.

    Args:
        search_yad_class: lodging class being searched for.
        yad_no: candidate lodging.

    Returns:
        yado_cnt[yad_no] unchanged when the classes are equal; multiplied by
        DECREASE_RATE when the candidate is the plain variant of a searched
        "ミドル" class or is itself a "ミドル" class; multiplied by
        DECREASE_RATE2 otherwise.
    """
    yad_class = each_counts[yad_no]['yad_class']
    popularity = yado_cnt[yad_no]

    if yad_class == search_yad_class:
        return popularity

    near_match = (
        (yad_class == "ビジネス" and search_yad_class == "ビジネスミドル")
        or (yad_class == "観光" and search_yad_class == "観光ミドル")
        or yad_class in ("ビジネスミドル", "観光ミドル")
    )
    penalty = DECREASE_RATE if near_match else DECREASE_RATE2
    return popularity * penalty

### sml_cd での遷移関係

In [17]:
# For every sml_cd, count how many training sessions ended with a view in that area.
# The area codes are read positionally via .iloc (column 12); integer keys on a
# label-indexed row (row[12]) are deprecated in pandas and raise a FutureWarning.
last_viewed_count_sml = dict.fromkeys(yado.iloc[:, 12].tolist(), 0)
for viewed_yad_no in map_session_yads_train.values():
    last_sml_cd = yado_to_sml[viewed_yad_no[-1]]
    last_viewed_count_sml[last_sml_cd] += 1

# E[a][b]: fraction of the sessions whose last view was in sml_cd 'a'
# that ended with a reservation in sml_cd 'b'.
E = defaultdict(lambda: defaultdict(float))
for session_id, viewed_yad_no in map_session_yads_train.items():
    last_sml_cd = yado_to_sml[viewed_yad_no[-1]]
    reserved_sml_cd = yado_to_sml[map_reserved[session_id]]
    E[last_sml_cd][reserved_sml_cd] += 1 / last_viewed_count_sml[last_sml_cd]

# -> E[a][b] is the transition probability we want.

  row[12]: 0


In [18]:
# Display the sml_cd -> sml_cd transition table built above.
E

defaultdict(<function __main__.<lambda>()>,
            {'4044dac1931ddaa5a967e09506d76343': defaultdict(float,
                         {'4044dac1931ddaa5a967e09506d76343': 0.9385113268608487,
                          'af003d74af66f40a2511d2468db15fe0': 0.003236245954692557,
                          '284ab43612caf399fb30093363f92a52': 0.029126213592233025,
                          '20ad8785a30f125bee5a8a325782ab06': 0.004854368932038835,
                          '8a3f01bdf9eb39e9ce6f4780590a51cd': 0.0016181229773462784,
                          '33bfe292401fc7f99b8b9831a71f61ee': 0.003236245954692557,
                          'e2f51242791849e72240784844876b89': 0.004854368932038835,
                          'bb5a43cfd329c8cc725a8abf2651f537': 0.008090614886731393,
                          '199073cb3739d73a10d0d96f0b4f8555': 0.003236245954692557,
                          'ba9e619c37ec3475db6b5e1c3426289c': 0.0016181229773462784,
                          '90a34785b0654fabbea6c

### 解答作成！

In [19]:
# Predict[i] holds the 10 predicted yad_no for the i-th row of test_session; slots start as 0.
Predict = [ [0] * 10 for _ in range(len(test_session))]

def prod_weight(prod):
    """Return prod raised to the power PROD_WEIGHT."""
    return pow(prod, PROD_WEIGHT)

def business_weight(yad, last_viewed):
    """Score multiplier for candidate `yad` given the last viewed lodging.

    Currently a constant 1.0; both arguments are unused.
    """
    return 1.0

# Candidate selection from D for sessions of length 1
def get_for_single_session(idx, last_viewed, rank):
    """Fill Predict[idx] from D[last_viewed], highest score first.

    Candidates are ordered by descending score (ties by ascending yad_no) and
    written into Predict[idx] starting at position `rank`; values already
    present in Predict[idx] are skipped. Stops when 10 slots are used.

    Returns:
        The next free position in Predict[idx] (at most 10 when filling ran).
    """
    ranked = sorted(
        (-viewed_cnt * business_weight(yad_no, last_viewed), yad_no)
        for yad_no, viewed_cnt in D[last_viewed].items()
    )

    slots = Predict[idx]
    for _, predicted_yad_no in ranked:
        if rank >= 10:
            break
        if predicted_yad_no in slots:
            continue
        slots[rank] = predicted_yad_no
        rank += 1

    return rank

# Candidate selection from D / D2 for sessions of length 2
def get_for_twice_session(idx, last_viewed, last_viewed2, rank):
    """Fill Predict[idx] using the last and the second-to-last viewed lodgings.

    Score of a candidate = D[last_viewed] value
      + SECOND_WEIGHT * D[last_viewed2] value
      + D2_WEIGHT * D2[last_viewed2] value.
    Candidates are written in descending score order (ties by ascending
    yad_no) starting at position `rank`, skipping values already present.

    Returns:
        The next free position in Predict[idx].
    """
    yad_score = dict(D[last_viewed])
    weighted_sources = (
        (D[last_viewed2], SECOND_WEIGHT),
        (D2[last_viewed2], D2_WEIGHT),
    )
    for source, weight in weighted_sources:
        for yad_no, viewed_cnt in source.items():
            contribution = viewed_cnt * weight
            if yad_no in yad_score:
                yad_score[yad_no] = yad_score[yad_no] + contribution
            else:
                yad_score[yad_no] = contribution

    ranked = sorted((-total, yad_no) for yad_no, total in yad_score.items())

    slots = Predict[idx]
    for _, predicted_yad_no in ranked:
        if rank >= 10:
            break
        if predicted_yad_no in slots:
            continue
        slots[rank] = predicted_yad_no
        rank += 1

    return rank

# Candidate selection from D / D2 for sessions of length 3 or more
def get_for_multiple_session(idx, last_viewed, last_viewed2, last_viewed3, rank):
    """Fill Predict[idx] using the last three viewed lodgings.

    Score of a candidate = D[last_viewed] value
      + SECOND_WEIGHT * D[last_viewed2] value
      + D2_WEIGHT * D2[last_viewed2] value
      + SECOND_WEIGHT * D[last_viewed3] value.
    Candidates are written in descending score order (ties by ascending
    yad_no) starting at position `rank`, skipping values already present.

    Returns:
        The next free position in Predict[idx].
    """
    yad_score = dict(D[last_viewed])
    weighted_sources = (
        (D[last_viewed2], SECOND_WEIGHT),
        (D2[last_viewed2], D2_WEIGHT),
        (D[last_viewed3], SECOND_WEIGHT),
    )
    for source, weight in weighted_sources:
        for yad_no, viewed_cnt in source.items():
            contribution = viewed_cnt * weight
            if yad_no in yad_score:
                yad_score[yad_no] = yad_score[yad_no] + contribution
            else:
                yad_score[yad_no] = contribution

    ranked = sorted((-total, yad_no) for yad_no, total in yad_score.items())

    slots = Predict[idx]
    for _, predicted_yad_no in ranked:
        if rank >= 10:
            break
        if predicted_yad_no in slots:
            continue
        slots[rank] = predicted_yad_no
        rank += 1

    return rank


# Area-based selection from E for sessions of length 1:
# take the 3 most likely destination areas and weight their lodgings by the E value.
def get_from_sml_single(idx, last_viewed, rank):
    """Fill Predict[idx] with popular lodgings of the likeliest destination areas.

    The 3 areas with the highest E[sml_cd of last_viewed] value are taken
    (ties by ascending sml_cd). For each, the first 10 lodgings of
    sml_to_yado are scored as yado_cnt * prod_weight(E value), then
    multiplied by business_weight. Candidates are written in descending
    score order (ties by ascending yad_no) starting at position `rank`,
    skipping values already present.

    Returns:
        The next free position in Predict[idx].
    """
    viewed_sml_cd = yado_to_sml[last_viewed]
    top_areas = sorted(
        (-viewed_prod, sml_cd) for sml_cd, viewed_prod in E[viewed_sml_cd].items()
    )[:3]

    score = {}
    for negated_prod, sml_cd in top_areas:
        prod = -negated_prod
        # Weight the area's popular lodgings by the transition value.
        for yad_no in sml_to_yado[sml_cd][:10]:
            score[yad_no] = yado_cnt[yad_no] * prod_weight(prod)

    ranked = sorted(
        (-value * business_weight(yad_no, last_viewed), yad_no)
        for yad_no, value in score.items()
    )

    slots = Predict[idx]
    for _, predicted_yad_no in ranked:
        if rank >= 10:
            break
        if predicted_yad_no in slots:
            continue
        slots[rank] = predicted_yad_no
        rank += 1

    return rank

# Area-based selection from E for sessions of length 2 or more:
# take the 3 most likely destination areas and weight their lodgings by the area score.
def get_from_sml_multiple(idx, last_viewed, last_viewed2, rank):
    """Fill Predict[idx] with popular lodgings of the likeliest destination areas.

    Area score = E value from the last viewed lodging's sml_cd
      + prod_weight(SECOND_WEIGHT2 * E value from the second-to-last one's sml_cd).
    The 3 best areas are taken (ties by ascending sml_cd). For each, the
    first 10 lodgings of sml_to_yado are scored as
    yado_cnt * prod_weight(area score). Candidates are written in descending
    score order (ties by ascending yad_no) starting at position `rank`,
    skipping values already present.

    Returns:
        The next free position in Predict[idx].
    """
    viewed_sml_cd, viewed_sml_cd2 = yado_to_sml[last_viewed], yado_to_sml[last_viewed2]

    sml_score = dict(E[viewed_sml_cd])
    for sml_cd, viewed_prod in E[viewed_sml_cd2].items():
        contribution = prod_weight(viewed_prod * SECOND_WEIGHT2)
        if sml_cd in sml_score:
            sml_score[sml_cd] = sml_score[sml_cd] + contribution
        else:
            sml_score[sml_cd] = contribution

    top_areas = sorted((-value, sml_cd) for sml_cd, value in sml_score.items())[:3]

    score = {}
    for negated_prod, sml_cd in top_areas:
        prod = -negated_prod
        # Weight the area's popular lodgings by the area score.
        for yad_no in sml_to_yado[sml_cd][:10]:
            score[yad_no] = yado_cnt[yad_no] * prod_weight(prod)

    ranked = sorted((-value, yad_no) for yad_no, value in score.items())

    slots = Predict[idx]
    for _, predicted_yad_no in ranked:
        if rank >= 10:
            break
        if predicted_yad_no in slots:
            continue
        slots[rank] = predicted_yad_no
        rank += 1

    return rank

# Main processing: build the 10 predictions for every test session and write the submission.

def _fill_from(slots, candidates, rank):
    """Write candidates not yet in `slots` into it from position `rank`, up to 10 slots; return the new rank."""
    for yad_no in candidates:
        if rank >= 10:
            break
        if yad_no in slots:
            continue
        slots[rank] = yad_no
        rank += 1
    return rank


not_enough = 0
for idx, session_id in enumerate(test_session["session_id"]):
    viewed_list = map_session_yads_test[session_id]
    last_viewed = viewed_list[-1]
    slots = Predict[idx]
    rank = 0

    # 1) Lodgings viewed earlier in the session (everything except the last viewed one),
    #    de-duplicated in first-seen order. Capped at the 10 available slots so that a
    #    session with more than 10 distinct earlier lodgings cannot index past the row.
    for item in dict.fromkeys(viewed_list):
        if item == last_viewed:
            continue
        if rank >= 10:
            break
        slots[rank] = item
        rank += 1

    # 2) Co-occurrence candidates (D / D2), then area-transition candidates (E).
    if len(viewed_list) == 1:
        rank = get_for_single_session(idx, last_viewed, rank)
        rank = get_from_sml_single(idx, last_viewed, rank)
    elif len(viewed_list) == 2:
        rank = get_for_twice_session(idx, last_viewed, viewed_list[-2], rank)
        rank = get_from_sml_multiple(idx, last_viewed, viewed_list[-2], rank)
    else:
        rank = get_for_multiple_session(idx, last_viewed, viewed_list[-2], viewed_list[-3], rank)
        rank = get_from_sml_multiple(idx, last_viewed, viewed_list[-2], rank)

    if rank == 10:
        continue

    # Number of slots that the sources above could not fill.
    not_enough += 10 - rank

    # 3) Still short: popular lodgings of the same lrg_cd, preferring the class of the
    #    last viewed lodging (room for further tuning).
    lrg_cd = yado_to_lrg[last_viewed]
    last_class = each_counts[last_viewed]['yad_class']
    top_yados = sorted(
        lrg_to_yado[lrg_cd],
        key=lambda x: sort_yado_with_type(last_class, x),
        reverse=True,
    )
    rank = _fill_from(slots, top_yados, rank)

    # 4) Still short (unclear when this happens): fall back to the global popular list.
    rank = _fill_from(slots, popular_yados, rank)


print(not_enough)

df_submit3 = pd.DataFrame(Predict, columns=[f"predict_{i}" for i in range(10)])
df_submit3.to_csv("out/last_sub.csv", index=False)

599
