In [None]:
# # train/val スプリット。channel で分割する。channel 間で共通の content もあるため、共通項が最小になるような channel で分ける
# topics_val = topics[["id", "channel"]]
# topics_val = topics_val.merge(
#     correlations, how="inner", left_on="id", right_on="topic_id"
# )

# channel_val = topics_val.groupby("channel").agg(list).reset_index()
# channel_val["content_ids"] = channel_val["content_ids"].apply(
#     lambda x: list(np.unique(np.concatenate([x_.split() for x_ in x])))
# )


# def iou(a, b):
#     return len(set(a).intersection(set(b))) / len(set(a + b))


# ious = np.zeros((len(channel_val), len(channel_val)))

# for i in range(len(channel_val)):
#     for j in range(i):
#         iou_ij = iou(channel_val["content_ids"][i], channel_val["content_ids"][j])
#         ious[i, j] = iou_ij
#         ious[j, i] = iou_ij

# G = nx.Graph(ious)

# components = [list(k) for k in nx.connected_components(G)]
# for i, c in enumerate(components):
#     print(f"Component #{i} of length {len(c)}")

# # 0.8:0.2 に分割。171 * 0.2 は約 34 なので、#23- を val として分ける
# channel_val["fold"] = 0
# channel_val.loc[sum(components[23:], []), "fold"] = 1

# print(len(channel_val[channel_val["fold"] == 0]))  # 137
# print(len(channel_val[channel_val["fold"] == 1]))  # 34

# topics_without_source = topics[topics["category"] != "source"]

# topics_pipeline_train = topics[
#     topics["channel"].isin(channel_val[channel_val["fold"] == 0]["channel"])
# ]
# topics_pipeline_val = topics_without_source[
#     topics_without_source["channel"].isin(
#         channel_val[channel_val["fold"] == 1]["channel"]
#     )
# ]

# topics_pipeline_train.to_csv("topics_pipeline_train.csv", index=False)
# topics_pipeline_val.to_csv("topics_pipeline_val.csv", index=False)

In [None]:
# Docker 上で実行するときのみ使用。Kaggle Notebooks 上で実行するときはコメントアウト
# !pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

In [None]:
!pip install ../input/sentence-transformers/sentence_transformers-2.2.2-py3-none-any.whl

In [None]:
import gc
import os

import networkx as nx
import numpy as np
import pandas as pd
import torch
from cuml.neighbors import NearestNeighbors
from imblearn.under_sampling import RandomUnderSampler
from scipy.optimize import minimize
from sentence_transformers import (
    CrossEncoder,
    InputExample,
    SentenceTransformer,
    losses,
)
from sentence_transformers.cross_encoder.evaluation import (
    CEBinaryClassificationEvaluator,
)
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from torch.utils.data import DataLoader

In [None]:
PATH_TO_INPUT = "../input/learning-equality-curriculum-recommendations"
PATH_TO_INPUT_VAL = "../input/learning-equality-curriculum-recommendations-val"
PATH_TO_OUTPUT = "."

TOPICS = "topics.csv"
TOPICS_PIPELINE_TRAIN = "topics_pipeline_train.csv"
TOPICS_PIPELINE_VAL = "topics_pipeline_val.csv"
CONTENT = "content.csv"
CORRELATIONS = "correlations.csv"
SAMPLE_SUBMISSION = "sample_submission.csv"

PATH_TO_RETRIEVER_WEIGHTS = "../input/all-minilm-l6-v2-retriever"
PATH_TO_RERANKER_WEIGHTS = "../input/all-minilm-l6-v2-reranker"

topics = pd.read_csv(os.path.join(PATH_TO_INPUT, TOPICS))
topics_df = pd.read_csv(os.path.join(PATH_TO_INPUT, TOPICS), index_col=0)
topics_pipeline_train = pd.read_csv(
    os.path.join(PATH_TO_INPUT_VAL, TOPICS_PIPELINE_TRAIN)
)
topics_pipeline_val = pd.read_csv(os.path.join(PATH_TO_INPUT_VAL, TOPICS_PIPELINE_VAL))
content = pd.read_csv(os.path.join(PATH_TO_INPUT, CONTENT))
correlations = pd.read_csv(os.path.join(PATH_TO_INPUT, CORRELATIONS))
sample_submission = pd.read_csv(os.path.join(PATH_TO_INPUT, SAMPLE_SUBMISSION))

In [None]:
class Topic:
    """Lightweight view of one row of the module-level ``topics_df`` frame.

    ``topics_df`` is read with the topic id as index label, and any of its
    columns can be read as an attribute of the instance (``topic.title`` ...).
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Return the parent ``Topic``, or ``None`` when the parent id is missing."""
        parent_id = topics_df.loc[self.id, "parent"]
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """Return the ancestors ordered from the direct parent upwards.

        Ancestors whose title is falsy (e.g. an empty string) are skipped.
        """
        ancestors = []
        parent = self.parent
        while parent is not None:
            if parent.title:
                ancestors.append(parent)
            parent = parent.parent
        return ancestors

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles from the top-most ancestor down to this topic.

        Args:
            separator: String placed between consecutive titles.
            include_self: Whether this topic's own title ends the trail.
            include_root: Whether the top-most entry of the trail is kept.

        Returns:
            The breadcrumb trail as a single string.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    def __getattr__(self, name):
        """Resolve unknown attributes as columns of ``topics_df`` for this topic.

        Raises:
            AttributeError: If ``name`` is not a column of ``topics_df``, is a
                dunder name, or is ``id`` itself (which means ``__init__`` has
                not run yet).
        """
        # ``__getattr__`` only runs when normal lookup failed. If ``id`` itself is
        # missing, reading ``self.id`` below would re-enter this method forever,
        # and dunder probes (copy/pickle protocols) must fail with AttributeError.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        if name not in topics_df.columns:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        # Direct (row, column) lookup avoids materialising the whole row.
        return topics_df.loc[self.id, name]

In [None]:
# Helper functions that build the text features.
def create_feature_for_topics(topics):
    """Build one text feature per topic that has content.

    The feature is the space-joined concatenation of, in this order: title,
    description, breadcrumbs, the descriptions of the direct children, and the
    description of the parent. Missing pieces are skipped.

    Side effect: missing titles in the module-level ``topics_df`` are replaced
    by empty strings so that breadcrumb generation never sees NaN.

    Args:
        topics: Frame with at least the columns ``id``, ``title``,
            ``description``, ``language``, ``parent`` and ``has_content``.

    Returns:
        A frame with the columns ``id``, ``language`` and ``feature`` holding
        only the rows where ``has_content`` is true.
    """
    # Work on a copy of the filtered rows so the column assignments below never
    # write into a view of the caller's frame.
    topics = topics[topics["has_content"]].copy()

    # Description of each topic's parent (NaN when there is no such parent).
    topics["parent_description"] = topics["parent"].map(topics_df["description"])

    # Space-joined descriptions of each topic's direct children, in the order
    # the children appear in ``topics_df``; children without description are
    # ignored.
    children_description = (
        topics_df.dropna(subset=["description"])
        .groupby("parent")["description"]
        .agg(" ".join)
    )
    topics["children_description"] = topics["id"].map(children_description)

    # Assign the filled column back instead of calling ``fillna(inplace=True)``
    # on the selected column: the chained in-place form does not update
    # ``topics_df`` when pandas Copy-on-Write is active.
    topics_df["title"] = topics_df["title"].fillna("")
    topics["breadcrumbs"] = topics["id"].apply(lambda x: Topic(x).get_breadcrumbs())

    topics["feature"] = topics[
        [
            "title",
            "description",
            "breadcrumbs",
            "children_description",
            "parent_description",
        ]
    ].apply(lambda x: " ".join(x.dropna().tolist()), axis=1)

    return topics[["id", "language", "feature"]]


def truncate(row, truncate_columns=[("description", 100), ("text", 60)]):
    """Shorten long text fields of an English or Spanish row in place.

    For every ``(column, max_length)`` entry, a value with more than
    ``max_length`` whitespace-separated words is replaced by its first and last
    ``max_length // 2`` words joined with single spaces. Missing values and
    values that are short enough are left untouched, as are rows whose
    ``language`` is neither ``"en"`` nor ``"es"``.

    Args:
        row: Mapping-like record (e.g. a ``pandas.Series``) with a ``language``
            entry and the listed columns.
        truncate_columns: Sequence of ``(column name, max word count)`` entries.

    Returns:
        The same ``row`` object, possibly modified.
    """
    if row["language"] in ("en", "es"):
        for spec in truncate_columns:
            column_name, max_length = spec[0], spec[1]
            value = row[column_name]

            if pd.isna(value):
                continue

            words = value.split()
            if len(words) <= max_length:
                continue

            # Keep the head and the tail of the text, drop the middle.
            half = max_length // 2
            row[column_name] = " ".join(words[:half] + words[-half:])

    return row


def create_feature_for_content(content):
    """Build one text feature per content item.

    Every row is first passed through ``truncate``; the feature is then the
    space-joined concatenation of the non-missing ``title``, ``description``
    and ``text`` values, in that order.

    Args:
        content: Frame with at least the columns ``id``, ``language``,
            ``title``, ``description`` and ``text``.

    Returns:
        A new frame with the columns ``id``, ``language`` and ``feature``.
    """
    text_columns = ["title", "description", "text"]

    def join_present(values):
        # Skip missing pieces so they do not end up in the feature text.
        return " ".join(values.dropna().tolist())

    truncated = content.apply(truncate, axis=1)
    truncated["feature"] = truncated[text_columns].apply(join_present, axis=1)

    return truncated[["id", "language", "feature"]]

In [None]:
# Pre-processing for the submission: replace the placeholder `content_ids`
# column with the full topic metadata of each requested topic.
sample_submission = (
    sample_submission.drop(columns="content_ids")
    .merge(topics, how="left", left_on="topic_id", right_on="id")
    .drop(columns="topic_id")
)

In [None]:
# Build the text features consumed by the retriever.
# NOTE: `topics`, `sample_submission` and `content` are rebound here; from this
# point on they only hold the `id`, `language` and `feature` columns returned by
# the feature helpers (and, for the two topic frames, only `has_content` rows).
topics = create_feature_for_topics(topics)
sample_submission = create_feature_for_topics(sample_submission)

content = create_feature_for_content(content)

In [None]:
# Retriever implementation. `topics` is assumed to contain only rows with
# has_content == True (as stated by the original author).
def retrieve(topics, content, path_to_weights, n_neighbors=100):
    """Retrieve the nearest content items for every topic, language by language.

    Topics and content are embedded with the sentence-transformer loaded from
    ``path_to_weights``; for each language the nearest content items of every
    topic are looked up using the cosine metric.

    Args:
        topics: Frame with the columns ``id``, ``language`` and ``feature``.
        content: Frame with the columns ``id``, ``language`` and ``feature``.
        path_to_weights: Model name or path passed to ``SentenceTransformer``.
        n_neighbors: Maximum number of candidates per topic. It is capped at
            the number of content items available in the topic's language.

    Returns:
        A frame with the columns ``topic_id``, ``content_id`` and ``distance``
        (one row per candidate). Topics whose language has no content at all
        produce no rows.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = SentenceTransformer(path_to_weights, device=device)

    correlations_preds = []
    # Group by the plain column name: grouping by a one-element list makes
    # recent pandas yield 1-tuples as keys, which breaks the filter below.
    for language, topics_language in topics.groupby("language"):
        content_language = content[content["language"] == language].reset_index(
            drop=True
        )

        # A neighbour search cannot return more items than were indexed.
        k = min(n_neighbors, len(content_language))
        if k == 0:
            continue

        topics_language_embeddings = model.encode(
            topics_language["feature"].tolist(), device=device
        )
        content_language_embeddings = model.encode(
            content_language["feature"].tolist(), device=device
        )

        neigh = NearestNeighbors(n_neighbors=k, metric="cosine")
        neigh.fit(content_language_embeddings)

        distances, indices = neigh.kneighbors(topics_language_embeddings)

        # Row i of `indices` belongs to topic i, so every topic id is repeated k
        # times. Building the frame column by column keeps proper dtypes
        # (stacking ids, indices and distances into one array would turn
        # everything into objects).
        content_ids = content_language["id"].to_numpy()
        correlations_pred = pd.DataFrame(
            {
                "topic_id": np.repeat(topics_language["id"].to_numpy(), k),
                "content_id": content_ids[indices.ravel()],
                "distance": distances.ravel(),
            }
        )

        correlations_preds.append(correlations_pred)

    if not correlations_preds:
        return pd.DataFrame(columns=["topic_id", "content_id", "distance"])

    return pd.concat(correlations_preds, ignore_index=True)

In [None]:
# Function for computing the recall of retrieved candidates.
def calculate_recall(corr_true, corr_pred, print_recall):
    """Measure how many true topic-content pairs appear among the predictions.

    Only topics that occur in ``corr_pred`` are evaluated.

    Args:
        corr_true: Frame with ``topic_id`` and a space-separated ``content_ids``
            string per topic.
        corr_pred: Frame with one predicted ``topic_id``/``content_id`` pair per
            row.
        print_recall: When falsy, the pair-level recall is returned. When
            truthy, the pair-level recall and per-topic statistics are printed
            and nothing is returned.

    Returns:
        The pair-level recall when ``print_recall`` is falsy, otherwise ``None``.
    """
    predicted = corr_pred.copy()

    truth = corr_true.copy()
    truth = truth[truth["topic_id"].isin(predicted["topic_id"])]
    truth["content_ids"] = truth["content_ids"].str.split()

    # One row per true pair; `content_id` stays null when the pair was missed.
    matched = truth.explode("content_ids").merge(
        predicted,
        how="left",
        left_on=["topic_id", "content_ids"],
        right_on=["topic_id", "content_id"],
    )

    rowwise_recall = matched["content_id"].notnull().sum() / len(matched)

    if print_recall:
        print(f"（行単位で見たときの）再現率: {rowwise_recall:.2%}")

        per_topic = matched.groupby(["topic_id"]).agg(
            {"content_ids": "count", "content_id": "count"}
        )
        per_topic["recall"] = per_topic["content_id"] / per_topic["content_ids"]
        n_topics = len(per_topic)

        share_all = (per_topic["recall"] == 1).sum() / n_topics
        share_80 = (per_topic["recall"] >= 0.8).sum() / n_topics
        share_50 = (per_topic["recall"] >= 0.5).sum() / n_topics
        share_none = (per_topic["recall"] == 0).sum() / n_topics

        print(f"すべての正解データを含む: {share_all:.2%}")
        print(f"正解データを 80% 以上含む: {share_80:.2%}")
        print(f"正解データを 50% 以上含む: {share_50:.2%}")
        print(f"正解データを１つも含まない: {share_none:.2%}")
        return None

    return rowwise_recall

In [None]:
# Function for training the retriever.
def train_retriver(
    topics_train,
    topics_val,
    content,
    correlations,
    path_to_weights="all-MiniLM-L6-v2",
    output_path="重みファイルの保存先を入力",
):
    """Fine-tune the ``all-MiniLM-L6-v2`` bi-encoder on true topic-content pairs.

    Training uses ``MultipleNegativesRankingLoss`` on every known pair of the
    training topics. The evaluator is built from candidates retrieved for the
    validation topics, labelled with the ground truth and balanced by random
    under-sampling.

    (The misspelled function name is kept so existing callers keep working.)

    Args:
        topics_train: Frame with ``id`` and ``feature`` of the training topics.
        topics_val: Frame with ``id``, ``language`` and ``feature`` of the
            validation topics.
        content: Frame with ``id``, ``language`` and ``feature`` of the content.
        correlations: Frame with ``topic_id`` and a space-separated
            ``content_ids`` string per topic.
        path_to_weights: Model used by ``retrieve`` to mine the evaluation
            candidates. The original code read an undefined name here; the base
            checkpoint is used by default.
        output_path: Directory where the trained weights are saved. The default
            is the original placeholder and should be replaced by a real path.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

    # One row per true (topic_id, content_id) pair.
    true_pairs = correlations.copy()
    true_pairs["content_ids"] = true_pairs["content_ids"].str.split()
    true_pairs = true_pairs.explode("content_ids").rename(
        columns={"content_ids": "content_id"}
    )

    train_pairs = (
        topics_train.merge(
            true_pairs,
            how="left",
            left_on="id",
            right_on="topic_id",
        )
        .rename(columns={"feature": "topic_feature"})
        .drop(columns="topic_id")
        .merge(
            content[["id", "feature"]],
            how="left",
            left_on="content_id",
            right_on="id",
            suffixes=("", "_"),
        )
        .rename(columns={"feature": "content_feature"})
        .drop(columns="id_")
        # The left joins leave NaN texts for topics without a known pair (or
        # pairs whose content is missing); such rows cannot form an example.
        .dropna(subset=["topic_feature", "content_feature"])
    )

    train_examples = [
        InputExample(texts=[topic_text, content_text])
        for topic_text, content_text in zip(
            train_pairs["topic_feature"].tolist(),
            train_pairs["content_feature"].tolist(),
        )
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=256)
    train_loss = losses.MultipleNegativesRankingLoss(model=model)

    # Build the evaluation set: retrieved candidates labelled with the truth.
    correlations_preds = retrieve(topics_val, content, path_to_weights)

    correlations_preds = correlations_preds.merge(
        true_pairs.assign(is_match=1),
        how="left",
        on=["topic_id", "content_id"],
    )
    # Assign the result back: a chained in-place fillna on the selected column
    # is not guaranteed to modify the frame.
    correlations_preds["is_match"] = (
        correlations_preds["is_match"].fillna(0).astype(int)
    )

    correlations_preds = (
        correlations_preds.merge(
            topics_val[["id", "feature"]],
            how="left",
            left_on="topic_id",
            right_on="id",
        )
        .rename(columns={"feature": "topic_feature"})
        .drop(columns="id")
    )

    correlations_preds = (
        correlations_preds.merge(
            content[["id", "feature"]],
            how="left",
            left_on="content_id",
            right_on="id",
        )
        .rename(columns={"feature": "content_feature"})
        .drop(columns="id")
    )

    # Class balance before under-sampling (negatives, then positives).
    print(len(correlations_preds[correlations_preds["is_match"] == 0]))
    print(len(correlations_preds[correlations_preds["is_match"] == 1]))

    rus = RandomUnderSampler(random_state=42)
    correlations_preds, _ = rus.fit_resample(
        correlations_preds, correlations_preds["is_match"]
    )

    # Class balance after under-sampling.
    print(len(correlations_preds[correlations_preds["is_match"] == 0]))
    print(len(correlations_preds[correlations_preds["is_match"] == 1]))

    eval_topics = correlations_preds["topic_feature"].tolist()
    eval_content = correlations_preds["content_feature"].tolist()
    eval_labels = correlations_preds["is_match"].tolist()

    # The class is imported directly at the top of the notebook; there is no
    # `evaluation` module object in scope.
    evaluator = BinaryClassificationEvaluator(eval_topics, eval_content, eval_labels)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=5,
        evaluation_steps=10_000,
        output_path=output_path,
        use_amp=True,
    )

In [None]:
# # Run the retriever training
# train_retriver(
#     topics_pipeline_train, topics_pipeline_val, content, correlations
# )

In [None]:
# Run the retriever and check its accuracy.
# `calculate_recall` only evaluates topics that appear in the predictions and,
# because its last argument is True, prints the statistics instead of returning them.
correlations_preds = retrieve(sample_submission, content, PATH_TO_RETRIEVER_WEIGHTS)
calculate_recall(correlations, correlations_preds, True)

In [None]:
# F2 score の計算と閾値の最適化を行う関数
def _f2_score(row):
    true_content_ids = row["content_ids"].split()
    pred_content_ids = row["content_id"]

    true_content_ids = set(true_content_ids)
    pred_content_ids = set(pred_content_ids)

    tp = len(true_content_ids.intersection(pred_content_ids))
    fp = len(pred_content_ids - true_content_ids)
    fn = len(true_content_ids - pred_content_ids)

    f2 = 0

    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f2 = (5 * precision * recall) / ((4 * precision) + recall)

    return f2


def f2_score(x, corr_true, corr_pred, probability_column="probability"):
    """Return the negated mean F2 score of thresholded predictions.

    The sign is flipped so the function can be minimised directly.

    Every topic present in ``corr_pred`` contributes to the mean: a topic whose
    candidates are all removed by the threshold scores 0 instead of being left
    out (leaving it out would reward arbitrarily high thresholds).

    Args:
        x: ``None`` (or an empty sequence) to keep every prediction, otherwise a
            sequence whose first element is the threshold; only predictions with
            a probability strictly above it are kept.
        corr_true: Frame with ``topic_id`` and a space-separated ``content_ids``
            string per topic.
        corr_pred: Frame with one ``topic_id``/``content_id`` candidate per row
            and, when a threshold is given, the probability column.
        probability_column: Name of the column holding the probabilities.

    Returns:
        ``-mean(F2)`` over the topics of ``corr_pred`` (``0.0`` when
        ``corr_pred`` has no rows).
    """
    topic_ids = corr_pred["topic_id"].drop_duplicates()
    if topic_ids.empty:
        return 0.0

    kept = corr_pred
    # Test against None explicitly: a one-element array holding 0.0 is falsy and
    # would otherwise silently disable the threshold.
    if x is not None and len(x) > 0:
        kept = corr_pred[corr_pred[probability_column] > x[0]]

    predicted_lists = kept.groupby("topic_id")["content_id"].agg(list)

    scored = pd.DataFrame({"topic_id": topic_ids.to_numpy()})
    scored["content_id"] = scored["topic_id"].map(predicted_lists)
    # Topics without any surviving candidate get an empty prediction list.
    scored["content_id"] = scored["content_id"].apply(
        lambda ids: ids if isinstance(ids, list) else []
    )

    scored = scored.merge(corr_true, how="left", on="topic_id")
    scored["f2_score"] = scored.apply(_f2_score, axis=1)

    return -scored["f2_score"].mean()


def optimize_threshold(
    corr_true, corr_pred, x0=[0.5], probability_column="probability"
):
    """Search the probability threshold that maximises the mean F2 score.

    Args:
        corr_true: Frame with ``topic_id`` and a space-separated ``content_ids``
            string per topic.
        corr_pred: Frame with ``topic_id``, ``content_id`` and the probability
            column.
        x0: Initial guess for the threshold (one-element sequence).
        probability_column: Name of the column holding the probabilities.

    Returns:
        A tuple ``(f2, res)`` with the mean F2 score at the optimised threshold
        and the ``scipy.optimize`` result object (threshold in ``res.x[0]``).
    """
    res = minimize(
        f2_score,
        x0,
        args=(corr_true, corr_pred, probability_column),
        method="Nelder-Mead",
    )

    # Score the full prediction set at the optimised threshold so the reported
    # value is the same quantity the optimiser worked on.
    f2_score_ = -f2_score(res.x, corr_true, corr_pred, probability_column)

    return f2_score_, res

In [None]:
# Build the reranker input: attach the topic text and the content text to
# every retrieved (topic_id, content_id) candidate.
correlations_preds = (
    correlations_preds.merge(
        topics[["id", "feature"]],
        how="left",
        left_on="topic_id",
        right_on="id",
    )
    .rename(columns={"feature": "topic_feature"})
    .drop(columns="id")
)

correlations_preds = (
    correlations_preds.merge(
        content[["id", "feature"]],
        how="left",
        left_on="content_id",
        right_on="id",
    )
    .rename(columns={"feature": "content_feature"})
    .drop(columns="id")
)

In [None]:
# Drop the references to the large frames and trigger a garbage collection
# before the reranker cells run.
# NOTE: the commented-out `train_reranker(...)` call further down passes
# `correlations`, which is deleted here; reload it before enabling that cell.
del topics, topics_df, topics_pipeline_train, topics_pipeline_val, content, correlations
gc.collect()

In [None]:
# Function for training the reranker.
def train_reranker(
    correlations_train,
    correlations_val,
    correlations_true,
    output_path="重みファイルの保存先を入力",
):
    """Fine-tune the ``cross-encoder/stsb-roberta-base`` cross-encoder.

    Retrieved candidates are labelled 1 when the pair is part of the ground
    truth and 0 otherwise. All training candidates are used for training; the
    validation candidates are balanced by random under-sampling and used by the
    binary-classification evaluator.

    Args:
        correlations_train: Candidate frame with ``topic_id``, ``content_id``,
            ``topic_feature`` and ``content_feature`` used for training.
        correlations_val: Candidate frame with the same columns, used for
            evaluation.
        correlations_true: Frame with ``topic_id`` and a space-separated
            ``content_ids`` string per topic.
        output_path: Directory where the trained weights are saved. The default
            is the original placeholder and should be replaced by a real path.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = CrossEncoder("cross-encoder/stsb-roberta-base", device=device)

    # One row per true (topic_id, content_id) pair, flagged as a match.
    true_pairs = correlations_true.copy()
    true_pairs["content_ids"] = true_pairs["content_ids"].str.split()
    true_pairs = true_pairs.explode("content_ids").rename(
        columns={"content_ids": "content_id"}
    )
    true_pairs["is_match"] = 1

    def label_candidates(candidates):
        # Candidates missing from the ground truth become negatives. The filled
        # column is assigned back explicitly: a chained in-place fillna on the
        # selected column is not guaranteed to modify the frame, which would
        # leave NaN labels.
        labelled = candidates.merge(
            true_pairs,
            how="left",
            on=["topic_id", "content_id"],
        )
        labelled["is_match"] = labelled["is_match"].fillna(0)
        return labelled

    train_pairs = label_candidates(correlations_train)

    train_examples = [
        InputExample(texts=[topic_text, content_text], label=label)
        for topic_text, content_text, label in zip(
            train_pairs["topic_feature"].tolist(),
            train_pairs["content_feature"].tolist(),
            train_pairs["is_match"].tolist(),
        )
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)

    val_pairs = label_candidates(correlations_val)

    # Class balance before under-sampling (negatives, then positives).
    print(len(val_pairs[val_pairs["is_match"] == 0]))
    print(len(val_pairs[val_pairs["is_match"] == 1]))

    rus = RandomUnderSampler(random_state=42)
    val_pairs, _ = rus.fit_resample(val_pairs, val_pairs["is_match"])

    # Class balance after under-sampling.
    print(len(val_pairs[val_pairs["is_match"] == 0]))
    print(len(val_pairs[val_pairs["is_match"] == 1]))

    # The evaluator expects a list of [topic text, content text] pairs.
    sentence_pairs = [
        [topic_text, content_text]
        for topic_text, content_text in zip(
            val_pairs["topic_feature"].tolist(),
            val_pairs["content_feature"].tolist(),
        )
    ]
    labels = val_pairs["is_match"].tolist()

    evaluator = CEBinaryClassificationEvaluator(sentence_pairs, labels)

    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=1,
        evaluation_steps=10_000,
        output_path=output_path,
        use_amp=True,
    )

In [None]:
# # reranker の学習を実行
# train_reranker(correlations_preds_train, correlations_preds_val, correlations)

In [None]:
# Reranker implementation.
def rerank(correlations_preds, path_to_weights):
    """Score every retrieved candidate with the cross-encoder.

    Args:
        correlations_preds: Candidate frame with the text columns
            ``topic_feature`` and ``content_feature``.
        path_to_weights: Model name or path passed to ``CrossEncoder``.

    Returns:
        The same frame object, with the model output stored in the new column
        ``reranker_probability`` (the input frame is modified in place).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = CrossEncoder(path_to_weights, device=device)

    # One [topic text, content text] pair per candidate row.
    sentence_pairs = (
        correlations_preds[["topic_feature", "content_feature"]].to_numpy().tolist()
    )

    correlations_preds["reranker_probability"] = model.predict(sentence_pairs)

    return correlations_preds

In [None]:
# Score the candidates, keep those above the decision threshold and write the
# submission file.
correlations_preds = rerank(correlations_preds, PATH_TO_RERANKER_WEIGHTS)

# Blank out (rather than drop) the rejected candidates so that a topic whose
# candidates are all rejected still gets a row with an empty prediction.
rejected = correlations_preds["reranker_probability"] <= 0.28320312
correlations_preds.loc[rejected, "content_id"] = np.nan

correlations_preds = (
    correlations_preds[["topic_id", "content_id"]]
    .groupby(["topic_id"])
    .agg(list)
    .reset_index()
)
# Remove the blanked entries, then join the remaining ids with spaces.
correlations_preds["content_id"] = correlations_preds["content_id"].apply(
    lambda ids: [content_id for content_id in ids if pd.notna(content_id)]
)
correlations_preds["content_id"] = correlations_preds["content_id"].str.join(" ")

correlations_preds = correlations_preds.rename(columns={"content_id": "content_ids"})

correlations_preds.to_csv("submission.csv", index=False)