In [1]:
import re
import yaml
from pathlib import Path

import polars as pl
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Output directories of the runs that take part in the ensemble.
# Entries that are commented out are excluded from the blend.
dirs = [
    "../output/exp044a/run3/",  # light_cnn_v2
    "../output/exp046a/single/",  # light_cnn_v2
    # "../output/exp047a/run1/",  # transformer
    "../output/exp049a/run0/",  # light_cnn_v2
    # "../output/exp052a/run0/",  # light_cnn_v2
    # "../output/exp058a/run0/",
    "../output/exp059a/run3/",  # light_cnn_v2 with se_block
    "../output/exp067a/run0/",  # light_cnn_v2 with se_block
    "../output/exp068a/run0/",  # light_cnn_v2 with se_block
    "../output/exp068a/run1/",  # light_cnn_v2 with se_block
    "../output/exp076a/run0/",  # light_cnn_v2 with se_block
    "../output/exp092a/run0/",
    "../output/exp092a/run1/",
    "../output/exp095a/run0/",
    "../output/exp097a/run0/",
    "../output/exp103a/run0/",
    "../output/exp110a/run0/",
    "../output/exp121a/run0/",
    "../output/exp135a/run0/",
    "../output/exp135a/run1/",
]

# One tag per directory: the first "expNNN[abc]" token found in its path.
# Several runs of the same experiment therefore share a tag.
tag_pattern = re.compile(r"exp[\d]{3}[abc]")
tags = [tag_pattern.findall(dirname)[0] for dirname in dirs]
tags

['exp044a',
 'exp046a',
 'exp049a',
 'exp059a',
 'exp067a',
 'exp068a',
 'exp068a',
 'exp076a',
 'exp092a',
 'exp092a',
 'exp095a',
 'exp097a',
 'exp103a',
 'exp110a',
 'exp121a',
 'exp135a',
 'exp135a']

In [3]:
# Collect the contents of every run's each_r2_score.yaml into one frame,
# one row per run, in the same order as `dirs`.
each_scores = []
for dirname, tag in zip(dirs, tags):
    # Context manager so the file handle is closed as soon as it is parsed.
    with open(Path(dirname, "each_r2_score.yaml")) as score_file:
        each_score = yaml.safe_load(score_file)
    # Label the record with its experiment tag, added after the loaded scores.
    each_score["tag"] = tag
    each_scores.append(each_score)
each_scores_df = pl.from_dicts(each_scores)

In [None]:
# Number of best-scoring runs blended per column. The same value is embedded
# in the output filename, so the selection below must actually use it.
topk = 7

# Carry each run's directory next to its scores (rows follow the order of
# `dirs`). Tags are not unique: several runs share an experiment tag, so
# resolving a run through `tags.index(tag)` would always pick the first run
# of that experiment and silently load the wrong submission file.
scores_with_dir = each_scores_df.with_columns(pl.Series("run_dir", dirs))

pred_dfs = []
# The trailing column is the "tag" label, not a score column, hence [:-1].
for col in tqdm(each_scores_df.columns[:-1]):
    topk_df = (
        scores_with_dir
        .filter(pl.col(col) > 0)
        .sort(col, descending=True)
        .head(topk)
        .select(["run_dir", col])
    )
    if len(topk_df):
        run_dirs = topk_df["run_dir"].to_list()
    else:
        # No run has a positive score for this column: use the last run only.
        run_dirs = [dirs[-1]]
    dfs = []
    for run_dir in run_dirs:
        submission_files = list(Path(run_dir).glob("submission*.csv"))
        # Exactly one submission file is expected per run directory.
        assert len(submission_files) == 1, run_dir
        df = pl.read_csv(submission_files[0], columns=["sample_id", col])
        dfs.append(df)
    # Average the selected runs' predictions per sample, ignoring nulls.
    df = (
        pl.concat(dfs)
        .group_by("sample_id")
        .agg(pl.col(col).drop_nulls().mean())
    )
    # NOTE(review): rows end up ordered by sorted sample_id and the id column
    # is dropped, so any consumer of pred_df must rely on that same row
    # order -- confirm it matches the frame this is later joined onto.
    pred_dfs.append(df.sort("sample_id").drop("sample_id"))
pred_df = pl.concat(pred_dfs, how="horizontal")

In [None]:
# Assemble the submission frame: ids from the sample submission, predictions
# from pred_df, columns arranged in the sample submission's column order.
sample_submission_path = "../data/sample_submission.csv"
col_order = pl.read_csv(sample_submission_path, n_rows=1).columns
sample_ids = pl.read_csv(sample_submission_path, columns=["sample_id"])
# NOTE(review): a horizontal concat pairs rows purely by position, so this
# assumes pred_df rows are in the same order as the ids read here -- confirm.
sub_df = pl.concat([sample_ids, pred_df], how="horizontal").select(col_order)

In [None]:
# Replace the blended ptend_q0002_27 column with a value computed directly
# from the test input column state_q0002_27: the negated state divided by 1200.
# NOTE(review): presumably 1200 is a time step in seconds -- confirm.
# Rows are paired by position, so test.csv is assumed to share sub_df's order.
input_df = pl.read_csv("../data/test.csv", columns=["state_q0002_27"])
ptend_q0002_27 = (-input_df["state_q0002_27"] / 1200).alias("ptend_q0002_27")
sub_df = sub_df.with_columns(ptend_q0002_27)

In [None]:
# Name the file after the blend size and the tag of every contributing run.
filename = f"top{topk}_" + "_".join(tags) + ".csv"
# Create the target directory first so the write cannot fail on a fresh
# checkout after the expensive ensemble cells have already run.
ensemble_dir = Path("../output/ensemble")
ensemble_dir.mkdir(parents=True, exist_ok=True)
sub_df.write_csv(ensemble_dir / filename)

In [None]:
# Show the generated file name as the cell output.
filename