# 2-stageのコード構成
## stage 1 (このノートブック)
* 各モデルのembeddingから、閾値を振ってpredictionを作成
  * 例: exp280_all_th0.4, exp280_img_th0.4, exp280_text_th0.4, exp280_all_th0.45, ...
* dataframeを出力する
* モデルができるごとにノートブックを作成する

## stage 2
* stage 1で出力したモデル・閾値ごとのpredictionを入力とし、votingをたくさん振る

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, StandardScaler
import glob
import tqdm
import torch
import os
import re

In [None]:
# Read every CSV under ../input/shopeevaliddf/ and stack them into a single frame
# with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(x) for x in glob.glob("../input/shopeevaliddf/*.csv")]).reset_index(drop=True)

In [None]:
# Replace df's own `title` column with the one from train.csv: drop it from df and
# inner-join the (posting_id, title) pairs back on posting_id.
df = pd.merge(pd.read_csv("../input/shopee-product-matching/train.csv")[["posting_id", "title"]],
              df.drop("title", axis=1),
              how="inner",
              on="posting_id")

In [None]:
# Display the number of rows left after the inner join.
len(df)

In [None]:
def str_to_list(s: str):
    """Turn a stringified list such as "['a' 'b']" or "['a', 'b']" into its tokens.

    Brackets, single quotes, commas and newlines are deleted, then the
    remainder is split on whitespace.
    """
    # One deletion table instead of five chained replace() calls.
    stripped = s.translate(str.maketrans("", "", "[]\n',"))
    return stripped.split()
# Convert the stringified list columns into Python lists of tokens.
df["y_pred"] = df["y_pred"].map(str_to_list)
df["target"] = df["target"].map(str_to_list)

In [None]:
def get_cv(df, similarity_matrix, threshold, indices=None, pred_name="pred", min_n=2, mode="min"):
    """Threshold a pairwise score matrix into per-row predictions and score them.

    For every row ``k`` of ``df`` the entries of ``similarity_matrix[k]`` that
    pass ``threshold`` are selected; when fewer than ``min_n`` pass, the
    ``min_n`` best-ranked entries are used instead. The selected positions are
    translated to ``posting_id`` values and stored in ``df[pred_name]``
    (``df`` is modified in place), then scored with ``calc_cv``.

    Args:
        df: frame with a ``posting_id`` column, plus whatever ``calc_cv`` needs.
        similarity_matrix: 2-D array indexed ``[row, candidate]``, row-aligned with ``df``.
        threshold: cut-off applied to each row of the matrix.
        indices: unused; kept so existing callers keep working.
        pred_name: name of the prediction column to write.
        min_n: minimum number of candidates kept per row.
        mode: ``"min"`` keeps values below the threshold (distances),
            ``"max"`` keeps values above it (similarities).

    Returns:
        The ``(f1score, precision, recall)`` triple unpacked from ``calc_cv``.

    Raises:
        ValueError: if ``mode`` is neither ``"min"`` nor ``"max"``.
    """
    # Fail fast: previously an unknown mode left IDX unbound on the first row,
    # or silently reused the previous row's IDX.
    if mode not in ("min", "max"):
        raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")

    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        row = similarity_matrix[k]
        if mode == "min":  # euclidean distance etc.: smaller is closer
            IDX = np.where(row < threshold)[0]
            if len(IDX) < min_n:
                IDX = np.argsort(row)[:min_n]
        else:  # cosine similarity: larger is closer
            IDX = np.where(row > threshold)[0]
            if len(IDX) < min_n:
                IDX = np.argsort(row)[-min_n:]
        preds.append(posting_id[IDX])

    df[pred_name] = preds
    # NOTE(review): this expects calc_cv to return three values; the calc_cv
    # defined in this notebook returns only the mean F1 -- confirm which
    # version is intended before calling get_cv.
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [None]:
def calc_cv(df, col_name):
    """Score the predictions in ``df[col_name]`` with the mean per-row F1.

    Side effects on ``df``: ``target`` is overwritten with the array of
    posting_ids sharing the row's ``label_group``, and the per-row scores are
    stored in ``f1_<col_name>``.

    Returns:
        The mean of the ``f1_<col_name>`` column.
    """
    members_by_group = df.groupby("label_group")["posting_id"].agg("unique").to_dict()
    df["target"] = df["label_group"].map(members_by_group)

    score_col = f"f1_{col_name}"
    row_scorer = get_f1(col_name)
    df[score_col] = df.apply(row_scorer, axis=1)
    return df[score_col].mean()

def get_f1(col):
    """Build a row-wise F1 scorer comparing ``row[col]`` against ``row.target``.

    The returned function computes ``2 * |target ∩ pred| / (|target| + |pred|)``
    for a single row.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        overlap = np.intersect1d(truth, predicted)
        return 2 * len(overlap) / (len(truth) + len(predicted))

    return f1score

# kurupical_models

In [None]:
def string_escape(s, encoding="utf-8"):
    """Interpret backslash escapes written literally in ``s`` and decode the result.

    Example: the 12-character text ``caf\\xc3\\xa9`` (with real backslashes)
    becomes ``café`` for the default UTF-8 encoding.
    """
    return (
        s.encode("latin1")  # To bytes, required by 'unicode-escape'
        .decode("unicode-escape")  # Turn the literal escape sequences into characters
        .encode("latin1")  # 1:1 mapping back to bytes
        .decode(encoding)
    )  # Decode original encoding


def get_unit_from_title(title, unit_name):
    """Extract up to three quantities written in front of ``unit_name`` in a title.

    The title is unescaped, lower-cased and stripped of round/square brackets.
    Every run of digits, '.', ',' and '/' directly followed (optionally after
    whitespace) by ``unit_name`` is collected; ',' is read as a decimal point
    and '/' separates several quantities ("50/100ml" -> 50.0 and 100.0).

    Args:
        title: raw product title.
        unit_name: lower-case unit text to look for (e.g. "ml", "gr").
            It is inserted into the regular expression as-is.

    Returns:
        A 3-tuple of the first three quantities found, padded with ``None``.
        Parsable quantities are floats rounded to 8 decimals; pieces that
        cannot be parsed as a number are returned as the cleaned-up string.
    """

    def postprocess(match):
        cleaned = match.replace(" ", "").replace(",", ".").replace(unit_name, "")
        values = []
        for piece in cleaned.split("/"):
            try:
                # round() needs an integer ndigits. The previous float argument
                # always raised TypeError, which the broad except swallowed, so
                # every quantity stayed a string ("50" != "50.0").
                values.append(round(float(piece), 8))
            except ValueError:
                # Not a number (e.g. "1.2.3" or an empty piece): keep the text.
                values.append(piece)
        return values

    title = string_escape(title)
    title = title.lower().replace("(", "").replace(")", "").replace("[", "").replace("]", "")

    found = []
    for match in re.findall(r"[0-9.,/]+\s*" + unit_name, title):
        found.extend(postprocess(match))

    # Keep the first three values and pad with None up to length three.
    found = found[:3]
    return tuple(found + [None] * (3 - len(found)))

def vote_for_unit(x):
    """Keep the ids that occur at least twice across the sequences in ``x``.

    Helper for ``drop_different_unit``. ``x`` is a pair
    ``(predicted ids, ids sharing the same unit value)``. When the second
    element is a NumPy array, all sequences in ``x`` are concatenated and the
    values seen two or more times are returned, in sorted order. When it is
    not an array (e.g. NaN because the row has no value for that unit slot),
    an empty list is returned.

    Args:
        x: sequence whose second element is either an ndarray of ids or a
            missing-value marker.

    Returns:
        list of ids counted at least twice, or ``[]``.
    """
    if not isinstance(x[1], np.ndarray):
        return []

    merged, counts = np.unique(np.concatenate(x), return_counts=True)
    return merged[counts >= 2].tolist()

def vote_for_unit_none(x, unit_dict):
    """Return the ids in ``x`` that have no unit value.

    Helper for ``drop_different_unit``. An id counts as "no unit" when
    ``unit_dict[id]`` is ``None`` or NaN (detected via ``value != value``).

    Args:
        x: iterable of posting ids; every id must be a key of ``unit_dict``.
        unit_dict: mapping posting id -> extracted unit value.

    Returns:
        list of the ids without a unit value, in their original order.
    """
    without_unit = []
    for candidate in x:
        unit = unit_dict[candidate]
        if unit is None or unit != unit:
            without_unit.append(candidate)
    return without_unit

def drop_different_unit(df, pred_name, unit_name, pred_name_out=None):
    """
    Among the postings predicted as one group, treat those whose number for the
    unit `unit_name` differs as a different group.
    Example: unit_name = "ml" -> 50ml and 100ML are different groups

    The frame is modified in place and also returned. Besides the output column,
    the helper columns `{unit_name}_0..2`, `pred_0_0..pred_2_2`, `none_col` and
    `pred_final` are written (the `pred_*`, `none_col` and `pred_final` names do
    not contain the unit name, so each call overwrites them).

    params
    @df: frame with `posting_id`, `title` and the `pred_name` column
    @pred_name: name of the prediction column to check
    @unit_name: unit name (e.g. ml, gram, ...)
    @pred_name_out: name of the post-processed column (same as pred_name when None)
    """
    if pred_name_out is None:
        pred_name_out = pred_name
    
    units = [get_unit_from_title(title, unit_name) for title in df["title"].values]
    # Extract the unit numbers (up to 3 per title) and, for each slot, build
    # a mapping unit value -> array of posting_ids that carry it in that slot.
    unit_dicts = {}
    for i in range(3):
        col_name = f"{unit_name}_{i}"
        df[col_name] = [x[i] for x in units]
        unit_dicts[i] = df.groupby(col_name)["posting_id"].agg("unique").to_dict()
    # For every row, look up the postings that share one of its unit numbers.
    # Own slots [0, 1, 2] x other slots [0, 1, 2] are all combined -> 9 columns.
    pred_cols = []
    for target_idx in range(3):
        for search_idx in range(3):
            pred_col = f"pred_{target_idx}_{search_idx}"
            df[pred_col] = df[f"{unit_name}_{target_idx}"].map(unit_dicts[search_idx])
            pred_cols.append(pred_col)
    

    col_name = f"{unit_name}_0"
    print(f"{df[col_name].notnull().sum()} titles have unit_name '{unit_name}'. n_unique: {len(unit_dicts[0])}")
    
    # Reduce each of the 9 columns to the ids that are both in the original
    # prediction and in the same-unit lookup (empty list when the lookup missed).
    for pred_col in pred_cols:
        df[pred_col] = [vote_for_unit(x) for x in df[[pred_name, pred_col]].values]
    

    # Predicted ids whose first unit slot is empty are collected separately
    # in `none_col`, so they are kept whatever the row's own unit is.
    unit_col_name = f"{unit_name}_0"
    posting_unit_dict = df[["posting_id", unit_col_name]].set_index("posting_id")[unit_col_name].to_dict()
    df["none_col"] = [vote_for_unit_none(x, posting_unit_dict) for x in df[pred_name].values]
#     print(df[df["posting_id"] == "train_2963630570"]["pred_0_0"])
#     print(df[df["posting_id"] == "train_2963630570"]["none_col"])
    # Union of the 9 filtered columns and the no-unit ids.
    df["pred_final"] = [np.unique(np.concatenate(x)) for x in df[pred_cols + ["none_col"]].values]
    
    def f(unit, pred_final, pred_original):
        # Rows without a unit value of their own keep the original prediction.
        
        if unit is None or unit != unit:
            return pred_original
        # Only accept the filtered prediction when it holds more than one id;
        # otherwise fall back to the original prediction.
        if len(pred_final) > 1:
            return pred_final
        else:
            return pred_original
        
    df[pred_name_out] = [f(x[0], x[1], x[2]) for x in df[[f"{unit_name}_0", "pred_final", pred_name]].values]
    return df

In [None]:
# Baseline: mean per-row F1 of the unprocessed `y_pred` column.
# calc_cv also writes `f1_y_pred` and overwrites `target` from `label_group`.
f1 = calc_cv(df, "y_pred")

In [None]:
# Display the baseline F1.
f1

In [None]:
# Baseline score rounded to 6 decimals; the cells below compare against it.
best_score = round(f1, 6)

In [None]:
# Display the rounded baseline score.
best_score

## single

In [None]:
# Evaluate each unit filter on its own: every iteration starts again from `y_pred`
# and writes the filtered prediction to `pred`, which the next iteration overwrites.
for unit_name in ['gr', 'gm', 'kg', 'kilo', 'mg', 'litre', 'ml', 'pcs', 'inch', 'yard', 'cm', 'mm', 'metre', 'micro', 'gb', 'mb', 'tb', 'kb', 'thn', 'capsule', 'kapsul']:
# for unit_name in ['pcs']:
    print(unit_name)
    df = drop_different_unit(df, 
                             pred_name="y_pred", 
                             unit_name=unit_name, 
                             pred_name_out="pred")
    f1score = calc_cv(df, "pred")

    # Per-row change in F1 (after - before); non-zero rows were affected by the filter.
    df["diff"] = df["f1_pred"] - df["f1_y_pred"]
    f1score = np.round(f1score, 6)
    diff = np.round(f1score - best_score, 6)
    n_effected_data = len(df[df["diff"] != 0])

    print(f"unit_name={unit_name} / fscore {best_score} -> {f1score} diff={diff} n_effected_data={n_effected_data}")

In [None]:
# Apply the unit filters cumulatively: `pred_last` starts as a copy of `y_pred` and
# each unit filter is applied on top of the previous one's output.
df["pred_last"] = df["y_pred"].values
past_score = best_score
for unit_name in ["gr", "ml", "cm"]:

    print(unit_name)
    df = drop_different_unit(df, 
                             pred_name="pred_last", 
                             unit_name=unit_name) 
    f1score = calc_cv(df, "pred_last")

    # Per-row change in F1 as (after - before), the same sign convention as the
    # other evaluation cells, so the histogram / sort cells read the same way.
    df["diff"] = df["f1_pred_last"] - df["f1_y_pred"]
    f1score = np.round(f1score, 6)
    diff = np.round(f1score - past_score, 6)
    n_effected_data = len(df[df["diff"] != 0])

    print(f"unit_name={unit_name} / fscore {past_score} -> {f1score} diff={diff} n_effected_data={n_effected_data}")
    # The next unit is compared against the score reached so far.
    past_score = f1score

In [None]:
# for unit_name in ['gr', 'gm', 'kg', 'kilo', 'mg', 'litre', 'ml', 'pc', 'inch', 'yard', 'cm', 'mm', 'metre', 'micro', 'gb', 'mb', 'tb', 'kb', 'thn', 'capsule', 'kapsul']:
# Re-run the single-unit evaluation for 'cm' only, so that `pred`, `f1_pred` and
# `diff` hold the 'cm' result for the inspection cells below.
for unit_name in ['cm']:
    print(unit_name)
    df = drop_different_unit(df, 
                             pred_name="y_pred", 
                             unit_name=unit_name, 
                             pred_name_out="pred")
    f1score = calc_cv(df, "pred")

    # Per-row change in F1 (after - before).
    df["diff"] = df["f1_pred"] - df["f1_y_pred"]
    f1score = np.round(f1score, 6)
    diff = np.round(f1score - best_score, 6)
    n_effected_data = len(df[df["diff"] != 0])

    print(f"unit_name={unit_name} / fscore {best_score} -> {f1score} diff={diff} n_effected_data={n_effected_data}")

In [None]:
# Histogram of the non-zero per-row F1 changes (from whichever cell last wrote `diff`).
df[df["diff"] != 0]["diff"].hist(bins=100, range=(-1, 1), figsize=(20, 8))

In [None]:
# for unit_name in ['gr', 'gm', 'kg', 'kilo', 'mg', 'litre', 'ml', 'pc', 'inch', 'yard', 'cm', 'mm', 'metre', 'micro', 'gb', 'mb', 'tb', 'kb', 'thn', 'capsule', 'kapsul']:
# Same single-unit evaluation for 'mm' only, followed by a histogram of the
# affected rows.
for unit_name in ['mm']:
    print(unit_name)
    df = drop_different_unit(df, 
                             pred_name="y_pred", 
                             unit_name=unit_name, 
                             pred_name_out="pred")
    f1score = calc_cv(df, "pred")

    # Per-row change in F1 (after - before).
    df["diff"] = df["f1_pred"] - df["f1_y_pred"]
    f1score = np.round(f1score, 6)
    diff = np.round(f1score - best_score, 6)
    n_effected_data = len(df[df["diff"] != 0])

    print(f"unit_name={unit_name} / fscore {best_score} -> {f1score} diff={diff} n_effected_data={n_effected_data}")
    
    df[df["diff"] != 0]["diff"].hist(bins=100, range=(-1, 1), figsize=(20, 8))

In [None]:
# Summary statistics of the non-zero per-row F1 changes.
df[df["diff"] != 0]["diff"].describe()

In [None]:
# posting_id -> title lookup, used below to print readable examples.
title_dict = df[["posting_id", "title"]].set_index("posting_id")["title"].to_dict()

In [None]:
# Affected rows ordered by `diff` ascending, i.e. the most negative changes first.
w_df = df[df["diff"] != 0].sort_values("diff", ascending=True)

In [None]:
# Print the first 10 rows of w_df: the titles predicted before and after the unit
# post-processing, the ground-truth titles, and the per-row F1 before/after.
# NOTE: raises IndexError if w_df has fewer than 10 rows.
for i in range(10):
    series = w_df.iloc[i]
    posting_title = title_dict[series["posting_id"]]
    no_postprocess = [title_dict[x] for x in series["y_pred"]]
    postprocess = [title_dict[x] for x in series["pred"]]
    target = [title_dict[x] for x in series["target"]]
    f1_before = series["f1_y_pred"]
    f1_after = series["f1_pred"]
    posting_id = series["posting_id"]
    print(f"========{posting_id}: {posting_title}========")
    print("-----------------------")
    print("[no_postprocess]: \n{}".format("\n".join(no_postprocess)))
    print("-----------------------")
    print("[postprocess]: \n{}".format("\n".join(postprocess)))
    print("-----------------------")
    print("[target]: \n{}".format("\n".join(target)))
    print("-----------------------")    
    print(f"\n<< f1 >>: {f1_before} -> {f1_after}\n")

In [None]:
# Rebuild w_df from the current `diff` column (same statement as the earlier cell).
w_df = df[df["diff"] != 0].sort_values("diff", ascending=True)

In [None]:
# Same report as the earlier example cell, for the rebuilt w_df: titles before and
# after post-processing, ground-truth titles, and per-row F1 before/after.
# NOTE: raises IndexError if w_df has fewer than 10 rows.
for i in range(10):
    series = w_df.iloc[i]
    posting_title = title_dict[series["posting_id"]]
    no_postprocess = [title_dict[x] for x in series["y_pred"]]
    postprocess = [title_dict[x] for x in series["pred"]]
    target = [title_dict[x] for x in series["target"]]
    f1_before = series["f1_y_pred"]
    f1_after = series["f1_pred"]
    posting_id = series["posting_id"]
    print(f"========{posting_id}: {posting_title}========")
    print("-----------------------")
    print("[no_postprocess]: \n{}".format("\n".join(no_postprocess)))
    print("-----------------------")
    print("[postprocess]: \n{}".format("\n".join(postprocess)))
    print("-----------------------")
    print("[target]: \n{}".format("\n".join(target)))
    print("-----------------------")    
    print(f"\n<< f1 >>: {f1_before} -> {f1_after}\n")

In [None]:
# Affected rows, restricted to the columns needed for the exp095 comparison.
# NOTE(review): `exp095_swin_all_th0.55` / `f1_exp095_swin_all_th0.55` are not created
# by any visible cell -- presumably they come from a different input frame; confirm.
w_df = df[df["diff"] != 0][["posting_id", "exp095_swin_all_th0.55", "pred", "target", "f1_exp095_swin_all_th0.55", "f1_pred", "diff"]].sort_values("diff", ascending=True)

In [None]:
# Same report as the earlier example cells, but with `exp095_swin_all_th0.55` as the
# "before" prediction and without printing the posting_id.
# NOTE: raises IndexError if w_df has fewer than 10 rows.
for i in range(10):
    series = w_df.iloc[i]
    posting_title = title_dict[series["posting_id"]]
    no_postprocess = [title_dict[x] for x in series["exp095_swin_all_th0.55"]]
    postprocess = [title_dict[x] for x in series["pred"]]
    target = [title_dict[x] for x in series["target"]]
    f1_before = series["f1_exp095_swin_all_th0.55"]
    f1_after = series["f1_pred"]
    print(f"========{posting_title}========")
    print("-----------------------")
    print("[no_postprocess]: \n{}".format("\n".join(no_postprocess)))
    print("-----------------------")
    print("[postprocess]: \n{}".format("\n".join(postprocess)))
    print("-----------------------")
    print("[target]: \n{}".format("\n".join(target)))
    print("-----------------------")    
    print(f"\n<< f1 >>: {f1_before} -> {f1_after}\n")

In [None]:
# Scratch check of a "number followed by gram" pattern.
# NOTE(review): `title` is not assigned at top level in any visible cell, so this
# cell depends on a variable defined elsewhere -- confirm or remove.
re.findall('[0-9.-]+' + '\s*'+ "gram", title)

In [None]:
# Group postings by their extracted "gram" values: `unit_gram` holds the 3-tuple
# returned by get_unit_from_title, and `pred_unit_gram` the posting_ids that share
# exactly the same tuple.
df["unit_gram"] = [get_unit_from_title(title, "gram") for title in df["title"].values]
tmp = df.groupby('unit_gram')["posting_id"].agg('unique').to_dict()
df['pred_unit_gram'] = df["unit_gram"].map(tmp)

In [None]:
# Sweep the exp095 thresholds, combining each prediction with the gram-based groups.
# NOTE(review): `vote` is not defined in any visible cell, and the calc_cv defined in
# this notebook returns a single mean F1, so the 3-way unpacking below would fail
# with it -- this cell appears to target an earlier version of those helpers.
for th in [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]:
    df["pred"] = [vote(x, n=2) for x in df[[f"exp095_swin_all_th{th}", f"pred_unit_gram"]].values]
    f1score, precision, recall = calc_cv(df, "pred")
    print(f"model=exp095_swin_all_th{th} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
