# parameters

In [None]:
!ls /kaggle/input/d/kurupical/shopee-exp

In [None]:
import numpy as np
import pandas as pd
import os
import shutil
import glob
import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from cuml.neighbors import NearestNeighbors
import cudf, cuml, cupy
import gc
from sklearn.preprocessing import normalize
import albumentations
from albumentations.pytorch.transforms import ToTensorV2
import re
from pathlib import Path


In [None]:
# Thresholds and voting configuration for the ensemble.
#
# Key scheme (as consumed further down in this notebook):
#   "{task}_{author}_cossim"        -> `threshold` of get_neighbors_cossim
#   "{task}_{author}_euclidean"     -> `threshold` of get_neighbors_euclidean
#   "{author}_{agg}_{task}_cossim"  -> `threshold_{agg}` of get_neighbors_cossim_agg
#   "vote"                          -> minimum vote count in combine_predictions_major
# where task is all/img/text, author is kiccho/kurupical and agg is avg/max/min.
#
# NOTE(review): every cosine threshold carries a `+0.2` offset and every
# euclidean threshold a `*0.7` factor; the rationale is not recorded here.
PARAMETERS = {
    # Per-model cosine-similarity thresholds (kiccho models).
    "all_kiccho_cossim": 0.35+0.2,
    "img_kiccho_cossim": 0.35+0.2,
    "text_kiccho_cossim": 0.55+0.2,

    # Per-model cosine-similarity thresholds (kurupical models).
    "all_kurupical_cossim": 0.55+0.2,
    "img_kurupical_cossim": 0.6+0.2,
    "text_kurupical_cossim": 0.65+0.2,

    # Per-model KNN distance thresholds (kiccho models).
    "all_kiccho_euclidean": 0.85*0.7,
    "img_kiccho_euclidean": 1.05*0.7,
    "text_kiccho_euclidean": 1.1*0.7,

    # Per-model KNN distance thresholds (kurupical models).
    "all_kurupical_euclidean": 1.1*0.7,
    "img_kurupical_euclidean": 0.8*0.7,
    "text_kurupical_euclidean": 0.75*0.7,
    
    # Thresholds for the avg/max/min similarity of the two kiccho models.
    "kiccho_avg_all_cossim": 0.35+0.2,
    "kiccho_max_all_cossim": 0.4+0.2,
    "kiccho_min_all_cossim": 0.3+0.2,
    
    "kiccho_avg_img_cossim": 0.35+0.2,
    "kiccho_max_img_cossim": 0.4+0.2,
    "kiccho_min_img_cossim": 0.3+0.2,
    
    "kiccho_avg_text_cossim": 0.55+0.2,
    "kiccho_max_text_cossim": 0.55+0.2,
    "kiccho_min_text_cossim": 0.45+0.2,

    # Thresholds for the avg/max/min similarity of the two kurupical models.
    "kurupical_avg_all_cossim": 0.55+0.2,
    "kurupical_max_all_cossim": 0.6+0.2,
    "kurupical_min_all_cossim": 0.4+0.2,
    
    "kurupical_avg_img_cossim": 0.6+0.2,
    "kurupical_max_img_cossim": 0.65+0.2,
    "kurupical_min_img_cossim": 0.55+0.2,
    
    "kurupical_avg_text_cossim": 0.65+0.2,
    "kurupical_max_text_cossim": 0.7+0.2,
    "kurupical_min_text_cossim": 0.6+0.2,
    
    # A candidate must appear in at least this many prediction columns to be kept.
    "vote": 16
}

In [None]:
# Checkpoint file names, resolved against `kurupical_model_dir` when loaded.
# The aggregation step below indexes entries [0] and [1], so exactly two
# models are expected in each list.
kurupical_models = [
    'all_swin_large_roberta_exp133.pth',
    'swin_large_exp103.pth'
]
kurupical_model_dir = Path("/kaggle/input/d/kurupical/shopee-exp")

# Experiment identifiers passed as-is to get_kiccho_embeddings.
kiccho_models = [
    "exp470",
    "exp471"
]

In [None]:
!pip install /kaggle/input/timm-048/timm

In [None]:
# True: load the (duplicated) training folds and print get_cv scores.
# False: load test.csv and format `matches` as space-joined strings for submission.
debug = False

In [None]:
import sys
# Make the attached Kaggle dataset directories importable; the project-local
# imports in this cell and the following cells are resolved through these paths.
sys.path.append("/kaggle/input/shopee-exp")
sys.path.append("/kaggle/input/shopee-exp/exp")
sys.path.append("/kaggle/input/d/kurupical/shopee-exp/exp")
sys.path.append("/kaggle/input/kaggle-shopee/src")
from kaggle_shopee.bin.inference_ensemble_kiccho import get_kiccho_embeddings
from kaggle_shopee.utils.time_util import TimeUtil

In [None]:
from get_model import get_kurupical_embeddings

In [None]:
from exp080 import get_cv

In [None]:
# Choose the input source first: labelled training folds in debug mode,
# the competition test set otherwise.
if debug:
    image_dir = '../input/shopee-product-matching/train_images'
    csv_path = "/kaggle/input/d/kurupical/shopee-exp/train_fold.csv"
else:
    image_dir = '../input/shopee-product-matching/test_images'
    csv_path = "../input/shopee-product-matching/test.csv"

df = pd.read_csv(csv_path)
# Absolute-ish path of every image, used by the embedding extractors.
df["filepath"] = df["image"].apply(lambda name: os.path.join(image_dir, name))

if debug:
    # The debug frame is stacked on itself, doubling the row count.
    # NOTE(review): presumably to approximate the size of the hidden test set - confirm.
    df = pd.concat([df, df]).reset_index(drop=True)

In [None]:
# TODO: remove later.
# Outside debug mode a constant `label_group` column is added.
# NOTE(review): presumably because test.csv has no labels and downstream
# helpers expect the column - confirm before removing.
if not debug:
    df["label_group"] = 0

In [None]:
def get_neighbors_euclidean(df, embeddings, threshold, pred_name="pred", min_n=2):
    """Add a column of KNN-distance based matches to ``df``.

    A ``NearestNeighbors`` index is fitted on ``embeddings`` and queried with
    the same embeddings. For every row, neighbours whose distance is strictly
    below ``threshold`` are kept; if fewer than ``min_n`` qualify, the
    ``min_n`` closest neighbours are used instead.

    Args:
        df: DataFrame with a ``posting_id`` column. Rows are assumed to be in
            the same order as ``embeddings`` (TODO confirm against callers).
        embeddings: 2-D array-like, one row per row of ``df``.
        threshold: Upper bound (exclusive) on the neighbour distance.
        pred_name: Name of the column that receives the predictions.
        min_n: Minimum number of neighbours to return per row (capped by the
            number of neighbours actually retrieved).

    Returns:
        ``df`` itself, with ``df[pred_name]`` holding one array of
        ``posting_id`` values per row.
    """
    print(f"[EUCLIDEAN]{pred_name}: threshold={threshold}")
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to index or query; keep the output column present.
        df[pred_name] = []
        return df

    # Tiny frames keep the original 2-neighbour setting; otherwise ask for 75,
    # but never for more neighbours than there are indexed rows (a frame with
    # 4..74 rows previously requested 75 and could not be queried).
    n_neighbors = 2 if n_rows <= 3 else 75
    n_neighbors = min(n_neighbors, n_rows)

    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    posting_id = df["posting_id"].values
    preds = []
    for k in range(n_rows):
        row_distances = distances[k, ]
        keep = np.where(row_distances < threshold)[0]
        if len(keep) < min_n:
            # Too few neighbours under the threshold: fall back to the closest ones.
            keep = np.argsort(row_distances)[:min_n]
        preds.append(posting_id[indices[k, keep]])

    df[pred_name] = preds
    return df

In [None]:
def get_neighbors_cossim(df, embeddings, threshold, pred_name="pred", min_n=2):
    """Add a column of cosine-similarity based matches to ``df``.

    The embeddings are L2-normalised, moved to the GPU and compared against
    each other chunk by chunk. For every row, all rows with similarity
    strictly above ``threshold`` are kept; if fewer than ``min_n`` qualify,
    the ``min_n`` most similar rows are used instead.

    Args:
        df: DataFrame with a ``posting_id`` column, one row per embedding.
        embeddings: 2-D array-like of embeddings.
        threshold: Lower bound (exclusive) on the similarity.
        pred_name: Name of the column that receives the predictions.
        min_n: Minimum number of matches per row.

    Returns:
        ``df`` itself, with ``df[pred_name]`` set.
    """
    print(f"[COSSIM] {pred_name}: threshold={threshold}")
    posting_id = df["posting_id"].values
    n_rows = len(df)

    chunk_size = 1024 * 2
    # Ceiling division: a partial trailing chunk still needs its own pass.
    n_chunks = n_rows // chunk_size + (1 if n_rows % chunk_size != 0 else 0)

    preds = []
    with torch.no_grad():
        embeddings = normalize(embeddings)
        print("norm avg:", np.linalg.norm(embeddings, axis=1).mean())
        embeddings = torch.tensor(embeddings).cuda()

        for chunk_no in tqdm.tqdm(range(n_chunks)):
            start = chunk_no * chunk_size
            stop = min(start + chunk_size, n_rows)

            # Rows: items of this chunk; columns: every item in the dataset.
            sims = torch.matmul(embeddings, embeddings[start:stop].T).T
            for offset in range(stop - start):
                hits = torch.where(sims[offset] > threshold)[0]
                if len(hits) < min_n:
                    # argsort is ascending, so the tail holds the most similar rows.
                    hits = torch.argsort(sims[offset])[-min_n:]
                preds.append(posting_id[hits.detach().cpu().numpy()])

            # Free the chunk's similarity matrix before the next one is allocated.
            del sims
            gc.collect()
            torch.cuda.empty_cache()

    df[pred_name] = preds
    return df

In [None]:
def get_neighbors_cossim_agg(df, embeddings1, embeddings2, threshold_avg, threshold_max, threshold_min, pred_name="pred", min_n=2):
    """Add matches based on the avg/max/min cosine similarity of two models.

    Both embedding sets are L2-normalised and compared chunk by chunk on the
    GPU. The two similarity matrices are combined element-wise by mean, max
    and min; each combination is thresholded separately and written to
    ``{pred_name}_avg``, ``{pred_name}_max`` and ``{pred_name}_min``.

    Args:
        df: DataFrame with a ``posting_id`` column, one row per embedding.
        embeddings1: 2-D array-like of embeddings from the first model.
        embeddings2: 2-D array-like of embeddings from the second model.
        threshold_avg: Lower bound (exclusive) for the averaged similarity.
        threshold_max: Lower bound (exclusive) for the maximum similarity.
        threshold_min: Lower bound (exclusive) for the minimum similarity.
        pred_name: Prefix of the three output columns.
        min_n: Minimum number of matches per row and per aggregation.

    Returns:
        ``df`` itself, with the three prediction columns set.
    """
    print(f"[COSSIM] {pred_name}: threshold_avg={threshold_avg} threshold_max={threshold_max} threshold_min={threshold_min}")
    posting_id = df["posting_id"].values
    n_rows = len(df)

    chunk_size = 1024 * 2
    # Ceiling division: a partial trailing chunk still needs its own pass.
    n_chunks = n_rows // chunk_size + (1 if n_rows % chunk_size != 0 else 0)

    def select_ids(similarities, threshold):
        # posting_ids above the threshold, or the min_n most similar as a fallback.
        hits = torch.where(similarities > threshold)[0]
        if len(hits) < min_n:
            hits = torch.argsort(similarities)[-min_n:]
        return posting_id[hits.detach().cpu().numpy()]

    preds_avg, preds_max, preds_min = [], [], []
    with torch.no_grad():
        embeddings1 = normalize(embeddings1)
        embeddings2 = normalize(embeddings2)

        embeddings1 = torch.tensor(embeddings1).cuda()
        embeddings2 = torch.tensor(embeddings2).cuda()

        for chunk_no in tqdm.tqdm(range(n_chunks)):
            start = chunk_no * chunk_size
            stop = min(start + chunk_size, n_rows)

            # Rows: items of this chunk; columns: every item in the dataset.
            sims1 = torch.matmul(embeddings1, embeddings1[start:stop].T).T
            sims2 = torch.matmul(embeddings2, embeddings2[start:stop].T).T

            # Each aggregation builds its own temporary stack so only one
            # stacked copy exists at a time.
            sims_avg = torch.stack([sims1, sims2]).mean(dim=0)
            sims_max = torch.stack([sims1, sims2]).max(dim=0)[0]
            sims_min = torch.stack([sims1, sims2]).min(dim=0)[0]

            for offset in range(stop - start):
                preds_avg.append(select_ids(sims_avg[offset], threshold_avg))
                preds_max.append(select_ids(sims_max[offset], threshold_max))
                preds_min.append(select_ids(sims_min[offset], threshold_min))

            # Free the chunk's matrices before the next chunk is allocated.
            del sims1, sims2, sims_avg, sims_min, sims_max
            gc.collect()
            torch.cuda.empty_cache()

    df[f"{pred_name}_avg"] = preds_avg
    df[f"{pred_name}_max"] = preds_max
    df[f"{pred_name}_min"] = preds_min
    return df

In [None]:
# One config dict (model name + thresholds) is appended per processed model.
model_dicts = []
# Maps "{model_name}_{task}" (task: all/img/text) to that model's embeddings.
embeddings = {}

In [None]:
# Extract embeddings for every kurupical checkpoint and derive per-model
# cosine and KNN-distance predictions for each task (all / img / text).
for model_name in kurupical_models:
    with TimeUtil.timer(model_name):
        print("=================================================")

        model_path = str(kurupical_model_dir / model_name)

        # Thresholds for this model family, keyed "th_{metric}_{task}".
        model_dict = {"model_name": model_name}
        model_dict.update({
            f"th_{metric}_{task}": PARAMETERS[f"{task}_kurupical_{metric}"]
            for metric in ("cossim", "euclidean")
            for task in ("all", "img", "text")
        })

        # Unpacked as (img, text, all).
        # NOTE(review): this order differs from the kiccho helper's (all, img, text);
        # confirm against get_kurupical_embeddings.
        (
            embeddings[f"{model_name}_img"],
            embeddings[f"{model_name}_text"],
            embeddings[f"{model_name}_all"],
        ) = get_kurupical_embeddings(model_path=model_path, df=df)
        torch.cuda.empty_cache()

        for task in ("all", "img", "text"):
            key = f"{model_name}_{task}"
            # Stored normalised so the later aggregation step reuses the same vectors.
            embeddings[key] = normalize(embeddings[key])

            df = get_neighbors_cossim(
                df,
                embeddings=embeddings[key],
                threshold=model_dict[f"th_cossim_{task}"],
                pred_name=f"{model_name}_cossim_{task}",
            )
            df = get_neighbors_euclidean(
                df,
                embeddings=embeddings[key],
                threshold=model_dict[f"th_euclidean_{task}"],
                pred_name=f"{model_name}_euclidean_{task}",
            )

        model_dicts.append(model_dict)

In [None]:
# Run the garbage collector and release PyTorch's cached GPU memory
# before the aggregation step.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Combine the two kurupical models per task via avg/max/min similarity.
for task in ("all", "img", "text"):
    with TimeUtil.timer(f"kurupical {task}"):
        agg_thresholds = {
            f"threshold_{agg}": PARAMETERS[f"kurupical_{agg}_{task}_cossim"]
            for agg in ("avg", "max", "min")
        }
        df = get_neighbors_cossim_agg(
            df=df,
            embeddings1=embeddings[f"{kurupical_models[0]}_{task}"],
            embeddings2=embeddings[f"{kurupical_models[1]}_{task}"],
            pred_name=f"kurupical_{task}",
            **agg_thresholds,
        )

In [None]:
# Drop the kurupical embeddings so they can be garbage-collected, then start
# with an empty dict for the kiccho models.
del embeddings
gc.collect()
embeddings = {}

In [None]:
# Extract embeddings for every kiccho experiment and derive per-model
# cosine and KNN-distance predictions for each task (all / img / text).
for exp in kiccho_models:
    with TimeUtil.timer(f"{exp}"):
        model_name = exp

        # Thresholds for this model family, keyed "th_{metric}_{task}".
        model_dict = {"model_name": model_name}
        model_dict.update({
            f"th_{metric}_{task}": PARAMETERS[f"{task}_kiccho_{metric}"]
            for metric in ("cossim", "euclidean")
            for task in ("all", "img", "text")
        })

        # Unpacked as (all, img, text).
        # NOTE(review): this order differs from the kurupical helper's (img, text, all);
        # confirm against get_kiccho_embeddings.
        (
            embeddings[f"{model_name}_all"],
            embeddings[f"{model_name}_img"],
            embeddings[f"{model_name}_text"],
        ) = get_kiccho_embeddings(exp, df, num_workers=4, image_dir=image_dir)

        for task in ("all", "img", "text"):
            key = f"{model_name}_{task}"
            # Stored normalised so the later aggregation step reuses the same vectors.
            embeddings[key] = normalize(embeddings[key])

            df = get_neighbors_cossim(
                df,
                embeddings=embeddings[key],
                threshold=model_dict[f"th_cossim_{task}"],
                pred_name=f"{model_name}_cossim_{task}",
            )
            df = get_neighbors_euclidean(
                df,
                embeddings=embeddings[key],
                threshold=model_dict[f"th_euclidean_{task}"],
                pred_name=f"{model_name}_euclidean_{task}",
            )

        model_dicts.append(model_dict)

In [None]:
# Run the garbage collector and release PyTorch's cached GPU memory
# before the aggregation step.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Combine the two kiccho models per task via avg/max/min similarity.
for task in ("all", "img", "text"):
    with TimeUtil.timer(f"kiccho {task}"):
        agg_thresholds = {
            f"threshold_{agg}": PARAMETERS[f"kiccho_{agg}_{task}_cossim"]
            for agg in ("avg", "max", "min")
        }
        df = get_neighbors_cossim_agg(
            df=df,
            embeddings1=embeddings[f"{kiccho_models[0]}_{task}"],
            embeddings2=embeddings[f"{kiccho_models[1]}_{task}"],
            pred_name=f"kiccho_{task}",
            **agg_thresholds,
        )

In [None]:
# All predictions now live in `df`; drop the embeddings so they can be
# garbage-collected and leave an empty dict behind.
del embeddings
gc.collect()
embeddings = {}

In [None]:
# Names of the per-model prediction columns, grouped by metric, then task,
# then model (same order as the columns were historically listed).
model_names = [
    f"{entry['model_name']}_{metric}_{task_name}"
    for metric in ("cossim", "euclidean")
    for task_name in ("img", "text", "all")
    for entry in model_dicts
]

In [None]:
# Add the avg/max/min aggregation columns produced by get_neighbors_cossim_agg.
for prefix in ("kurupical", "kiccho"):
    for task in ("img", "text", "all"):
        model_names.extend(
            f"{prefix}_{task}_{method}" for method in ("avg", "max", "min")
        )

In [None]:
# Notebook display: list every prediction column that takes part in the vote.
model_names

In [None]:
def combine_predictions(row):
    """Return the sorted, de-duplicated union of ``matches`` and ``title_aggregate``.

    Args:
        row: Mapping-like row exposing the ``matches`` and ``title_aggregate``
            sequences of ids.

    Returns:
        A NumPy array with the unique ids from both sequences.
    """
    merged = np.concatenate((row["matches"], row["title_aggregate"]))
    return np.unique(merged)

In [None]:
# Postings whose lower-cased titles are identical are grouped together.
df["title_lower"] = [title.lower() for title in df["title"].values]
ids_by_title = df.groupby("title_lower")["posting_id"].agg("unique").to_dict()
df["title_aggregate"] = df["title_lower"].map(ids_by_title)

In [None]:
def combine_predictions_major(row):
    """Keep the ids that enough prediction columns agree on.

    All prediction arrays listed in the module-level ``model_names`` are
    pooled for this row; an id is returned when it occurs at least
    ``PARAMETERS["vote"]`` times in the pool.

    Args:
        row: DataFrame row containing every column named in ``model_names``.

    Returns:
        A NumPy array of the ids that reached the vote threshold.
    """
    pooled = np.concatenate(row[model_names].values.reshape(-1))
    ids, votes = np.unique(pooled, return_counts=True)
    return ids[votes >= PARAMETERS["vote"]]

In [None]:
# Order matters: the majority vote creates `matches`, then combine_predictions
# reads that column and unions it with the exact-title group.
df['matches'] = df.apply(combine_predictions_major, axis=1)
df['matches'] = df.apply(combine_predictions, axis=1)

In [None]:
def get_unit_from_title(title, unit_name):
    """Extract the numeric quantity attached to ``unit_name`` from a title.

    The title is lower-cased and round/square brackets are removed before a
    run of digits, dots and hyphens followed by optional whitespace and
    ``unit_name`` is searched for.

    Args:
        title: Product title.
        unit_name: Unit to look for, e.g. ``"ml"`` or ``"gr"``. It is matched
            literally against the lower-cased title, so pass it in lower case.

    Returns:
        The numeric part as a string (e.g. ``"50"`` for ``"50 ml"``) when the
        title contains exactly one such quantity, otherwise ``None`` (no match
        or an ambiguous title with several matches).
    """
    cleaned = title.lower()
    for bracket in "()[]":
        cleaned = cleaned.replace(bracket, "")

    # Raw string avoids invalid-escape warnings; re.escape keeps the unit a
    # literal; the capture group returns only the number, so no whitespace
    # between number and unit can leak into the result.
    quantities = re.findall(r"([0-9.-]+)\s*" + re.escape(unit_name), cleaned)
    if len(quantities) != 1:
        return None
    return quantities[0]

def vote_for_unit(x, unit_dict):
    """Vote helper used only by ``drop_different_unit``.

    * Ids whose title carries no unit are always kept in the group.
    * If the vote leaves exactly one id, no post-processing is applied and
      the original prediction is returned.

    Args:
        x: Pair ``(predicted ids, ids sharing this row's unit value)``. The
            second element is a NumPy array when the row has a unit value and
            something else (e.g. NaN) when it has none.
        unit_dict: Maps every id to its unit group (a NumPy array) or to a
            float NaN when the id has no unit.

    Returns:
        The original prediction ``x[0]``, or a list made of the ids present in
        both inputs followed by the ids that have no unit.
    """
    original_pred = x[0]
    if type(x[1]) != np.ndarray:
        # This row has no unit value: leave the prediction untouched.
        return original_pred

    ids, counts = np.unique(np.concatenate(x), return_counts=True)
    # Ids without a unit are stored as float NaN in unit_dict.
    without_unit = [pid for pid in ids if type(unit_dict[pid]) is float]

    # An id counted twice appears in both the prediction and the unit group.
    in_both = counts == 2
    if in_both.sum() == 1:
        return original_pred
    return ids[in_both].tolist() + without_unit

def drop_different_unit(df, pred_name, unit_name, pred_name_out=None):
    """Split predicted groups whose titles state different quantities of a unit.

    Items predicted to belong together but whose titles carry a different
    number for ``unit_name`` are treated as separate groups.
    Example: ``unit_name="ml"`` -> "50ml" and "100ML" are different groups.

    Args:
        df: DataFrame with ``title``, ``posting_id`` and ``pred_name`` columns.
        pred_name: Name of the prediction column to check.
        unit_name: Unit name (e.g. ml, gram, ...).
        pred_name_out: Name of the post-processed column; defaults to
            ``pred_name`` when None.

    Returns:
        ``df`` itself, with ``unit_{unit_name}``, ``pred_unit_{unit_name}`` and
        the output column added or overwritten.
    """
    out_col = pred_name if pred_name_out is None else pred_name_out
    unit_col = f"unit_{unit_name}"
    unit_pred_name = f"pred_unit_{unit_name}"

    # Quantity parsed from each title (None when absent or ambiguous).
    df[unit_col] = [get_unit_from_title(title, unit_name) for title in df["title"].values]
    # All posting_ids that share the same quantity.
    tmp = df.groupby(unit_col)["posting_id"].agg('unique').to_dict()
    df[unit_pred_name] = df[unit_col].map(tmp)
    print(f"{df[unit_pred_name].notnull().sum()} titles have unit_name '{unit_name}'. n_unique: {len(tmp)}")

    # posting_id -> its unit group (NaN for ids without a unit).
    unit_dict = df.set_index("posting_id")[unit_pred_name].to_dict()

    pairs = df[[pred_name, unit_pred_name]].values
    df[out_col] = [vote_for_unit(pair, unit_dict) for pair in pairs]
    return df

In [None]:
# Debug only: score the ensemble before the unit-based post-processing.
if debug:
    df["pred"] = [np.unique(matched) for matched in df["matches"].values]
    print(get_cv(df))

In [None]:
# Apply the unit post-processing in place on `matches`, first for "gr", then "ml".
for unit_name in ("gr", "ml"):
    df = drop_different_unit(df, pred_name="matches", unit_name=unit_name)

In [None]:
if not debug:
    # Submission format: unique ids joined by single spaces.
    df["matches"] = [' '.join(np.unique(ids)) for ids in df["matches"].values]
else:
    # Debug only: score again after the unit-based post-processing.
    df["pred"] = [np.unique(matched) for matched in df["matches"].values]
    print(get_cv(df))

In [None]:
# Write the two submission columns; this runs in debug mode as well, where
# `matches` has not been joined into strings.
df[["posting_id", "matches"]].to_csv('submission.csv', index=False)