In [None]:
from collections import Counter
from os import cpu_count
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from mpl_toolkits.axes_grid1 import ImageGrid
from pytorch_lightning import seed_everything
from sklearn.cluster import DBSCAN
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.models.efficientnet import EfficientNet, tf_efficientnet_b8
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T
from tqdm.auto import tqdm
from umap import UMAP
import cv2

from utils import pca_explained_variance_ratio
from metric import getMetric

In [None]:
class MyModel(nn.Module):
    """Feature extractor wrapping a pretrained ``tf_efficientnet_b8`` from timm.

    ``forward`` runs only the backbone's ``forward_features`` followed by its
    ``global_pool``; the backbone's classifier is never called, so the output
    is the pooled feature tensor rather than class logits.
    """

    def __init__(self):
        super().__init__()
        # pretrained=True asks timm for the pretrained weights.
        self.model: EfficientNet = tf_efficientnet_b8(pretrained=True)

    def forward(self, x):
        """Return the pooled backbone features for the batch ``x``.

        The whole pass runs with autograd disabled, so the result carries no
        gradient graph.
        """
        with torch.no_grad():
            feature_map = self.model.forward_features(x)
            pooled = self.model.global_pool(feature_map)
        return pooled


class MyDataset(Dataset):
    """Image dataset backed by a DataFrame with a ``path_img`` column.

    Each item is the image at ``df.iloc[idx]['path_img']`` after ``transform``
    has been applied; no label is returned.

    Args:
        df: table whose ``path_img`` column holds a path that PIL can open.
        transform: callable applied to the loaded RGB ``PIL.Image``.
    """

    def __init__(self, df: pd.DataFrame, transform: T.Compose):
        self.df = df
        self.transform = transform

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # The context manager closes the file handle deterministically.
        # convert('RGB') forces the pixel data to load before the file is
        # closed and guarantees three channels, so greyscale / RGBA / palette
        # files do not break a 3-channel Normalize or batch collation.
        with Image.open(row['path_img']) as img:
            img = img.convert('RGB')
        return self.transform(img)

    def __len__(self):
        return len(self.df)


def get_loader(image_size: int = 672, crop_pct: float = 0.954, batch_size: int = 16):
    """Build the non-shuffling DataLoader over the module-level ``df``.

    Images are resized to ``image_size``, centre-cropped to
    ``int(image_size * crop_pct)``, converted to tensors and normalised with
    the ImageNet default mean/std from timm. The defaults reproduce the
    original hard-coded values (672, 0.954, batch size 16).

    Args:
        image_size: size passed to ``T.Resize``.
        crop_pct: fraction of ``image_size`` kept by the centre crop.
        batch_size: number of images per batch.

    Returns:
        A ``DataLoader`` that yields image batches in ``df`` row order.
    """
    preprocess = T.Compose([
        T.Resize(image_size),
        T.CenterCrop(int(image_size * crop_pct)),
        T.ToTensor(),
        T.Normalize(
            mean=IMAGENET_DEFAULT_MEAN,
            std=IMAGENET_DEFAULT_STD,
        ),
    ])

    # os.cpu_count() may return None when the count cannot be determined;
    # treat that as a single CPU and never pass a negative worker count.
    n_cpus = cpu_count() or 1
    n_workers = max(n_cpus - 1, 0)

    return DataLoader(
        dataset=MyDataset(df, transform=preprocess),
        batch_size=batch_size,
        num_workers=n_workers,
        pin_memory=True,
        # Order must stay fixed so extracted features line up with df rows.
        shuffle=False,
    )


def extract_feature(model: nn.Module, loader: DataLoader) -> np.ndarray:
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Args:
        model: module whose output for a batch is a tensor.
        loader: iterable yielding input batches (tensors).

    Returns:
        The per-batch outputs, moved to the CPU and concatenated along the
        first axis into one NumPy array.

    Raises:
        ValueError: from ``np.concatenate`` if ``loader`` yields no batches.
    """
    # Send inputs to wherever the model's weights live instead of assuming
    # CUDA; a parameter-free module falls back to the previous CUDA behaviour.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    outputs = []
    # Disable autograd here so .numpy() works even if the model's own forward
    # does not switch gradients off.
    with torch.no_grad():
        for x in tqdm(loader):
            # Call the module (not .forward) so registered hooks still run.
            outputs.append(model(x.to(device)).cpu().numpy())
    return np.concatenate(outputs)

In [None]:
# Seed the random number generators through pytorch_lightning's
# seed_everything; `seed` is kept as a notebook-level variable for reuse.
seed = 1
seed_everything(seed)

In [None]:
# Load the training metadata table. Later cells read its `label_group`,
# `posting_id` and `path_img` columns.
df = pd.read_csv('dataset/train_mod.csv')
# Optional de-duplication on `md5_img` (presumably an image hash), currently disabled.
# df = df.drop_duplicates(subset=['md5_img'])
df.shape

In [None]:
# Ground truth: for every row, the array of posting_ids that share its
# label_group (the row's own posting_id included).
tmp = (
    df.groupby('label_group')['posting_id']
    .unique()
    .to_dict()
)
df['target'] = df['label_group'].map(tmp)

In [None]:
# Preview the first rows of the table, including the `target` column.
df.head()

In [None]:
# Instantiate the feature extractor, switch it to eval mode and move it to the
# GPU, then build the image loader over `df`.
model = MyModel().eval().cuda()
loader = get_loader()

In [None]:
# Feature extraction is expensive, so the result is cached on disk. The cache
# is used when present; otherwise the features are computed and saved, which
# keeps the notebook runnable from a fresh checkout.
# NOTE: joblib.load unpickles the file, so only load caches you created yourself.
FEATS_CACHE = Path('tmp/tf_efficientnet_b8_feats.joblib')
if FEATS_CACHE.exists():
    feats = joblib.load(FEATS_CACHE)
else:
    feats = extract_feature(model, loader)
    FEATS_CACHE.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(feats, FEATS_CACHE)
feats.shape

In [None]:
# pca_explained_variance_ratio comes from the local `utils` module; presumably
# it reports the PCA explained-variance profile of the features — TODO confirm.
explained_var = pca_explained_variance_ratio(feats)

In [None]:
# 2-component UMAP embedding, used only for plotting. It is loaded from the
# on-disk cache when available and computed (then cached) otherwise, so the
# notebook also runs from a fresh checkout.
# NOTE: joblib.load unpickles the file, so only load caches you created yourself.
REDUCED_2_CACHE = Path('tmp/tf_efficientnet_b8_reduced_2.joblib')
if REDUCED_2_CACHE.exists():
    reduced_2 = joblib.load(REDUCED_2_CACHE)
else:
    reduced_2 = UMAP(n_components=2, random_state=seed, transform_seed=seed).fit_transform(feats)
    REDUCED_2_CACHE.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(reduced_2, REDUCED_2_CACHE)

plt.scatter(reduced_2[:, 0], reduced_2[:, 1])
plt.tight_layout()
plt.show()

In [None]:
# 400-component UMAP embedding that feeds the clustering step. It is loaded
# from the on-disk cache when available and computed (then cached) otherwise,
# so the notebook also runs from a fresh checkout.
# NOTE: joblib.load unpickles the file, so only load caches you created yourself.
REDUCED_400_CACHE = Path('tmp/tf_efficientnet_b8_reduced_400.joblib')
if REDUCED_400_CACHE.exists():
    reduced_400 = joblib.load(REDUCED_400_CACHE)
else:
    reduced_400 = UMAP(n_components=400, random_state=seed, transform_seed=seed).fit_transform(feats)
    REDUCED_400_CACHE.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(reduced_400, REDUCED_400_CACHE)
reduced_400.shape

In [None]:
# Cluster the 400-component embedding with DBSCAN, then colour the
# 2-component embedding by cluster label (one scatter series per label).
labels = DBSCAN(eps=0.05, min_samples=2, n_jobs=-1).fit_predict(reduced_400)
uniq_labels = sorted(set(labels))

# print(Counter(labels))

fig, ax = plt.subplots()
for label in uniq_labels:
    mask = labels == label
    xs = reduced_2[mask, 0]
    ys = reduced_2[mask, 1]
    ax.scatter(xs, ys)
# Legend goes outside the axes so it does not cover the points.
ax.legend(uniq_labels, bbox_to_anchor=(1.05, 1))
fig.tight_layout()
plt.show()

In [None]:
# Number of distinct labels produced by DBSCAN (every value present in
# `labels` counts, including -1 if it occurs).
len(uniq_labels)

In [None]:
# Attach each row's cluster label. Assumes `labels` is in the same row order
# as `df` (the loader is built with shuffle=False) — TODO confirm for cached features.
df['labels'] = labels

In [None]:
# Prediction: for every row, the array of posting_ids that share its cluster
# label. At this point rows labelled -1 are pooled into one group as well.
tmp = (
    df.groupby('labels')['posting_id']
    .unique()
    .to_dict()
)
df['oof'] = df['labels'].map(tmp)

In [None]:
def outlier_group(row):
    """Return the predicted group for one row.

    A row whose ``labels`` value is -1 is matched only with itself (a
    one-element list holding its ``posting_id``); any other row keeps the
    group already stored in ``oof``.
    """
    is_noise = row.labels == -1
    return [row.posting_id] if is_noise else row.oof
# Apply row by row, overwriting `oof` with the per-row result of outlier_group.
df['oof'] = df.apply(outlier_group, axis=1)

In [None]:
# getMetric comes from the local `metric` module; presumably it builds a
# row-wise scorer comparing the `oof` column with the ground truth — TODO confirm.
row_scorer = getMetric('oof')
df['f1'] = df.apply(row_scorer, axis=1)

# The reported CV score is the mean of the per-row scores.
cv_score = df['f1'].mean()
print('CV score =', cv_score)