In [1]:
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
from PIL import Image
from transformers import AutoProcessor, CLIPModel
import torch
from tqdm import tqdm
import pickle
import os
import torch

In [2]:
from urllib.request import urlopen
from PIL import Image
import timm

In [5]:
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'convnext_xxlarge.clip_laion2b_soup_ft_in1k',
    pretrained=True,
    num_classes=0,  # remove classifier nn.Linear
)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor

# or equivalently (without needing to set num_classes=0)

output = model.forward_features(transforms(img).unsqueeze(0))
# output is unpooled, a (1, 3072, 8, 8) shaped tensor

output = model.forward_head(output, pre_logits=True)

Downloading model.safetensors:   0%|          | 0.00/3.39G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [3]:
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
from PIL import Image
from transformers import AutoProcessor, CLIPModel
from tqdm import tqdm
import pickle
import os
import torch

In [4]:
def get_path(path1,path2,path3):
    paths = []
    cls = []
    for gen_path in [path1,path2,path3]:
        for im_path in os.listdir(gen_path):
            if 'mask' not in im_path:
                paths += [gen_path+'/'+im_path]
                cls += [gen_path]
    return paths,cls

In [5]:
def get_val(paths,cls):
    train_paths,train_labels = [],[]
    test_paths,test_labels = [],[]
    for path,cls in zip(paths,cls):
        if 'klikun' in path and '31' in path and 'img' in path:
            test_paths += [path]
            test_labels += [cls]
        elif 'разметка_малый' in path and '16' in path and 'img' in path:
            test_paths += [path]
            test_labels += [cls]
        elif 'разметка_шипун' in path and '30' in path and 'img' in path:
            test_paths += [path]
            test_labels += [cls]
        else:
            train_paths += [path]
            train_labels += [cls]
    return train_paths,train_labels,test_paths,test_labels

In [6]:
def create_df(path,cls,embed):
    df = pd.DataFrame()
    df['cls'] = cls
    df['indexes'] = path
    for i in range(embed.shape[-1]):
        df[f'embed_{i}'] = embed.T[i]
    return df

In [7]:
class Label_encoder():
    def __init__(self):
        self.vc = {'razmetka/klikun/images':1,
                   'razmetkaразметка_малый/images':2,
                   'razmetkaразметка_шипун/images':0}
        
    def encode(self,x):
        return self.vc[x]
    
    def decode(self,x):
        return list(self.vc.keys())[x]

In [16]:
df = pd.read_json('clip_embed.json')

EMBEDDING_LENGTH = 768
mean_embeddings = {}
for label2find, out_label in [['klikun', 'whooper'], ['разметка_шипун', 'mute'], ['разметка_малый', 'bewick']]:
    labeled_df = df[df['indexes'].str.contains(label)]
    mean_embedding = np.array([labeled_df[f'embed_{i}'].mean() for i in range(EMBEDDING_LENGTH)])
    mean_embeddings[out_label] = mean_embedding
pickle.dump(mean_embeddings, open('mean_embedding.pkl', 'wb'))

In [19]:
def calculate_metric(y_true,y_pred):
    # klikun - 1;разметка_малый - 2; разметка_шипун - 0
    score = 0
    for y_t, y_p in zip(y_true,y_pred):
        if y_t == y_p == 0:
            score += 3
        if y_t == 0 and y_p == 1:
            score -= 1
        if y_t == 0 and y_p == 2:
            score -= 3
            
        if y_t == y_p == 1:
            score += 3
        if y_t == 1 and y_p == 0:
            score -= 1
        if y_t == 1 and y_p == 2:
            score -= 3
            
        if y_t == y_p == 2:
            score += 2
        if y_t == 2 and y_p == 0:
            score -= 3
        if y_t == 2 and y_p == 1:
            score -= 3
    return score / len(y_true)

In [8]:
clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336").to('cuda')
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [18]:
def clip_encode(indexes):
    embeds = {'klikun': [], 'malyi': [], 'shipun': []}
    for i, img_path in tqdm(list(enumerate(indexes))):
        img = Image.open(img_path)
        img = processor(images=img, return_tensors="pt")
        cls = 'klikun' if '/klikun/' in img_path else 'malyi' if '/malyi/' in img_path else 'shipun'
        embeds[cls].append([cls, clip.get_image_features(img.pixel_values.to('cuda')).cpu().detach().numpy()[0]])
    return embeds

In [26]:
paths,cls = get_path('yolo_preds_swan_no_classify/klikun',
                     'yolo_preds_swan_no_classify/malyi',
                     'yolo_preds_swan_no_classify/shipun')

In [16]:
train_paths, train_label, test_paths, test_labels = get_val(paths,cls)

In [27]:
embedings = clip_encode(train_paths)

100%|██████████| 8710/8710 [17:27<00:00,  8.32it/s]


In [23]:
pickle.dump(embedings, open('head_embeddings.pkl', 'wb'))

In [24]:
embedings = pickle.load(open('head_embeddings.pkl', 'rb'))

In [25]:
embedings['klikun'][:10]

[['klikun',
  array([-6.97019696e-02,  7.76941776e-01, -1.01611897e-01, -4.24751043e-01,
          7.39538670e-03, -3.11236262e-01,  6.02114201e-01, -3.68818700e-01,
         -1.30258113e-01, -7.84446359e-01,  4.68347728e-01, -3.64642918e-01,
         -4.92282450e-01, -6.01461232e-01, -2.00590014e-01,  3.42075050e-01,
         -6.63577318e-01,  6.48186982e-01,  1.12197649e+00,  6.68661743e-02,
          1.21379428e-01,  1.59148529e-01,  2.73930788e-01, -1.29697025e-02,
         -3.92772257e-01,  4.29710746e-01, -7.18206912e-02, -2.67299265e-01,
          1.33561581e-01,  6.23888493e-01, -5.96868694e-01, -6.77860081e-02,
         -3.10703069e-02, -4.93654817e-01,  6.42050743e-01,  6.69705570e-02,
         -7.87684396e-02,  1.13877714e-01, -4.80190545e-01, -2.94320077e-01,
          5.01420200e-01, -1.09363604e+00,  1.13799125e-01, -6.45981789e-01,
          1.76617205e-02, -8.24058652e-01,  1.13969445e-01, -8.96823108e-02,
          3.71058345e-01, -9.14050400e-01, -5.02020597e-01, -1.2

In [None]:
cls = list(map(lambda x:Label_encoder().encode(x),train_label))

In [27]:
df = create_df(train_paths,cls,np.stack(embedings))

  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'embed_{i}'] = embed.T[i]
  df[f'e

In [30]:
df.to_json('clip_embed.json')

In [33]:
!rm -r /kaggle/working/catboost_info
from catboost import CatBoostClassifier,Pool,cv
params = {'iterations':400,
         'loss_function':'MultiClass',
         'random_seed':42,
          'max_depth':3,
         'eval_metric':'TotalF1:average=Weighted',
         'task_type':'GPU'}
train_pool = Pool(data = df.drop(['cls','indexes'],axis=1),
                 label = df['cls'])
cv_data = cv(train_pool,
            params = params,
            fold_count = 5,
            shuffle = True,
            stratified =True,
            verbose = False,
            seed = 42)

rm: cannot remove '/kaggle/working/catboost_info': No such file or directory




Training on fold [0/5]
bestTest = 0.8949292011
bestIteration = 386
Training on fold [1/5]
bestTest = 0.9097478587
bestIteration = 386
Training on fold [2/5]
bestTest = 0.8888976918
bestIteration = 387
Training on fold [3/5]
bestTest = 0.8958708184
bestIteration = 398
Training on fold [4/5]
bestTest = 0.9100022456
bestIteration = 398


In [34]:
cv_data

Unnamed: 0,iterations,test-TotalF1:average=Weighted-mean,test-TotalF1:average=Weighted-std,train-TotalF1:average=Weighted-mean,train-TotalF1:average=Weighted-std,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std
0,0,0.726517,0.011172,0.728295,0.005816,1.074158,0.000221,1.073920,0.000273
1,1,0.743079,0.016581,0.747459,0.009683,1.051153,0.000577,1.050648,0.000542
2,2,0.749698,0.018472,0.754735,0.011947,1.029547,0.000564,1.028710,0.000908
3,3,0.750127,0.015687,0.754939,0.004275,1.009162,0.000921,1.008092,0.000943
4,4,0.751999,0.015411,0.759032,0.007911,0.990124,0.001027,0.988809,0.001031
...,...,...,...,...,...,...,...,...,...
395,395,0.898660,0.009580,0.919990,0.002102,0.284885,0.014429,0.250219,0.004116
396,396,0.899102,0.009220,0.919906,0.002348,0.284665,0.014435,0.249925,0.004114
397,397,0.898868,0.009158,0.919993,0.001937,0.284528,0.014432,0.249687,0.004141
398,398,0.899539,0.009404,0.920188,0.001998,0.284284,0.014422,0.249422,0.004134


In [37]:
print(f'best_score {cv_data["test-TotalF1:average=Weighted-mean"].max()}')

best_score 0.8995394350902497


In [None]:
from sklearn.model_selection import train