In [1]:
pip install lancedb

Collecting lancedb
  Downloading lancedb-0.13.0-cp38-abi3-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.17.0 (from lancedb)
  Downloading pylance-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Collecting retry>=0.9.2 (from lancedb)
  Downloading retry-0.9.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting py<2.0.0,>=1.4.26 (from retry>=0.9.2->lancedb)
  Downloading py-1.11.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading lancedb-0.13.0-cp38-abi3-manylinux_2_28_x86_64.whl (24.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pylance-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.2/29.2 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownl

In [2]:
import cv2
import tqdm
import torch
import lancedb
import numpy as np
import pandas as pd
import pyarrow as pa
import urllib.request
import torch.nn.functional as F

from sklearn.metrics import f1_score
from pydantic import BaseModel
from transformers import AutoImageProcessor, TimesformerForVideoClassification

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face checkpoint shared by the image processor and the video model.
model_name = "facebook/timesformer-base-finetuned-k600"

# The processor turns lists of frames into model-ready tensors.
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the pretrained weights, then move the model to the selected device.
model = TimesformerForVideoClassification.from_pretrained(model_name)
model = model.to(device)

preprocessor_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/487M [00:00<?, ?B/s]

In [4]:
# Load a video file and return a fixed number of preprocessed frames
def load_video(video_path, f = 0, frame_height=480, frame_width=480, num_frames=16):
    """Decode a video and return ``num_frames`` evenly spaced, preprocessed frames.

    Every decoded frame is resized to ``(frame_width, frame_height)``, converted
    to grayscale and back to a 3-channel image, and cropped to rows 110:350 and
    columns 50:430. The crop window is fixed, so it is only meaningful for
    frame sizes that contain it (the 480x480 default does).

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        f: When truthy, each returned frame is mirrored horizontally.
        frame_height: Height every frame is resized to before cropping.
        frame_width: Width every frame is resized to before cropping.
        num_frames: Number of frames to sample. Frames repeat when the video
            holds fewer than ``num_frames`` frames.

    Returns:
        A list of ``num_frames`` numpy arrays, in temporal order.

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        ret, frame = cap.read()
        while ret:
            frame = cv2.resize(frame, (frame_width, frame_height))
            # Gray -> RGB round trip: all three channels end up identical.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
            # Keep a contiguous copy of the crop only, so the full resized
            # frame can be freed instead of being kept alive by a view.
            frames.append(np.ascontiguousarray(frame[110:350, 50:430]))
            ret, frame = cap.read()
    finally:
        # Release the capture even if decoding/processing raised.
        cap.release()

    if not frames:
        raise ValueError(
            f"No frames could be read from {video_path!r}: "
            "the file is missing, unreadable, or empty"
        )

    # Evenly spaced frame positions covering the whole clip.
    idxs = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
    sampled = [frames[i] for i in idxs]
    if f:
        sampled = [cv2.flip(frame, 1) for frame in sampled]
    return sampled

# Compute the embedding of a video
def make_embedding(video_path, model, processor, f = 0):
    """Embed a video with ``model`` and return a 1-D tensor.

    The frames returned by ``load_video`` are run through ``processor`` and
    ``model``; the last entry of ``hidden_states`` has its leading axis
    squeezed and is then averaged over ``dim=1``. With the checkpoint used in
    this notebook the recorded result has shape ``(3137,)``.

    Args:
        video_path: Path of the video file to embed.
        model: Callable accepting the processor output as keyword arguments
            plus ``output_hidden_states=True`` and returning an object with a
            ``hidden_states`` sequence.
        processor: Callable turning a list of frames into model inputs that
            support ``.to(device)``.
        f: Forwarded to ``load_video``; truthy values mirror the frames.

    Returns:
        A tensor on the global ``device`` that does not track gradients.
    """
    frames = load_video(video_path, f)
    # Inference only: without no_grad() every call builds an autograd graph
    # that the returned embedding keeps alive, wasting (GPU) memory.
    with torch.no_grad():
        inputs = processor(images=frames, return_tensors="pt").to(device)
        del frames
        outputs = model(**inputs, output_hidden_states=True)
        del inputs
        return outputs.hidden_states[-1].squeeze(0).mean(dim=1)

In [18]:
# url = 'https://s3.ritm.media/yappy-db-duplicates/2fa37210-3c25-4a87-88f2-1242c2c8a699.mp4'


# file_Path = 'short.mp4'
# urllib.request.urlretrieve(url, file_Path)

# emb = make_embedding(file_Path, model, processor)
# print(emb.shape)

torch.Size([3137])


In [5]:
# Load the test set; later cells read its "uuid", "link" and "created" columns.
# NOTE(review): absolute Kaggle input path — the notebook only runs where this
# dataset is mounted at the same location.
table = pd.read_csv("/kaggle/input/vseros/test.csv")

In [6]:
# Connect to a LanceDB database located at a path relative to the working directory.
uri = "data/video-lancedb"
db = lancedb.connect(uri)

In [7]:
# Length of the vectors produced by make_embedding for the checkpoint in use.
EMBEDDING_DIM = 3137

# One row per unique video: its fixed-size float32 embedding and its uuid.
custom_schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), EMBEDDING_DIM)),
    pa.field("id", pa.string()),
])

# mode="overwrite" makes this cell re-runnable: the default mode fails when
# "emb_table" already exists, and reusing an old table would leave embeddings
# from a previous run in the index.
tbl = db.create_table("emb_table", schema=custom_schema, mode="overwrite")

In [8]:
# Seed the table with the first video so that the nearest-neighbour search
# in the main loop always has at least one stored vector to return.
file_Path = 'short.mp4'
uuid, link = table[["uuid", "link"]].values[0]
url = link
urllib.request.urlretrieve(url, file_Path)

# Move the embedding to host memory as a numpy array before storing it.
emb = make_embedding(file_Path, model, processor).detach().cpu().numpy()
data = [{"vector": emb, "id": uuid}]
tbl.add(data)

  return torch.tensor(value)


In [9]:
# Predictions for the first row, which is stored unconditionally as a
# non-duplicate: flag False and an empty "duplicate_for" id.
preds_flags, preds_uuid = [False], ['']

In [10]:
# A video whose best cosine similarity to the stored embeddings is below this
# value is treated as new (not a duplicate).
SIMILARITY_THRESHOLD = 0.4


def find_nearest(embedding):
    """Return ``(id, cosine_similarity)`` for the stored vector nearest to ``embedding``.

    ``embedding`` is a 1-D torch tensor. The table is queried for a single
    result using the cosine metric, and the similarity is recomputed with
    ``F.cosine_similarity`` against the returned vector.
    """
    query = embedding.detach().cpu().numpy()
    res = tbl.search(query).limit(1).metric("cosine").to_pandas()
    # Build the tensor from one numpy array; converting a list of arrays is slow.
    nearest = torch.tensor(np.asarray(res.vector.values[0], dtype=np.float32)).to(device)
    similarity = F.cosine_similarity(embedding, nearest, dim=0).item()
    return res.id.values[0], similarity


for uuid, link in tqdm.tqdm(table[["uuid", "link"]].values[1:]):
    file_Path = 'short.mp4'
    urllib.request.urlretrieve(link, file_Path)

    emb = make_embedding(file_Path, model, processor)
    numpy_emb = emb.cpu().detach().numpy()
    match_id, similarity = find_nearest(emb)
    del emb

    if similarity < SIMILARITY_THRESHOLD:
        # No direct match: retry with the horizontally mirrored clip. The table
        # is searched again with the mirrored embedding, because the nearest
        # neighbour of the unmirrored query need not be the video this one is
        # a mirrored copy of.
        mirror_emb = make_embedding(file_Path, model, processor, 1)
        match_id, similarity = find_nearest(mirror_emb)
        del mirror_emb

    if similarity < SIMILARITY_THRESHOLD:
        # New video: store its (unmirrored) embedding for later comparisons.
        tbl.add([{"vector": numpy_emb, "id": uuid}])
        preds_flags.append(False)
        preds_uuid.append('')
    else:
        preds_flags.append(True)
        preds_uuid.append(match_id)

100%|██████████| 999/999 [2:24:30<00:00,  8.68s/it]  


In [11]:
# Attach the predictions as new columns. Each list must hold exactly one entry
# per row of `table`, in row order; pandas raises a ValueError otherwise.
table["is_duplicate"] = preds_flags
table["duplicate_for"] = preds_uuid

In [12]:
# Use "created" as the index. The guard keeps the cell re-runnable: once
# "created" has been moved to the index it is no longer a column, and an
# unguarded second set_index('created') raises KeyError.
if 'created' in table.columns:
    table = table.set_index('created')

In [13]:
# Write the result table to res.csv in the working directory; the DataFrame
# index is written as the first column.
table.to_csv('res.csv')