In [1]:
# !pip install -r ../requirements.txt

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

# ASR inference results, one Hub dataset per model; only the "train" split is read.
parakeet_ds, gigaam_ds, tone_ds = (
    load_dataset(repo_id, split="train")
    for repo_id in (
        "vambassa/parakeet_inference",
        "vambassa/gigaam_inference",
        "vambassa/t-one_inference",
    )
)

# Source dataset that the inference rows are later joined with on IMAGE_ID.
raw_ds = load_dataset("arood0/mmm_project_with_audio_ru_final", split="train")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 24.0/24.0 [00:00<00:00, 116B/s]
Downloading data: 100%|██████████| 116k/116k [00:00<00:00, 536kB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 49555.80 examples/s]
Downloading readme: 100%|██████████| 24.0/24.0 [00:00<00:00, 91.7B/s]
Downloading data: 100%|██████████| 116k/116k [00:00<00:00, 430kB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 107076.77 examples/s]
Downloading readme: 100%|██████████| 24.0/24.0 [00:00<00:00, 116B/s]
Downloading data: 100%|██████████| 115k/115k [00:00<00:00, 541kB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 111731.91 examples/s]


### Найдём общие IMAGE_ID во всех трёх датасетах и возьмём первые 100 (после сортировки)

In [3]:
# Gather the IMAGE_ID values of each inference dataset as a set, then intersect them.
id_sets = [set(ds["IMAGE_ID"]) for ds in (parakeet_ds, gigaam_ds, tone_ds)]
parakeet_ids, gigaam_ids, tone_ids = id_sets

common_ids = set.intersection(*id_sets)
print(f"Общих IMAGE_ID: {len(common_ids)}")

# Sorting before slicing makes the choice of the 100 IDs deterministic.
common_ids_list = sorted(common_ids)[:100]
common_ids_set = set(common_ids_list)
print(f"Используем {len(common_ids_list)} сэмплов")


Общих IMAGE_ID: 1000
Используем 100 сэмплов


In [4]:
def _only_common(ds):
    """Convert `ds` to a DataFrame and keep the rows whose IMAGE_ID is in common_ids_set."""
    frame = ds.to_pandas()
    return frame[frame["IMAGE_ID"].isin(common_ids_set)]


parakeet_df = _only_common(parakeet_ds)
gigaam_df = _only_common(gigaam_ds)
tone_df = _only_common(tone_ds)
raw_df = _only_common(raw_ds)

# Row counts of the four filtered frames, displayed as the cell result.
len(parakeet_df), len(gigaam_df), len(tone_df), len(raw_df)

(100, 100, 100, 100)

In [5]:
def _hypothesis_frame(model_df, model_key):
    """Return IMAGE_ID plus hypothesis_raw renamed to hypothesis_<model_key>."""
    renamed = {"hypothesis_raw": f"hypothesis_{model_key}"}
    return model_df[["IMAGE_ID", "hypothesis_raw"]].rename(columns=renamed)


parakeet_hyp = _hypothesis_frame(parakeet_df, "parakeet")
gigaam_hyp = _hypothesis_frame(gigaam_df, "gigaam")
tone_hyp = _hypothesis_frame(tone_df, "tone")

# Inner joins on IMAGE_ID: a row survives only if all three models have a hypothesis for it.
merged_df = (
    raw_df
    .merge(parakeet_hyp, on="IMAGE_ID", how="inner")
    .merge(gigaam_hyp, on="IMAGE_ID", how="inner")
    .merge(tone_hyp, on="IMAGE_ID", how="inner")
)


In [6]:
import sys
import os

# Put the parent of the current working directory at the front of sys.path
# (unless it is already listed) so the `src` package below can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.models.image_editor import DiffusionImageEditor


In [7]:
# Settings passed unchanged as keyword arguments to DiffusionImageEditor.
# NOTE(review): the meaning of each knob is defined in src.models.image_editor — confirm there.
editor_settings = {
    "model_type": "instruct_pix2pix",
    "model_name": "timbrooks/instruct-pix2pix",
    "device": "cuda",
    "num_inference_steps": 40,
    "strength": 0.75,
    "image_guidance_scale": 1.2,
    "guidance_scale": 15,
    "max_side": 1024,
}
editor = DiffusionImageEditor(**editor_settings)


Инициализация DiffusionImageEditor: instruct_pix2pix


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!
Loading pipeline components...: 100%|██████████| 6/6 [00:01<00:00,  5.96it/s]


Модель загружена: timbrooks/instruct-pix2pix


In [8]:
from PIL import Image
from io import BytesIO
from datasets import Dataset, Image as HFImage
from tqdm import tqdm

image_feature = HFImage()

# Wrap the merged frame as a Dataset so that iterating yields one dict per row.
result_df = Dataset.from_pandas(merged_df, preserve_index=False)

rows_parakeet = []
rows_gigaam = []
rows_tone = []

# Display label -> (instruction column of merged_df, list that collects this model's rows).
# Dict order fixes the per-image generation order: Parakeet, GigaAM, T-One.
generation_targets = {
    "Parakeet": ("hypothesis_parakeet", rows_parakeet),
    "GigaAM": ("hypothesis_gigaam", rows_gigaam),
    "T-One": ("hypothesis_tone", rows_tone),
}

for el in tqdm(result_df, desc="Generating images"):
    raw_image = el["INPUT_IMG"]
    # A {"bytes": ...} mapping is decoded with PIL; any other value is passed through as-is.
    if isinstance(raw_image, dict) and "bytes" in raw_image:
        input_img = Image.open(BytesIO(raw_image["bytes"]))
    else:
        input_img = raw_image

    image_id = el["IMAGE_ID"]

    # One edit per ASR model: that model's transcription is used as the edit instruction.
    for hypothesis_column, rows in generation_targets.values():
        edited_img = editor.edit(image=input_img, instruction=el[hypothesis_column])
        rows.append({
            "IMAGE_ID": image_id,
            "result_image": image_feature.encode_example(edited_img),
        })

print("\nGenerated images:")
for label, (_, rows) in generation_targets.items():
    print(f"  {label}: {len(rows)}")


100%|██████████| 40/40 [00:25<00:00,  1.60it/s]0<?, ?it/s]
100%|██████████| 40/40 [00:25<00:00,  1.59it/s]
100%|██████████| 40/40 [00:25<00:00,  1.59it/s]
100%|██████████| 40/40 [00:03<00:00, 10.46it/s]9<2:11:22, 79.62s/it]
100%|██████████| 40/40 [00:03<00:00, 10.47it/s]
100%|██████████| 40/40 [00:03<00:00, 10.48it/s]
100%|██████████| 40/40 [00:03<00:00, 11.72it/s]1<1:05:15, 39.95s/it]
100%|██████████| 40/40 [00:03<00:00, 11.73it/s]
100%|██████████| 40/40 [00:03<00:00, 11.73it/s]
100%|██████████| 40/40 [00:03<00:00, 10.34it/s]2<43:06, 26.66s/it]  
100%|██████████| 40/40 [00:03<00:00, 10.30it/s]
100%|██████████| 40/40 [00:03<00:00, 10.30it/s]
100%|██████████| 40/40 [00:03<00:00, 11.73it/s]4<33:35, 20.99s/it]
100%|██████████| 40/40 [00:03<00:00, 11.70it/s]
100%|██████████| 40/40 [00:03<00:00, 11.72it/s]
100%|██████████| 40/40 [00:06<00:00,  6.42it/s]6<27:33, 17.41s/it]
100%|██████████| 40/40 [00:06<00:00,  6.43it/s]
100%|██████████| 40/40 [00:06<00:00,  6.42it/s]
100%|██████████| 40/40 [


Generated images:
  Parakeet: 100
  GigaAM: 100
  T-One: 100





In [9]:
def _to_image_dataset(rows):
    """Build a Dataset from row dicts and cast its result_image column to image_feature."""
    dataset = Dataset.from_pandas(pd.DataFrame(rows), preserve_index=False)
    return dataset.cast_column("result_image", image_feature)


ds_parakeet = _to_image_dataset(rows_parakeet)
ds_gigaam = _to_image_dataset(rows_gigaam)
ds_tone = _to_image_dataset(rows_tone)

In [None]:
import os

from huggingface_hub import login

# Do not hard-code an access token in the notebook source: read it from the
# HF_TOKEN environment variable. If the variable is unset or empty, token=None
# makes login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [14]:
# Upload the Parakeet result dataset; the returned CommitInfo is shown as the cell output.
ds_parakeet.push_to_hub(repo_id="gab1k/mmm_project_parakeet")

Map: 100%|██████████| 100/100 [00:00<00:00, 502.81 examples/s]it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.47ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/gab1k/mmm_project_parakeet/commit/c0a3261ae1b6951bb835e622611f8e55ea3876e5', commit_message='Upload dataset', commit_description='', oid='c0a3261ae1b6951bb835e622611f8e55ea3876e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/gab1k/mmm_project_parakeet', endpoint='https://huggingface.co', repo_type='dataset', repo_id='gab1k/mmm_project_parakeet'), pr_revision=None, pr_num=None)

In [15]:
# Upload the GigaAM result dataset; the returned CommitInfo is shown as the cell output.
ds_gigaam.push_to_hub(repo_id="gab1k/mmm_project_gigaam")

Map: 100%|██████████| 100/100 [00:00<00:00, 488.07 examples/s]it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.49ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.56s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/gab1k/mmm_project_gigaam/commit/e7d6efde69018199e2ee2dae31cf24d3183f8a7b', commit_message='Upload dataset', commit_description='', oid='e7d6efde69018199e2ee2dae31cf24d3183f8a7b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/gab1k/mmm_project_gigaam', endpoint='https://huggingface.co', repo_type='dataset', repo_id='gab1k/mmm_project_gigaam'), pr_revision=None, pr_num=None)

In [16]:
# Upload the T-One result dataset; the returned CommitInfo is shown as the cell output.
ds_tone.push_to_hub(repo_id="gab1k/mmm_project_tone")

Map: 100%|██████████| 100/100 [00:00<00:00, 498.28 examples/s]it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.51ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/gab1k/mmm_project_tone/commit/cc41a87b983645ae47b5d1f3b7e1d1bb05432a9a', commit_message='Upload dataset', commit_description='', oid='cc41a87b983645ae47b5d1f3b7e1d1bb05432a9a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/gab1k/mmm_project_tone', endpoint='https://huggingface.co', repo_type='dataset', repo_id='gab1k/mmm_project_tone'), pr_revision=None, pr_num=None)