In [67]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd


def get_words(df, start_col=10):
    df_essential: pd.DataFrame = df.iloc[:, start_col:]
    df_essential = df_essential.apply(lambda x: [pairs.split("=")[0] for pairs in x if isinstance(pairs, str)], axis=1)
    df_essential = df_essential.tolist()

    words = [word for item in df_essential for word in item]
    words = list(set(words))
    return words


def check_forced_train_images(forced_train_images, train_images):
    for forced_name in forced_train_images:
        if not forced_name in train_images:
            return False
    return True


def search_abbs(df, train_indices, seed_num, numbering, is_apply, new_images, fraction):
    np.random.seed(seed_num)
    indices = np.random.choice(len(new_images), int(len(new_images) * fraction), replace=False).tolist()
    train_images = []
    test_images = []
    for idx, name in enumerate(new_images):
        if idx in indices:
            test_images.append(name)
        else:
            train_images.append(name)

    df_train = df[df["샘플폴더"].isin(train_images) | train_indices]
    df_test = df[df["샘플폴더"].isin(test_images)]

    train_abbs = get_words(df_train)
    test_abbs = get_words(df_test)

    test_rest = list(set(test_abbs) - set(train_abbs))
    tmp =  df.iloc[:, 10:].apply(lambda x: [pairs for pairs in x if isinstance(pairs, str) and pairs.split("=")[0] in test_rest], axis=1).to_list()
    tmp = [tmp.index(i) for i in tmp if len(i) > 0]
    tmp = [df.loc[i]["name"] for i in tmp]

    if is_apply:
        df.loc[df["샘플폴더"].isin(train_images), "status"] = f"train_{numbering}"
        df.loc[df["샘플폴더"].isin(test_images), "status"] = f"test_{numbering}"
    
    return len(test_rest), df, tmp


In [69]:
csv_path = '/Users/dh/Desktop/상진/BTC-500_make_json/221114_BTC-500_ver.9.0.0.csv'
new_csv_path =  '/Users/dh/Desktop/상진/split_dataset/221116_BTC-500_ver.1.0.0.csv'
log_name = "BTC-500"
numbering = "005"
fraction = 0.1

df = pd.read_csv(csv_path)
print(f"Shape of dataframe: {df.shape}")

train_indices = df["status"].str.contains("train", na=False)
new_data_indices_500 = df["status"].str.contains("BTC-500", na=False)
new_data_indices_400 = df["status"].str.contains("BTC-400", na=False)
new_data_indices = new_data_indices_400 + new_data_indices_500
new_images = df.loc[new_data_indices, "샘플폴더"].unique()
print(f"New images: {len(new_images)}")

best_seed = -1
best_rest = 10e8

for seed_num in range(0, 10000):
    # check about current seed
    curr_rest, _, test_rest = search_abbs(
        df=df,
        train_indices=train_indices,
        seed_num=seed_num,
        numbering=numbering,
        is_apply=False,
        new_images=new_images,
        fraction=fraction,
    )

    if curr_rest < best_rest:
        best_rest = curr_rest
        best_seed = seed_num
        print(f"Best seed: {best_seed}, rest: {best_rest}, test_rest: {test_rest}\n\n")

print(f"Best seed: {best_seed}, rest: {best_rest}")

# save new dataframe
_, df_new, test_rest_ = search_abbs(
    df=df,
    train_indices=train_indices,
    seed_num=best_seed,
    numbering=numbering,
    is_apply=True,
    new_images=new_images,
    fraction=fraction,
)
df_new.to_csv(new_csv_path, encoding="utf-8-sig", index=False)

Shape of dataframe: (20403, 26)
New images: 200
Best seed: 0, rest: 39, test_rest: ['1492_img-1.jpg-22', '1492_img-1.jpg-26', '1663_img-3.jpg-6', '1663_img-3.jpg-10', '1663_img-4.jpg-21', '1983_img-2.jpg-6', '1983_img-2.jpg-6', '1983_img-2.jpg-9', '1983_img-2.jpg-13', '1983_img-2.jpg-14', '1983_img-2.jpg-16', '1983_img-2.jpg-6', '1983_img-2.jpg-6', '1983_img-2.jpg-9', '1983_img-2.jpg-13', '1983_img-3.jpg-14', '1983_img-2.jpg-16', '2586_img-1.jpg-9', '2586_img-1.jpg-14', '2586_img-1.jpg-14', '375_img-1.jpg-17', '375_img-1.jpg-17', '375_img-3.jpg-8', '375_img-3.jpg-16', '375_img-3.jpg-17', '446_img-1.jpg-12', '446_img-1.jpg-13', '4965_img-2.jpg-8', '1347_img-1.jpg-5', '1347_img-1.jpg-6', '1347_img-1.jpg-28', '2107_img-1.jpg-15', '2107_img-2.jpg-11', '2107_img-2.jpg-11', '2107_img-2.jpg-29', '2517_img-1.jpg-24', '2517_img-1.jpg-31', '2517_img-1.jpg-32', '2517_img-2.jpg-4', '3242_img-1.jpg-2', '3242_img-1.jpg-4', '3242_img-1.jpg-4', '3242_img-1.jpg-4', '3323_img-1.jpg-3', '4330_img-1.jpg-2

In [71]:
df_tmp = df.iloc[:, 10:]
df_tmp = df_tmp.apply(lambda x: [pairs.split("=")[0] for pairs in x if isinstance(pairs, str)], axis=1)

In [75]:
test_rest

['1983_img-2.jpg-6',
 '1983_img-2.jpg-6',
 '1983_img-2.jpg-9',
 '1983_img-2.jpg-13',
 '1983_img-2.jpg-14',
 '1983_img-2.jpg-16',
 '1983_img-2.jpg-6',
 '1983_img-2.jpg-6',
 '1983_img-2.jpg-9',
 '1983_img-2.jpg-13',
 '1983_img-3.jpg-14',
 '1983_img-2.jpg-16',
 '201_img-1.jpg-9',
 '2136_img-2.jpg-5',
 '5049_img-2.jpg-9',
 '1764_img-1.jpg-7',
 '1764_img-1.jpg-7',
 '1764_img-1.jpg-14',
 '1764_img-1.jpg-22',
 '1764_img-1.jpg-7',
 '1764_img-2.jpg-8',
 '1764_img-1.jpg-7',
 '1764_img-1.jpg-7',
 '2523_img-1.jpg-2',
 '2523_img-1.jpg-2',
 '3098_img-1.jpg-3',
 '3426_img-1.jpg-5',
 '3559_img-1.jpg-4',
 '3559_img-1.jpg-8',
 '3559_img-1.jpg-15',
 '3559_img-1.jpg-16',
 '3559_img-1.jpg-17',
 '3559_img-1.jpg-18',
 '3559_img-1.jpg-4',
 '3559_img-1.jpg-4',
 '4457_img-1.jpg-5',
 '4457_img-2.jpg-6',
 '4457_img-2.jpg-6',
 '4457_img-2.jpg-6',
 '458_img-1.jpg-15',
 '4457_img-2.jpg-6',
 '4457_img-2.jpg-6',
 '4457_img-2.jpg-6',
 '4457_img-2.jpg-6',
 '458_img-4.jpg-11',
 '458_img-1.jpg-15',
 '4457_img-2.jpg-6']