In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, euclidean, cityblock, chebyshev
from joblib import Parallel, delayed

In [2]:
resnet = '/Users/annapetrov/Desktop/ozon_хакатон/competition/competition/resnet.parquet'
df_resnet = pd.read_parquet(resnet)

In [3]:
train = '/Users/annapetrov/Desktop/ozon_хакатон/competition/competition/train.parquet'
df_train = pd.read_parquet(train)

In [4]:
# Функция для объединения нескольких массивов в один и проверки формы
def concatenate_embeddings(embedding):
    if isinstance(embedding, list):
        concatenated = np.concatenate(embedding)
    else:
        concatenated = np.array(embedding)
    
    if concatenated.ndim > 1:  # Преобразуем в одномерный массив
        concatenated = concatenated.flatten()
    
    return concatenated

In [5]:
# Обработаем основной эмбеддинг
df_resnet['main_pic_embeddings_resnet_v1'] = df_resnet['main_pic_embeddings_resnet_v1'].apply(concatenate_embeddings)
df_resnet['pic_embeddings_resnet_v1'] = df_resnet['pic_embeddings_resnet_v1'].apply(
    lambda x: concatenate_embeddings(x) if x is not None else np.array([]))

In [6]:
# Объединение данных
df_merged = df_train.merge(df_resnet, left_on='variantid1', right_on='variantid', how='left')
df_merged = df_merged.merge(df_resnet, left_on='variantid2', right_on='variantid', how='left', suffixes=('_1', '_2'))

In [7]:
features = {
    'cosine_similarity_mean': [],
    'euclidean_distance_mean': [],
    'manhattan_distance_mean': [],
    'chebyshev_distance_mean': [],
    'cosine_similarity_std': [],
    'euclidean_distance_std': [],
    'manhattan_distance_std': [],
    'chebyshev_distance_std': [],
    'cosine_similarity_min': [],
    'euclidean_distance_min': [],
    'manhattan_distance_min': [],
    'chebyshev_distance_min': [],
    'cosine_similarity_max': [],
    'euclidean_distance_max': [],
    'manhattan_distance_max': [],
    'chebyshev_distance_max': []
}

In [8]:
for idx, row in df_merged.iterrows():
    emb1_set = row['main_pic_embeddings_resnet_v1_1']
    emb2_set = row['main_pic_embeddings_resnet_v1_2']

    # Объединяем основной и дополнительный эмбеддинги
    if len(row['pic_embeddings_resnet_v1_1']) > 0:
        emb1_set = np.concatenate([emb1_set, row['pic_embeddings_resnet_v1_1']])
    if len(row['pic_embeddings_resnet_v1_2']) > 0:
        emb2_set = np.concatenate([emb2_set, row['pic_embeddings_resnet_v1_2']])

    similarities_cos = []
    distances_euc = []
    distances_man = []
    distances_cheb = []

    for emb1, emb2 in zip(emb1_set, emb2_set):
        similarities_cos.append(1 - cosine(emb1, emb2))
        distances_euc.append(euclidean(emb1, emb2))
        distances_man.append(cityblock(emb1, emb2))
        distances_cheb.append(chebyshev(emb1, emb2))

    # Средние значения
    features['cosine_similarity_mean'].append(np.mean(similarities_cos))
    features['euclidean_distance_mean'].append(np.mean(distances_euc))
    features['manhattan_distance_mean'].append(np.mean(distances_man))
    features['chebyshev_distance_mean'].append(np.mean(distances_cheb))

    # Стандартное отклонение
    features['cosine_similarity_std'].append(np.std(similarities_cos))
    features['euclidean_distance_std'].append(np.std(distances_euc))
    features['manhattan_distance_std'].append(np.std(distances_man))
    features['chebyshev_distance_std'].append(np.std(distances_cheb))

    # Минимальные значения
    features['cosine_similarity_min'].append(np.min(similarities_cos))
    features['euclidean_distance_min'].append(np.min(distances_euc))
    features['manhattan_distance_min'].append(np.min(distances_man))
    features['chebyshev_distance_min'].append(np.min(distances_cheb))

    # Максимальные значения
    features['cosine_similarity_max'].append(np.max(similarities_cos))
    features['euclidean_distance_max'].append(np.max(distances_euc))
    features['manhattan_distance_max'].append(np.max(distances_man))
    features['chebyshev_distance_max'].append(np.max(distances_cheb))

In [9]:
# Создание признаков и целевой переменной
X = pd.DataFrame(features)
y = df_merged['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from catboost import CatBoostClassifier

# Используем CatBoost без подбора гиперпараметров
model = CatBoostClassifier(random_state=42, verbose=0)
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x5ef7f97c0>

In [11]:
# Прогнозирование и оценка модели
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [12]:
# Вывод метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'AUC: {auc:.4f}')

Accuracy: 0.7230
Precision: 0.6912
Recall: 0.7658
AUC: 0.7873


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam



In [16]:
# Нормализуем признаки
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
# Разделение данных на тренировочные и тестовые
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [18]:
# Построение модели
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),  # Входной слой с 64 нейронами
    Dense(32, activation='relu'),  # Скрытый слой с 32 нейронами
    Dense(1, activation='sigmoid')  # Выходной слой с 1 нейроном для бинарной классификации
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
# Компиляция модели
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [20]:
# Обучение модели
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 474us/step - accuracy: 0.6970 - loss: 0.5756 - val_accuracy: 0.7057 - val_loss: 0.5666
Epoch 2/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 446us/step - accuracy: 0.7049 - loss: 0.5675 - val_accuracy: 0.7081 - val_loss: 0.5653
Epoch 3/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 442us/step - accuracy: 0.7063 - loss: 0.5666 - val_accuracy: 0.7105 - val_loss: 0.5629
Epoch 4/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 434us/step - accuracy: 0.7092 - loss: 0.5648 - val_accuracy: 0.7110 - val_loss: 0.5629
Epoch 5/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 466us/step - accuracy: 0.7091 - loss: 0.5649 - val_accuracy: 0.7125 - val_loss: 0.5616
Epoch 6/20
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 436us/step - accuracy: 0.7111 - loss: 0.5631 - val_accuracy: 0.7120 - val

<keras.src.callbacks.history.History at 0x5ff3b4a30>

In [21]:
# Прогнозирование и оценка модели
y_pred = (model.predict(X_test) > 0.5).astype("int32")
y_prob = model.predict(X_test)

[1m7304/7304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 253us/step
[1m7304/7304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 254us/step


In [22]:
# Вывод метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'AUC: {auc:.4f}')

Accuracy: 0.7174
Precision: 0.6821
Recall: 0.7719
AUC: 0.7795


In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import AUC
import keras_tuner as kt

In [47]:
# Подготовка данных
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [48]:
# Разделение данных на тренировочные и тестовые
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [49]:
# Определение функции для построения модели
def build_model(hp):
    model = Sequential()
    model.add(Dense(
        units=hp.Int('units_1', min_value=32, max_value=256, step=32),
        activation='relu',
        kernel_regularizer=l2(hp.Float('l2_reg_1', min_value=0.0001, max_value=0.01, sampling='LOG'))
    ))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1)))
    
    model.add(Dense(
        units=hp.Int('units_2', min_value=32, max_value=128, step=32),
        activation='relu',
        kernel_regularizer=l2(hp.Float('l2_reg_2', min_value=0.0001, max_value=0.01, sampling='LOG'))
    ))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1)))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=0.0001, max_value=0.01, sampling='LOG')),
                  loss='binary_crossentropy',
                  metrics=['accuracy', AUC(name='auc')])
    
    return model

In [50]:
# Инициализация тюнера
tuner = kt.Hyperband(
    build_model,
    objective='val_auc',
    max_epochs=30,
    factor=3,
    directory='tuner_dir',
    project_name='hyperparameter_tuning'
)

In [51]:
# Ранняя остановка для предотвращения переобучения
early_stopping = EarlyStopping(monitor='val_auc', patience=10, mode='max', restore_best_weights=True)

In [52]:
# Поиск гиперпараметров
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stopping], verbose=1)

Trial 90 Complete [00h 03m 20s]
val_auc: 0.7590652704238892

Best val_auc So Far: 0.7736082077026367
Total elapsed time: 02h 28m 37s


In [53]:
# Вывод лучших гиперпараметров
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
Лучшие гиперпараметры:
- units_1: {best_hps.get('units_1')}
- units_2: {best_hps.get('units_2')}
- l2_reg_1: {best_hps.get('l2_reg_1')}
- l2_reg_2: {best_hps.get('l2_reg_2')}
- dropout_1: {best_hps.get('dropout_1')}
- dropout_2: {best_hps.get('dropout_2')}
- learning_rate: {best_hps.get('learning_rate')}
""")


Лучшие гиперпараметры:
- units_1: 96
- units_2: 64
- l2_reg_1: 0.0001611601449550487
- l2_reg_2: 0.0004377992596146039
- dropout_1: 0.2
- dropout_2: 0.5
- learning_rate: 0.00010469611294992556



In [54]:
# Обучение модели с лучшими гиперпараметрами
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stopping], verbose=1)

Epoch 1/50
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 685us/step - accuracy: 0.6580 - auc: 0.7112 - loss: 0.6852 - val_accuracy: 0.7027 - val_auc: 0.7643 - val_loss: 0.5842
Epoch 2/50
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 669us/step - accuracy: 0.6973 - auc: 0.7554 - loss: 0.5906 - val_accuracy: 0.7048 - val_auc: 0.7662 - val_loss: 0.5748
Epoch 3/50
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 662us/step - accuracy: 0.6979 - auc: 0.7554 - loss: 0.5848 - val_accuracy: 0.7054 - val_auc: 0.7668 - val_loss: 0.5715
Epoch 4/50
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 652us/step - accuracy: 0.7005 - auc: 0.7587 - loss: 0.5801 - val_accuracy: 0.7064 - val_auc: 0.7684 - val_loss: 0.5703
Epoch 5/50
[1m23371/23371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 659us/step - accuracy: 0.7012 - auc: 0.7592 - loss: 0.5789 - val_accuracy: 0.7085 - val_auc: 0.7695 - val_loss: 0.

<keras.src.callbacks.history.History at 0x6ad916fa0>

In [55]:
# Прогнозирование и оценка модели
y_pred = (best_model.predict(X_test) > 0.5).astype("int32")
y_prob = best_model.predict(X_test)

[1m7304/7304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 298us/step
[1m7304/7304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 288us/step


In [56]:
# Вывод метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'AUC: {auc:.4f}')

Accuracy: 0.7146
Precision: 0.6778
Recall: 0.7741
AUC: 0.7759


In [9]:
# Создание признаков и целевой переменной
X = pd.DataFrame(features)
y = df_merged['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier
# Используем LightGBM без подбора гиперпараметров
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [11]:
# Прогнозирование и оценка модели
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [12]:
# Вывод метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'AUC: {auc:.4f}')

Accuracy: 0.7269
Precision: 0.6991
Recall: 0.7582
AUC: 0.7927
