In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, euclidean
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [None]:
# Load the embeddings table from parquet. Later cells read its `variantid`
# and `main_pic_embeddings_resnet_v1` columns.
resnet = 'competition/resnet.parquet'
df_resnet = pd.read_parquet(resnet)

In [None]:
# Load the training table from parquet. Later cells use its `variantid1` and
# `variantid2` columns as merge keys.
train = 'competition/train.parquet'
df_train = pd.read_parquet(train)

In [None]:
def concatenate_embeddings(embedding):
    """Return `embedding` as a NumPy array with at most one dimension.

    A Python ``list`` is joined with ``np.concatenate`` (along the first
    axis); any other input is copied into an array with ``np.array``. If the
    resulting array has more than one dimension it is flattened.

    NOTE(review): only ``list`` inputs are concatenated. A 1-D object-dtype
    ndarray that holds several arrays takes the ``np.array`` branch and is
    returned as a 1-D object array, not as one joined vector -- confirm that
    this is the intended behavior for the stored embeddings.
    """
    merged = np.concatenate(embedding) if isinstance(embedding, list) else np.array(embedding)
    # Anything above one dimension is collapsed into a single flat vector.
    return merged.flatten() if merged.ndim > 1 else merged

In [None]:
# Pass every stored embedding value through concatenate_embeddings and
# overwrite the column on df_resnet with the results.
df_resnet['main_pic_embeddings_resnet_v1'] = df_resnet['main_pic_embeddings_resnet_v1'].apply(concatenate_embeddings)

In [None]:
# Join the embeddings table onto the training pairs twice: once keyed on
# `variantid1`, once keyed on `variantid2`. Left joins keep every row of
# df_train. The inner join adds df_resnet's columns under their own names
# (assuming df_train shares no column names with df_resnet); the outer join
# then collides with those names, so `suffixes` renames the first copy to
# `*_1` and the second copy to `*_2`.
df_merged = pd.merge(
    pd.merge(
        df_train,
        df_resnet,
        how='left',
        left_on='variantid1',
        right_on='variantid',
    ),
    df_resnet,
    how='left',
    left_on='variantid2',
    right_on='variantid',
    suffixes=('_1', '_2'),
)

In [None]:
def _embedding_vectors(emb_set):
    """Normalize one cell of an embedding column to a list of vectors.

    Three layouts are handled:

    * a scalar such as ``None``/``NaN`` (no embedding matched by the left
      merge) -> empty list;
    * a flat sequence of numbers (a single embedding vector) -> a one-element
      list holding that vector;
    * a sequence of vectors -> a list with one array per vector.
    """
    if pd.api.types.is_scalar(emb_set):
        return []
    vectors = [np.asarray(vec) for vec in emb_set]
    if vectors and all(vec.ndim == 0 for vec in vectors):
        # Iterating a flat vector yields scalars, but SciPy's distance
        # functions require 1-D inputs: regroup the scalars into one vector.
        return [np.stack(vectors)]
    return vectors


def _pair_features(emb1_set, emb2_set):
    """Return (mean cosine similarity, mean Euclidean distance) for one row.

    Vectors of the two sets are paired positionally; surplus vectors on the
    longer side are ignored. ``(nan, nan)`` is returned when there is no pair
    to compare (an embedding is missing or empty).
    """
    pairs = list(zip(_embedding_vectors(emb1_set), _embedding_vectors(emb2_set)))
    if not pairs:
        return np.nan, np.nan
    # 1 - cosine distance gives the cosine similarity.
    similarities_cos = [1 - cosine(emb1, emb2) for emb1, emb2 in pairs]
    distances_euc = [euclidean(emb1, emb2) for emb1, emb2 in pairs]
    return np.mean(similarities_cos), np.mean(distances_euc)


cosine_similarities = []
euclidean_distances = []

# Walk the two embedding columns directly rather than via DataFrame.iterrows(),
# which constructs a Series for every row only to read two of its fields.
for emb1_set, emb2_set in zip(
    df_merged['main_pic_embeddings_resnet_v1_1'],
    df_merged['main_pic_embeddings_resnet_v1_2'],
):
    similarity_cos, distance_euc = _pair_features(emb1_set, emb2_set)
    cosine_similarities.append(similarity_cos)
    euclidean_distances.append(distance_euc)

# Rows without a comparable pair carry NaN in both columns; drop or impute
# them before fitting an estimator that rejects NaN inputs.
df_merged['cosine_similarity'] = cosine_similarities
df_merged['euclidean_distance'] = euclidean_distances

In [None]:
# Build the feature matrix (the two embedding-derived columns) and the target vector.
X = df_merged[['cosine_similarity', 'euclidean_distance']]
y = df_merged['target']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build the model: a logistic regression with default hyperparameters,
# fitted on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split: hard class labels plus class probabilities.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # second predict_proba column, used as the score for AUC

In [None]:
# Compute the evaluation metrics on the held-out split: the label-based
# metrics use y_pred, ROC AUC uses the probability scores in y_prob.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

# Report each metric with four decimal places.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'AUC: {auc:.4f}')