In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, euclidean, cityblock, chebyshev
from joblib import Parallel, delayed

In [2]:
resnet = '/Users/annapetrov/Desktop/ozon_хакатон/competition/competition/resnet.parquet'
df_resnet = pd.read_parquet(resnet)

In [3]:
train = '/Users/annapetrov/Desktop/ozon_хакатон/competition/competition/train.parquet'
df_train = pd.read_parquet(train)

In [9]:
attributes = '/Users/annapetrov/Desktop/ozon_хакатон/competition/competition/attributes.parquet'
df_attributes = pd.read_parquet(attributes)

In [11]:
import re
import json

def get_dict() ->dict:
    '''
    Получение словаря из файла
    '''
    r_d= {}
    with open('data/characteristic.csv', 'r') as f:
        for line in f:
            line_l = line.strip().split(';')
            r_d[line_l[0]] = line_l[2]
    return r_d

d = get_dict()

def process_characteristics(attr_mapping_str, code_mapping):
    try:
        attr_dict = json.loads(attr_mapping_str)
        result = []
        
        for attr, values in attr_dict.items():
            if attr in code_mapping:
                for value in values:
                    numbers = re.findall(r'\d+[,\.]?\d*', value)
                    for number in numbers:
                        clean_number = re.sub(r'[^\d]', '', number)
                        result.append(f"{code_mapping[attr]}_{clean_number}")
        
        return ' '.join(result)
    
    except json.JSONDecodeError:
        return ""


In [12]:
df_attributes['processed_characteristics'] = df_attributes['characteristic_attributes_mapping'].apply(process_characteristics, code_mapping=d)

df_train = df_train.merge(df_attributes[['variantid', 'processed_characteristics']], left_on='variantid1', right_on='variantid', how='left')
df_train = df_train.rename(columns={'processed_characteristics': 'characteristics_1'})

df_train = df_train.merge(df_attributes[['variantid', 'processed_characteristics']], left_on='variantid2', right_on='variantid', how='left')
df_train = df_train.rename(columns={'processed_characteristics': 'characteristics_2'})

df_train = df_train.drop(columns=['variantid_x', 'variantid_y'])

In [4]:
# Функция для объединения нескольких массивов в один и проверки формы
def concatenate_embeddings(embedding):
    if isinstance(embedding, list):
        concatenated = np.concatenate(embedding)
    else:
        concatenated = np.array(embedding)
    
    if concatenated.ndim > 1:  # Преобразуем в одномерный массив
        concatenated = concatenated.flatten()
    
    return concatenated

In [5]:
# Обработаем основной эмбеддинг
df_resnet['main_pic_embeddings_resnet_v1'] = df_resnet['main_pic_embeddings_resnet_v1'].apply(concatenate_embeddings)
df_resnet['pic_embeddings_resnet_v1'] = df_resnet['pic_embeddings_resnet_v1'].apply(
    lambda x: concatenate_embeddings(x) if x is not None else np.array([]))

In [6]:
# Объединение данных
df_merged = df_train.merge(df_resnet, left_on='variantid1', right_on='variantid', how='left')
df_merged = df_merged.merge(df_resnet, left_on='variantid2', right_on='variantid', how='left', suffixes=('_1', '_2'))

In [7]:
features = {
    'cosine_similarity_mean': [],
    'euclidean_distance_mean': [],
    'manhattan_distance_mean': [],
    'chebyshev_distance_mean': [],
    'cosine_similarity_std': [],
    'euclidean_distance_std': [],
    'manhattan_distance_std': [],
    'chebyshev_distance_std': [],
    'cosine_similarity_min': [],
    'euclidean_distance_min': [],
    'manhattan_distance_min': [],
    'chebyshev_distance_min': [],
    'cosine_similarity_max': [],
    'euclidean_distance_max': [],
    'manhattan_distance_max': [],
    'chebyshev_distance_max': []
}

In [8]:
for idx, row in df_merged.iterrows():
    emb1_set = row['main_pic_embeddings_resnet_v1_1']
    emb2_set = row['main_pic_embeddings_resnet_v1_2']

    # Объединяем основной и дополнительный эмбеддинги
    if len(row['pic_embeddings_resnet_v1_1']) > 0:
        emb1_set = np.concatenate([emb1_set, row['pic_embeddings_resnet_v1_1']])
    if len(row['pic_embeddings_resnet_v1_2']) > 0:
        emb2_set = np.concatenate([emb2_set, row['pic_embeddings_resnet_v1_2']])

    similarities_cos = []
    distances_euc = []
    distances_man = []
    distances_cheb = []

    for emb1, emb2 in zip(emb1_set, emb2_set):
        similarities_cos.append(1 - cosine(emb1, emb2))
        distances_euc.append(euclidean(emb1, emb2))
        distances_man.append(cityblock(emb1, emb2))
        distances_cheb.append(chebyshev(emb1, emb2))

    # Средние значения
    features['cosine_similarity_mean'].append(np.mean(similarities_cos))
    features['euclidean_distance_mean'].append(np.mean(distances_euc))
    features['manhattan_distance_mean'].append(np.mean(distances_man))
    features['chebyshev_distance_mean'].append(np.mean(distances_cheb))

    # Стандартное отклонение
    features['cosine_similarity_std'].append(np.std(similarities_cos))
    features['euclidean_distance_std'].append(np.std(distances_euc))
    features['manhattan_distance_std'].append(np.std(distances_man))
    features['chebyshev_distance_std'].append(np.std(distances_cheb))

    # Минимальные значения
    features['cosine_similarity_min'].append(np.min(similarities_cos))
    features['euclidean_distance_min'].append(np.min(distances_euc))
    features['manhattan_distance_min'].append(np.min(distances_man))
    features['chebyshev_distance_min'].append(np.min(distances_cheb))

    # Максимальные значения
    features['cosine_similarity_max'].append(np.max(similarities_cos))
    features['euclidean_distance_max'].append(np.max(distances_euc))
    features['manhattan_distance_max'].append(np.max(distances_man))
    features['chebyshev_distance_max'].append(np.max(distances_cheb))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

all_characteristics = df_train['characteristics_1'].tolist() + df_train['characteristics_2'].tolist()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_characteristics)


tfidf_1 = tfidf_vectorizer.transform(df_train['characteristics_1'])
tfidf_2 = tfidf_vectorizer.transform(df_train['characteristics_2'])


from sklearn.metrics.pairwise import cosine_similarity

df_train['similarity'] = [cosine_similarity(tfidf_1[i], tfidf_2[i])[0][0] for i in range(len(df_train))]


from sklearn.model_selection import train_test_split


def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if len(union) > 0 else 0.0

df_train['jaccard_similarity'] = df_train.apply(lambda row: jaccard_similarity(row['characteristics_1'], row['characteristics_2']), axis=1)

df_train['difference'] = (tfidf_1 - tfidf_2).power(2).sum(axis=1)

from sklearn.metrics.pairwise import manhattan_distances

df_train['manhattan_distance'] = [manhattan_distances(tfidf_1[i], tfidf_2[i])[0][0] for i in range(len(df_train))]



In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

In [18]:
X_1 = pd.DataFrame(features)
y_1 = df_merged['target']

In [19]:
X_2 = df_train[['similarity', 'manhattan_distance', 'jaccard_similarity']]
y_2 = df_train['target']

In [20]:
# Разделение на обучающую и тестовую выборки
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size=0.2, random_state=42)
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.2, random_state=42)  # Используем тот же y_train и y_test

In [21]:
# Первая модель - на основе первых признаков
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
model_1.fit(X_1_train, y_1_train)

In [22]:
# Вторая модель - на основе других признаков
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(X_2_train, y_2_train)

In [23]:
# Создание ансамбля через множественное голосование
ensemble_model = VotingClassifier(
    estimators=[
        ('model_1', model_1), 
        ('model_2', model_2)
    ],
    voting='hard'
)

In [24]:
# Обучение ансамбля
ensemble_model.fit(np.hstack((X_1_train, X_2_train)), y_1_train)

In [25]:
# Прогнозирование с помощью ансамбля
y_pred = ensemble_model.predict(np.hstack((X_1_test, X_2_test)))

In [27]:
# Вычисление метрик
accuracy = accuracy_score(y_1_test, y_pred)
precision = precision_score(y_1_test, y_pred)
recall = recall_score(y_1_test, y_pred)
roc_auc = roc_auc_score(y_1_test, ensemble_model.predict_proba(np.hstack((X_1_test, X_2_test)))[:, 1])

# Вывод метрик
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
# print("AUC:", roc_auc)

Accuracy: 0.7880951973436484
Precision: 0.7950966020147346
Recall: 0.7532427066919496


In [29]:
import joblib

In [30]:
# Сохранение модели в файл
joblib.dump(ensemble_model, 'ensemble_model.pkl')

['ensemble_model.pkl']

In [None]:
# Загрузка модели из файла
loaded_model = joblib.load('ensemble_model.pkl')

# Использование загруженной модели для предсказаний
y_pred = loaded_model.predict(np.hstack((X_1_test, X_2_test)))

## Вторая модель с voting soft

In [31]:
# Данные для первой модели
X_1 = pd.DataFrame(features)  # Объект X для первой модели
y = df_merged['target']  # Целевая переменная

In [32]:
X_2 = df_train[['similarity', 'manhattan_distance', 'jaccard_similarity']]  # Объект X для второй модели

In [33]:
# Разделение на обучающую и тестовую выборки
X_1_train, X_1_test, y_train, y_test = train_test_split(X_1, y, test_size=0.2, random_state=42)
X_2_train, X_2_test, _, _ = train_test_split(X_2, y, test_size=0.2, random_state=42)  # Используем тот же y_train и y_test

In [34]:
# Первая модель - на основе первых признаков
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
model_1.fit(X_1_train, y_train)

# Вторая модель - на основе других признаков
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(X_2_train, y_train)

In [35]:
# Создание ансамбля через множественное голосование с использованием soft voting
ensemble_model = VotingClassifier(
    estimators=[
        ('model_1', model_1), 
        ('model_2', model_2)
    ],
    voting='soft'  # используем soft voting для предсказания вероятностей
)

In [36]:
# Обучение ансамбля
ensemble_model.fit(np.hstack((X_1_train, X_2_train)), y_train)

In [37]:
# Прогнозирование с помощью ансамбля
y_pred = ensemble_model.predict(np.hstack((X_1_test, X_2_test)))

In [38]:
# Прогнозирование вероятностей для расчета AUC
y_pred_proba = ensemble_model.predict_proba(np.hstack((X_1_test, X_2_test)))[:, 1]

In [39]:
# Вычисление метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Вывод метрик
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", roc_auc)

Accuracy: 0.7880951973436484
Precision: 0.7950966020147346
Recall: 0.7532427066919496
AUC: 0.8650401735984847
