# Базовый пайплайн для соревнования по определению контрафакта

## 1. Загрузка данных

В данном случае качество работы модели будет продемонстрировано на заранее отделённой части от тренировочного датафрейма

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

starting_df = pd.read_csv('df_train_w_res&sents.csv', index_col=0)

X = starting_df.drop('resolution',axis=1)
y = starting_df['resolution']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, stratify=y, random_state=42)

df_train = X_train.copy()
df_test = X_test.copy()

df_train['resolution'] = y_train
df_test['resolution'] = y_test

del X_train, X_test, y_train, y_test



print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(f"Target distribution in train:")
print(df_train['resolution'].value_counts(normalize=True))
print(f"Target distribution in test:")
print(df_test['resolution'].value_counts(normalize=True))

print()
pd.set_option('display.max_columns', None)

Train shape: (147898, 47)
Test shape: (49300, 47)
Target distribution in train:
resolution
0    0.933812
1    0.066188
Name: proportion, dtype: float64
Target distribution in test:
resolution
0    0.933813
1    0.066187
Name: proportion, dtype: float64



## 2. Чистка данных

In [3]:
# почистим столбцы с медиа данными

def clean_df_mediadata(df):
    df['videos_published_count'] = df['videos_published_count'].fillna(0)
    df['comments_published_count'] = df['comments_published_count'].fillna(0)
    df['photos_published_count'] = df['photos_published_count'].fillna(0)
    df['description'] = df['description'].fillna('')
    return df

df_train1 = clean_df_mediadata(df_train)
df_test1 = clean_df_mediadata(df_test)

In [4]:
# заменим редкие значения в столбце CommercialTypeName4, во избежание переобучения

def replace_rare_names(df_train, df_test):
    df = pd.concat([df_train.drop('resolution', axis=1), df_test])

    # Создаем список типов с малым количеством наблюдений
    threshold = 20
    type_counts = df['CommercialTypeName4'].value_counts()
    rare_types = type_counts[type_counts <= threshold].index.tolist()

    df_train.loc[df_train['CommercialTypeName4'].isin(rare_types), 'CommercialTypeName4'] = 'OtherTypeName'
    df_test.loc[df_test['CommercialTypeName4'].isin(rare_types), 'CommercialTypeName4'] = 'OtherTypeName'

    return df_train, df_test

df_train1, df_test1 = replace_rare_names(df_train, df_test)

df_test1.loc[df_test1['CommercialTypeName4']=='OtherTypeName']

Unnamed: 0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,item_count_fake_returns7,item_count_fake_returns30,item_count_fake_returns90,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,GmvTotal7,GmvTotal30,GmvTotal90,ExemplarAcceptedCountTotal7,ExemplarAcceptedCountTotal30,ExemplarAcceptedCountTotal90,OrderAcceptedCountTotal7,OrderAcceptedCountTotal30,OrderAcceptedCountTotal90,ExemplarReturnedCountTotal7,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,index,res,positive_score,resolution
15605,Thuraya,Thuraya XT-LITE это бюджетная модель спутников...,Спутниковый телефон Thuraya XT-Lite с внешней ...,OtherTypeName,,,,,,0.0,0.0,0.0,1156.938655,176,0,0,0,0,0,2,0,0,0,,690.644248,1297.611674,,1.0,33.0,,1.0,19.0,,0.0,0.0,,0.000000,0.000000,1.0,1.0,910.0,201367,10262,412950,1691.0x1725.0,0.781134,0
38828,Arylic,Инновационный мультирум стерео усилитель Aryli...,Arylic H50 мультирум медиаплеер с усилителем,OtherTypeName,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,1036.030270,249,0,0,0,0,0,0,0,0,0,1361.270096,1582.156612,1666.393272,28.0,196.0,504.0,27.0,193.0,495.0,3.0,8.0,18.0,1124.365624,1254.920297,1328.671738,1.0,1.0,1026.0,20927,4305,41878,1920.0x1080.0,0.154720,0
64236,,Картхолдер MagSafe - это стильный и удобный ак...,Кардхолдер MagSafe Для IPhone/ Визитница на ма...,OtherTypeName,,,,,,0.0,0.0,0.0,624.356364,213,0,0,0,0,1,1,0,0,0,955.370516,1140.293676,1274.129573,38.0,273.0,1009.0,36.0,239.0,903.0,3.0,11.0,36.0,680.096564,824.913449,949.538959,1.0,1.0,664.0,154162,5198,313315,1290.0x2076.0,0.519664,0
189093,Elica,Винный климатический шкаф. Максимальное количе...,Винный шкаф ELICA VERITAS BI24 BL,OtherTypeName,,,,,,0.0,0.0,0.0,1210.374447,78,0,0,0,0,0,0,0,0,0,1432.868642,1632.034818,1714.104421,61.0,358.0,932.0,57.0,346.0,875.0,4.0,15.0,54.0,1154.643720,1325.618831,1425.140099,9.0,9.0,1136.0,166923,275,339175,750.0x572.0,0.142860,0
37446,ATLANT,"""Двухкомпрессорный холодильник Atlant с электр...",Холодильник Atlant 6023-031,OtherTypeName,,,,,,0.0,0.0,0.0,1090.138676,340,0,0,0,0,0,0,0,0,0,1238.230899,1384.412053,1492.965654,46.0,188.0,519.0,46.0,187.0,510.0,1.0,7.0,24.0,853.096829,1016.394264,1180.260405,1.0,1.0,348.0,91371,1761,186700,430.0x1400.0,0.146215,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191933,BarTon,Ваш ключ к миру развлечений и безграничного ко...,"Приставка для цифрового ТВ BarTon TA-561, Full...",OtherTypeName,,,,,,0.0,0.0,0.0,757.679757,10,0,0,0,0,0,0,0,0,0,1347.187704,1431.578768,1479.682188,1044.0,2754.0,5170.0,731.0,2083.0,3888.0,8.0,35.0,47.0,888.916517,978.718828,1016.808473,2.0,2.0,102.0,209452,6107,429929,700.0x700.0,0.228142,0
195431,Infortrend,Система хранения данных EonStor GS 3000 Gen2 2...,"Infortrend EonStor GS 3000 Gen2 2U/25bay,redun...",OtherTypeName,,,,,,0.0,0.0,0.0,1467.030154,315,0,0,0,0,0,0,0,0,0,1639.760770,1780.508964,1870.828790,742.0,3208.0,8004.0,618.0,2668.0,6699.0,24.0,81.0,205.0,1247.374901,1416.744844,1506.165425,12.0,12.0,665.0,104049,4029,212130,800.0x264.0,0.147368,0
190983,REDMOND,ножка со штифтом откидная RMB-M716/3<br/><br/>...,ножка со штифтом откидная RMB-M716/3,OtherTypeName,,,,,,0.0,0.0,0.0,658.389845,75,0,0,0,0,0,0,0,0,0,1441.594069,1575.761786,1675.084063,294.0,1130.0,3179.0,281.0,1099.0,3112.0,15.0,64.0,194.0,1165.131545,1302.534068,1405.027304,10.0,10.0,987.0,128679,9,261730,750.0x750.0,0.067848,0
37028,Arturia,"Независимо от того, являетесь ли вы начинающим...",Arturia KeyLab Essential 49 mk3 Black MIDI-кла...,OtherTypeName,,,,,,0.0,0.0,0.0,1032.244362,75,0,0,0,0,0,0,0,0,0,1263.821606,1385.109054,1479.082799,47.0,191.0,352.0,39.0,163.0,289.0,0.0,1.0,2.0,0.000000,1096.448114,1119.583902,1.0,1.0,511.0,52580,3493,106608,1020.0x1020.0,0.157857,0


In [5]:
# уберём из обучающей выборки информацию о тех случаях, когда количество проданных товаров не существенно
# и меньше чем количество возвращенных товаров

def clean_train(df_train, df_test):
    print('shape before clean_train(): ', df_train.shape)
    df_train = df_train[df_train['item_count_sales7'] >= df_train['item_count_returns7']]
    df_train = df_train[df_train['item_count_sales30'] >= df_train['item_count_returns30']]
    df_train = df_train[df_train['item_count_sales90'] >= df_train['item_count_returns90']]
    df_train = df_train[df_train['item_count_fake_returns7'] <= df_train['item_count_returns7']]
    df_train = df_train[df_train['item_count_fake_returns30'] <= df_train['item_count_returns30']]
    df_train = df_train[df_train['item_count_fake_returns90'] <= df_train['item_count_returns90']]
    df_train = df_train[df_train['ExemplarAcceptedCountTotal7'] > df_train['ExemplarReturnedCountTotal7']]
    df_train = df_train[df_train['ExemplarAcceptedCountTotal30'] > df_train['ExemplarReturnedCountTotal30']]
    df_train = df_train[df_train['ExemplarAcceptedCountTotal90'] > df_train['ExemplarReturnedCountTotal90']]
    df_train = df_train[df_train['ExemplarReturnedValueTotal7'] <= df_train['GmvTotal7']]
    df_train = df_train[df_train['ExemplarReturnedValueTotal30'] <= df_train['GmvTotal30']]
    df_train = df_train[df_train['ExemplarReturnedValueTotal90'] <= df_train['GmvTotal90']]
    df_train = df_train[df_train.CommercialTypeName4.isin(df_test.CommercialTypeName4.to_list())]

    df_train['brand_name'] = df_train['brand_name'].fillna(0)

    for i in range(1, 6):
        df_train[f'rating_{i}_count'] = df_train[f'rating_{i}_count'].fillna(0)
    print('shape after clean_train():', df_train.shape)
    return df_train



df_train1 = clean_train(df_train1,df_test1)


df_train1

shape before clean_train():  (147898, 47)
shape after clean_train(): (139367, 47)


Unnamed: 0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,item_count_fake_returns7,item_count_fake_returns30,item_count_fake_returns90,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,GmvTotal7,GmvTotal30,GmvTotal90,ExemplarAcceptedCountTotal7,ExemplarAcceptedCountTotal30,ExemplarAcceptedCountTotal90,OrderAcceptedCountTotal7,OrderAcceptedCountTotal30,OrderAcceptedCountTotal90,ExemplarReturnedCountTotal7,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,index,res,positive_score,resolution
180164,0,"В нашем магазине, вы можете приобрести аккумул...",Аккумулятор для Xiaomi BN5C (Poco M4 Pro 5G / ...,Аккумулятор для мобильного телефона,4.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,664.361097,205,0,0,0,0,0,0,0,0,0,1315.124454,1462.866982,1549.316974,735.0,3194.0,7525.0,725.0,3118.0,7358.0,35.0,156.0,345.0,996.676122,1158.752309,1237.098335,96.0,96.0,298.0,8394,37,16660,1080.0x1080.0,0.148325,0
88468,0,Подрулевые лепестки DimSimRacing - это высокок...,Подрулевые лепестки переключатели 120мм для иг...,Руль для игровой приставки,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,701.129491,228,0,0,0,0,0,0,0,0,0,1170.922221,1322.804983,1433.398117,29.0,112.0,333.0,21.0,90.0,296.0,2.0,3.0,7.0,954.746742,966.167081,1067.270780,1.0,1.0,235.0,160777,5523,326632,3072.0x4096.0,0.134271,0
131771,0,Модуль оперативной памяти 16GB DDR5 M425 M425R...,Оперативная память 1x16 ГБ (M425R2GA3BB0-CQK),Модуль памяти,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,818.391886,2,0,0,0,0,0,0,0,0,0,984.200650,1194.535404,1292.292943,3.0,19.0,37.0,3.0,19.0,37.0,0.0,1.0,2.0,0.000000,812.476481,879.054116,1.0,1.0,596.0,36455,5719,73038,3735.0x3735.0,0.149487,1
7004,0,"Сальник для стиральных машин S000LG (37*66*9,5...","Сальник для стиральных машин S000LG (37*66*9,5...","Запчасти для стиральной, сушильной машины",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,670.529954,21,0,0,0,0,0,0,0,0,0,808.127484,879.379779,879.390811,3.0,9.0,9.0,3.0,7.0,7.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.0,2.0,26.0,69949,1959,142459,396.0x398.0,0.157287,0
99378,Logitech G,Зарядись магией Хекстека с новой коллекцией де...,Коврик для мыши Logitech G840 XL KDA Collectio...,Коврик для мыши,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,812.682693,47,0,0,0,0,0,0,0,0,0,1235.215429,1310.172389,1352.652335,37.0,83.0,136.0,32.0,77.0,127.0,4.0,5.0,6.0,1036.853027,1042.329460,1046.074157,1.0,1.0,78.0,48634,4519,98451,1000.0x1000.0,0.146076,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171481,Мистер Гаджет,Ищете идеальное решение для работы с вашим уст...,"Стилус для телефона и для планшетов iPad, Андр...",Стилус,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,833.303788,1017,0,0,0,0,3,14,0,1,2,1021.311136,1164.326408,1278.373264,61.0,235.0,784.0,59.0,226.0,763.0,0.0,10.0,30.0,0.000000,882.181289,988.367800,1.0,1.0,1101.0,150616,10866,305955,900.0x1600.0,0.946659,0
170125,0,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Системные платы для телефонов,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,643.569970,4,0,0,0,0,0,0,0,0,0,1111.084379,1274.137969,1369.714201,74.0,355.0,915.0,72.0,349.0,906.0,8.0,27.0,65.0,922.015571,1029.923722,1109.522570,38.0,38.0,665.0,161869,119,328699,2048.0x2048.0,0.160844,0
91698,BaseMarket,Шлейф - это необходимая запчасть для вашего мо...,Шлейф для Apple iPhone 13 Pro Max на датчик пр...,Шлейфы для телефонов,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,644.870262,3,0,0,0,0,0,0,0,0,0,1249.630527,1368.811646,1482.039487,225.0,886.0,2853.0,222.0,865.0,2807.0,12.0,44.0,158.0,1006.245858,1125.218497,1247.695414,8.0,8.0,1305.0,132799,296,270102,900.0x1200.0,0.958781,0
155054,0,Мешки синтетические 3 шт для пылесоса SAMSUNG ...,Мешки SAMSUNG SC20M255AWB синтетические 3 шт д...,Пылесборник,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,643.385534,816,0,0,0,0,0,0,0,0,0,1464.753935,1614.413751,1742.537550,1811.0,7986.0,33306.0,1609.0,7257.0,29922.0,39.0,206.0,758.0,1105.751115,1259.310746,1376.141242,352.0,352.0,972.0,119540,198,243406,1000.0x1000.0,0.157294,0


## 3. Создание новых признаков

In [6]:
# Проводится подсчёт взвешенного рейтинга, на основе данных из колонок rating_1_count, rating_2_count и т.д.

rating_columns = ['brand_name', 'name_rus', 'SellerID', 'CommercialTypeName4', 'ItemVarietyCount']


def add_weighted_ratings_efficient(df_train, df_test, rating_columns=rating_columns):
    """
    Эффективная версия с использованием transform для расчета рейтингов
    """
    # Сохраняем индексы
    train_idx = df_train.index
    test_idx = df_test.index

    # Добавляем метки
    df_train = df_train.copy()
    df_test = df_test.copy()
    df_train['_source'] = 'train'
    df_test['_source'] = 'test'

    # Объединяем
    combined = pd.concat([df_train, df_test])

    # Рассчитываем общие суммы для взвешенного рейтинга
    rating_cols = ['rating_1_count', 'rating_2_count', 'rating_3_count', 'rating_4_count', 'rating_5_count']

    combined['total_score_row'] = (
        combined['rating_1_count'] * 1 +
        combined['rating_2_count'] * 2 +
        combined['rating_3_count'] * 3 +
        combined['rating_4_count'] * 4 +
        combined['rating_5_count'] * 5
    )

    combined['total_count_row'] = combined[rating_cols].sum(axis=1)

    # Для каждой колонки добавляем взвешенный рейтинг
    for column in rating_columns:
        # Группируем и считаем суммы
        group_sums = combined.groupby(column).agg({
            'total_score_row': 'sum',
            'total_count_row': 'sum'
        }).reset_index()

        group_sums[f'{column}_weighted_rating'] = (
            group_sums['total_score_row'] / group_sums['total_count_row']
        ).fillna(0)

        # Создаем mapping
        rating_map = dict(zip(group_sums[column], group_sums[f'{column}_weighted_rating']))

        # Добавляем к данным
        combined[f'{column}_weighted_rating'] = combined[column].map(rating_map)

    # Разделяем обратно
    df_train_result = combined[combined['_source'] == 'train'].drop('_source', axis=1)
    df_test_result = combined[combined['_source'] == 'test'].drop('_source', axis=1)

    # Удаляем временные колонки
    for df in [df_train_result, df_test_result]:
        df.drop(['total_score_row', 'total_count_row'], axis=1, errors='ignore', inplace=True)

    # Восстанавливаем индексы
    df_train_result.index = train_idx
    df_test_result.index = test_idx

    return df_train_result, df_test_result

# Пример использования:


df_train_with_all_ratings, df_test_with_all_ratings = add_weighted_ratings_efficient(df_train, df_test)

df_train_with_all_ratings

Unnamed: 0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,item_count_fake_returns7,item_count_fake_returns30,item_count_fake_returns90,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,GmvTotal7,GmvTotal30,GmvTotal90,ExemplarAcceptedCountTotal7,ExemplarAcceptedCountTotal30,ExemplarAcceptedCountTotal90,OrderAcceptedCountTotal7,OrderAcceptedCountTotal30,OrderAcceptedCountTotal90,ExemplarReturnedCountTotal7,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,index,res,positive_score,resolution,brand_name_weighted_rating,name_rus_weighted_rating,SellerID_weighted_rating,CommercialTypeName4_weighted_rating,ItemVarietyCount_weighted_rating
180164,,"В нашем магазине, вы можете приобрести аккумул...",Аккумулятор для Xiaomi BN5C (Poco M4 Pro 5G / ...,Аккумулятор для мобильного телефона,4.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,664.361097,205,0,0,0,0,0,0,0,0,0,1315.124454,1462.866982,1549.316974,735.0,3194.0,7525.0,725.0,3118.0,7358.0,35.0,156.0,345.0,996.676122,1158.752309,1237.098335,96.0,96.0,298.0,8394,37,16660,1080.0x1080.0,0.148325,0,,2.285714,3.913529,4.093286,3.650246
43525,grand-usb,USB Флеш накопитель в виде Матрёшки.\nДлина бр...,"grand-usb 128 ГБ USB-флеш-накопитель ""Матрёшка...",Флэш драйв,,,,,,0.0,0.0,0.0,761.578159,1,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,13.0,13.0,42.0,146516,691,297847,900.0x1200.0,0.149370,0,4.296296,0.000000,4.296296,3.210326,4.450543
88468,,Подрулевые лепестки DimSimRacing - это высокок...,Подрулевые лепестки переключатели 120мм для иг...,Руль для игровой приставки,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,701.129491,228,0,0,0,0,0,0,0,0,0,1170.922221,1322.804983,1433.398117,29.0,112.0,333.0,21.0,90.0,296.0,2.0,3.0,7.0,954.746742,966.167081,1067.270780,1.0,1.0,235.0,160777,5523,326632,3072.0x4096.0,0.134271,0,,5.000000,4.454545,4.425175,4.287973
131771,,Модуль оперативной памяти 16GB DDR5 M425 M425R...,Оперативная память 1x16 ГБ (M425R2GA3BB0-CQK),Модуль памяти,,,,,,0.0,0.0,0.0,818.391886,2,0,0,0,0,0,0,0,0,0,984.200650,1194.535404,1292.292943,3.0,19.0,37.0,3.0,19.0,37.0,0.0,1.0,2.0,0.000000,812.476481,879.054116,1.0,1.0,596.0,36455,5719,73038,3735.0x3735.0,0.149487,1,,0.000000,0.000000,4.633882,4.287973
7004,,"Сальник для стиральных машин S000LG (37*66*9,5...","Сальник для стиральных машин S000LG (37*66*9,5...","Запчасти для стиральной, сушильной машины",,,,,,0.0,0.0,0.0,670.529954,21,0,0,0,0,0,0,0,0,0,808.127484,879.379779,879.390811,3.0,9.0,9.0,3.0,7.0,7.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.0,2.0,26.0,69949,1959,142459,396.0x398.0,0.157287,0,,0.000000,0.000000,4.627197,4.308772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171481,Мистер Гаджет,Ищете идеальное решение для работы с вашим уст...,"Стилус для телефона и для планшетов iPad, Андр...",Стилус,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,833.303788,1017,0,0,0,0,3,14,0,1,2,1021.311136,1164.326408,1278.373264,61.0,235.0,784.0,59.0,226.0,763.0,0.0,10.0,30.0,0.000000,882.181289,988.367800,1.0,1.0,1101.0,150616,10866,305955,900.0x1600.0,0.946659,0,4.333333,3.651376,4.333333,4.267513,4.287973
170125,,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Системные платы для телефонов,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,643.569970,4,0,0,0,0,0,0,0,0,0,1111.084379,1274.137969,1369.714201,74.0,355.0,915.0,72.0,349.0,906.0,8.0,27.0,65.0,922.015571,1029.923722,1109.522570,38.0,38.0,665.0,161869,119,328699,2048.0x2048.0,0.160844,0,,4.666667,3.918919,4.062284,4.222727
91698,BaseMarket,Шлейф - это необходимая запчасть для вашего мо...,Шлейф для Apple iPhone 13 Pro Max на датчик пр...,Шлейфы для телефонов,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,644.870262,3,0,0,0,0,0,0,0,0,0,1249.630527,1368.811646,1482.039487,225.0,886.0,2853.0,222.0,865.0,2807.0,12.0,44.0,158.0,1006.245858,1125.218497,1247.695414,8.0,8.0,1305.0,132799,296,270102,900.0x1200.0,0.958781,0,3.775081,5.000000,3.780899,4.238067,4.184956
155054,,Мешки синтетические 3 шт для пылесоса SAMSUNG ...,Мешки SAMSUNG SC20M255AWB синтетические 3 шт д...,Пылесборник,,,,,,0.0,0.0,0.0,643.385534,816,0,0,0,0,0,0,0,0,0,1464.753935,1614.413751,1742.537550,1811.0,7986.0,33306.0,1609.0,7257.0,29922.0,39.0,206.0,758.0,1105.751115,1259.310746,1376.141242,352.0,352.0,972.0,119540,198,243406,1000.0x1000.0,0.157294,0,,0.000000,4.323129,4.615410,4.281690


In [7]:
#  Собрать агрегационную статистику по ценам на основе разных столбцов

group_columns = ['brand_name', 'name_rus', 'SellerID', 'CommercialTypeName4', 'ItemVarietyCount']

def add_price_statistics(df_train, df_test, group_columns=group_columns, n_bins=10):
    """
    Версия с дополнительными статистиками по ценам и бинами

    Parameters:
    df_train (DataFrame): тренировочный датафрейм
    df_test (DataFrame): тестовый датафрейм
    group_columns (list): список колонок для группировки
    n_bins (int): количество бинов для цены

    Returns:
    tuple: (df_train_with_stats, df_test_with_stats)
    """
    # Сохраняем индексы
    train_idx = df_train.index
    test_idx = df_test.index

    # Добавляем метки
    df_train = df_train.copy()
    df_test = df_test.copy()
    df_train['_source'] = 'train'
    df_test['_source'] = 'test'

    # Объединяем
    combined = pd.concat([df_train, df_test])

    # Создаем бины для цены (на основе train данных чтобы избежать data leakage)

    # Добавляем бины к объединенным данным
    combined['price_bin'] = pd.cut(combined['PriceDiscounted'], bins=10, labels=False)

    for column in group_columns:
        # Рассчитываем различные статистики
        group_stats = combined.groupby(column)['PriceDiscounted'].agg([
            'mean', 'median', 'std', 'count', 'min', 'max'
        ]).reset_index()

        group_stats.columns = [
            column,
            f'{column}_mean_price',
            f'{column}_median_price',
            f'{column}_price_std',
            f'{column}_price_count',
            f'{column}_min_price',
            f'{column}_max_price'
        ]

        # Добавляем все статистики к данным
        for stat_col in group_stats.columns[1:]:
            stat_map = dict(zip(group_stats[column], group_stats[stat_col]))
            combined[stat_col] = combined[column].map(stat_map)

            # Заполняем пропущенные значения
            if 'mean' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(combined['PriceDiscounted'].mean())
            elif 'median' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(combined['PriceDiscounted'].median())
            elif 'std' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(combined['PriceDiscounted'].std())
            elif 'count' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(0)
            elif 'min' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(combined['PriceDiscounted'].min())
            elif 'max' in stat_col:
                combined[stat_col] = combined[stat_col].fillna(combined['PriceDiscounted'].max())

    # Разделяем обратно
    df_train_result = combined[combined['_source'] == 'train'].drop('_source', axis=1)
    df_test_result = combined[combined['_source'] == 'test'].drop('_source', axis=1)

    # Восстанавливаем индексы
    df_train_result.index = train_idx
    df_test_result.index = test_idx

    return df_train_result, df_test_result


df_train_with_stats, df_test_with_stats = add_price_statistics(
    df_train_with_all_ratings, df_test_with_all_ratings
)

df_train_with_stats

Unnamed: 0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,item_count_fake_returns7,item_count_fake_returns30,item_count_fake_returns90,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,GmvTotal7,GmvTotal30,GmvTotal90,ExemplarAcceptedCountTotal7,ExemplarAcceptedCountTotal30,ExemplarAcceptedCountTotal90,OrderAcceptedCountTotal7,OrderAcceptedCountTotal30,OrderAcceptedCountTotal90,ExemplarReturnedCountTotal7,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,index,res,positive_score,resolution,brand_name_weighted_rating,name_rus_weighted_rating,SellerID_weighted_rating,CommercialTypeName4_weighted_rating,ItemVarietyCount_weighted_rating,price_bin,brand_name_mean_price,brand_name_median_price,brand_name_price_std,brand_name_price_count,brand_name_min_price,brand_name_max_price,name_rus_mean_price,name_rus_median_price,name_rus_price_std,name_rus_price_count,name_rus_min_price,name_rus_max_price,SellerID_mean_price,SellerID_median_price,SellerID_price_std,SellerID_price_count,SellerID_min_price,SellerID_max_price,CommercialTypeName4_mean_price,CommercialTypeName4_median_price,CommercialTypeName4_price_std,CommercialTypeName4_price_count,CommercialTypeName4_min_price,CommercialTypeName4_max_price,ItemVarietyCount_mean_price,ItemVarietyCount_median_price,ItemVarietyCount_price_std,ItemVarietyCount_price_count,ItemVarietyCount_min_price,ItemVarietyCount_max_price
180164,,"В нашем магазине, вы можете приобрести аккумул...",Аккумулятор для Xiaomi BN5C (Poco M4 Pro 5G / ...,Аккумулятор для мобильного телефона,4.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,664.361097,205,0,0,0,0,0,0,0,0,0,1315.124454,1462.866982,1549.316974,735.0,3194.0,7525.0,725.0,3118.0,7358.0,35.0,156.0,345.0,996.676122,1158.752309,1237.098335,96.0,96.0,298.0,8394,37,16660,1080.0x1080.0,0.148325,0,,2.285714,3.913529,4.093286,3.650246,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,664.361097,664.361097,155.940582,1,664.361097,664.361097,651.378489,649.049636,31.847946,426,501.550005,742.651530,709.623197,712.291831,49.154302,11965,441.637803,1295.367904,711.311413,703.075340,77.916763,192.0,580.006725,944.297073
43525,grand-usb,USB Флеш накопитель в виде Матрёшки.\nДлина бр...,"grand-usb 128 ГБ USB-флеш-накопитель ""Матрёшка...",Флэш драйв,,,,,,0.0,0.0,0.0,761.578159,1,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,13.0,13.0,42.0,146516,691,297847,900.0x1200.0,0.149370,0,4.296296,0.000000,4.296296,3.210326,4.450543,4,741.194779,726.509885,71.571661,88.0,625.753623,918.843862,761.578159,761.578159,155.940582,1,761.578159,761.578159,740.935092,726.503847,71.205999,89,625.753623,918.843862,737.533669,727.312954,92.335616,1924,532.461129,1101.199599,762.675140,736.260869,142.224309,2180.0,462.816323,1302.328094
88468,,Подрулевые лепестки DimSimRacing - это высокок...,Подрулевые лепестки переключатели 120мм для иг...,Руль для игровой приставки,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,701.129491,228,0,0,0,0,0,0,0,0,0,1170.922221,1322.804983,1433.398117,29.0,112.0,333.0,21.0,90.0,296.0,2.0,3.0,7.0,954.746742,966.167081,1067.270780,1.0,1.0,235.0,160777,5523,326632,3072.0x4096.0,0.134271,0,,5.000000,4.454545,4.425175,4.287973,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,701.129491,701.129491,155.940582,1,701.129491,701.129491,761.695964,701.129491,110.403646,3,694.831159,889.127241,947.051068,957.210253,163.433617,213,550.375789,1388.453328,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093
131771,,Модуль оперативной памяти 16GB DDR5 M425 M425R...,Оперативная память 1x16 ГБ (M425R2GA3BB0-CQK),Модуль памяти,,,,,,0.0,0.0,0.0,818.391886,2,0,0,0,0,0,0,0,0,0,984.200650,1194.535404,1292.292943,3.0,19.0,37.0,3.0,19.0,37.0,0.0,1.0,2.0,0.000000,812.476481,879.054116,1.0,1.0,596.0,36455,5719,73038,3735.0x3735.0,0.149487,1,,0.000000,0.000000,4.633882,4.287973,4,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,818.391886,818.391886,155.940582,1,818.391886,818.391886,818.391886,818.391886,155.940582,1,818.391886,818.391886,864.308441,871.723025,109.500496,655,579.711764,1128.757131,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093
7004,,"Сальник для стиральных машин S000LG (37*66*9,5...","Сальник для стиральных машин S000LG (37*66*9,5...","Запчасти для стиральной, сушильной машины",,,,,,0.0,0.0,0.0,670.529954,21,0,0,0,0,0,0,0,0,0,808.127484,879.379779,879.390811,3.0,9.0,9.0,3.0,7.0,7.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.0,2.0,26.0,69949,1959,142459,396.0x398.0,0.157287,0,,0.000000,0.000000,4.627197,4.308772,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,648.697834,646.335026,19.811680,15,609.176131,670.543804,712.808173,704.267681,47.344195,21,622.537691,835.409956,709.601867,708.270020,83.154388,6807,459.751628,1152.102823,751.290942,733.820842,140.632475,18731.0,301.052965,1412.830272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171481,Мистер Гаджет,Ищете идеальное решение для работы с вашим уст...,"Стилус для телефона и для планшетов iPad, Андр...",Стилус,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,833.303788,1017,0,0,0,0,3,14,0,1,2,1021.311136,1164.326408,1278.373264,61.0,235.0,784.0,59.0,226.0,763.0,0.0,10.0,30.0,0.000000,882.181289,988.367800,1.0,1.0,1101.0,150616,10866,305955,900.0x1600.0,0.946659,0,4.333333,3.651376,4.333333,4.267513,4.287973,4,833.303788,833.303788,155.940582,1.0,833.303788,833.303788,770.562796,770.562796,88.729163,2,707.821803,833.303788,833.303788,833.303788,155.940582,1,833.303788,833.303788,734.155725,731.450900,111.867245,586,460.807050,1132.395579,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093
170125,,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Системные платы для телефонов,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,643.569970,4,0,0,0,0,0,0,0,0,0,1111.084379,1274.137969,1369.714201,74.0,355.0,915.0,72.0,349.0,906.0,8.0,27.0,65.0,922.015571,1029.923722,1109.522570,38.0,38.0,665.0,161869,119,328699,2048.0x2048.0,0.160844,0,,4.666667,3.918919,4.062284,4.222727,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,643.569970,643.569970,155.940582,1,643.569970,643.569970,642.038431,628.502096,60.745407,322,565.575660,827.293141,653.276606,641.202773,55.999167,530,507.501373,978.045818,741.236021,716.751469,142.667056,641.0,532.463950,1215.697772
91698,BaseMarket,Шлейф - это необходимая запчасть для вашего мо...,Шлейф для Apple iPhone 13 Pro Max на датчик пр...,Шлейфы для телефонов,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,644.870262,3,0,0,0,0,0,0,0,0,0,1249.630527,1368.811646,1482.039487,225.0,886.0,2853.0,222.0,865.0,2807.0,12.0,44.0,158.0,1006.245858,1125.218497,1247.695414,8.0,8.0,1305.0,132799,296,270102,900.0x1200.0,0.958781,0,3.775081,5.000000,3.780899,4.238067,4.184956,3,714.297011,698.142665,102.230078,1719.0,552.880188,1091.252996,644.870262,644.870262,155.940582,1,644.870262,644.870262,714.410127,698.166454,102.225124,1725,552.880188,1091.252996,614.456597,607.026717,58.193883,8442,323.477931,888.048074,753.690759,736.381127,142.270171,3489.0,301.048243,1267.132428
155054,,Мешки синтетические 3 шт для пылесоса SAMSUNG ...,Мешки SAMSUNG SC20M255AWB синтетические 3 шт д...,Пылесборник,,,,,,0.0,0.0,0.0,643.385534,816,0,0,0,0,0,0,0,0,0,1464.753935,1614.413751,1742.537550,1811.0,7986.0,33306.0,1609.0,7257.0,29922.0,39.0,206.0,758.0,1105.751115,1259.310746,1376.141242,352.0,352.0,972.0,119540,198,243406,1000.0x1000.0,0.157294,0,,0.000000,4.323129,4.615410,4.281690,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,643.385534,643.385534,155.940582,1,643.385534,643.385534,711.752769,713.888653,30.556919,1749,637.877157,792.033609,704.246587,713.875067,59.537722,3381,301.059328,1093.061551,709.382452,718.868015,29.863142,352.0,637.903510,791.455367


#### Средний таргет по строковым свойствам текста

In [8]:
# создать признаки на основе простых свойств текста (количество символов и букв в текстовых столбцах)


def add_string_feats(df):
    def count_words(text):
        if pd.isna(text):
            return 0
        return len(str(text).split())

    df['len_description'] = df['description'].apply(lambda x: len(str(x)) if pd.notna(x) else 0)
    df['len_name_rus'] = df['name_rus'].apply(lambda x: len(str(x)) if pd.notna(x) else 0)
    df['count_words_description'] = df['description'].apply(count_words)
    df['count_words_name_rus'] = df['name_rus'].apply(count_words)
    return df


df_train1 = add_string_feats(df_train_with_stats)
df_test1 = add_string_feats(df_test_with_stats)

df_train1

Unnamed: 0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,item_count_fake_returns7,item_count_fake_returns30,item_count_fake_returns90,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,GmvTotal7,GmvTotal30,GmvTotal90,ExemplarAcceptedCountTotal7,ExemplarAcceptedCountTotal30,ExemplarAcceptedCountTotal90,OrderAcceptedCountTotal7,OrderAcceptedCountTotal30,OrderAcceptedCountTotal90,ExemplarReturnedCountTotal7,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,index,res,positive_score,resolution,brand_name_weighted_rating,name_rus_weighted_rating,SellerID_weighted_rating,CommercialTypeName4_weighted_rating,ItemVarietyCount_weighted_rating,price_bin,brand_name_mean_price,brand_name_median_price,brand_name_price_std,brand_name_price_count,brand_name_min_price,brand_name_max_price,name_rus_mean_price,name_rus_median_price,name_rus_price_std,name_rus_price_count,name_rus_min_price,name_rus_max_price,SellerID_mean_price,SellerID_median_price,SellerID_price_std,SellerID_price_count,SellerID_min_price,SellerID_max_price,CommercialTypeName4_mean_price,CommercialTypeName4_median_price,CommercialTypeName4_price_std,CommercialTypeName4_price_count,CommercialTypeName4_min_price,CommercialTypeName4_max_price,ItemVarietyCount_mean_price,ItemVarietyCount_median_price,ItemVarietyCount_price_std,ItemVarietyCount_price_count,ItemVarietyCount_min_price,ItemVarietyCount_max_price,len_description,len_name_rus,count_words_description,count_words_name_rus
180164,,"В нашем магазине, вы можете приобрести аккумул...",Аккумулятор для Xiaomi BN5C (Poco M4 Pro 5G / ...,Аккумулятор для мобильного телефона,4.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,664.361097,205,0,0,0,0,0,0,0,0,0,1315.124454,1462.866982,1549.316974,735.0,3194.0,7525.0,725.0,3118.0,7358.0,35.0,156.0,345.0,996.676122,1158.752309,1237.098335,96.0,96.0,298.0,8394,37,16660,1080.0x1080.0,0.148325,0,,2.285714,3.913529,4.093286,3.650246,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,664.361097,664.361097,155.940582,1,664.361097,664.361097,651.378489,649.049636,31.847946,426,501.550005,742.651530,709.623197,712.291831,49.154302,11965,441.637803,1295.367904,711.311413,703.075340,77.916763,192.0,580.006725,944.297073,601,72,74,14
43525,grand-usb,USB Флеш накопитель в виде Матрёшки.\nДлина бр...,"grand-usb 128 ГБ USB-флеш-накопитель ""Матрёшка...",Флэш драйв,,,,,,0.0,0.0,0.0,761.578159,1,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,13.0,13.0,42.0,146516,691,297847,900.0x1200.0,0.149370,0,4.296296,0.000000,4.296296,3.210326,4.450543,4,741.194779,726.509885,71.571661,88.0,625.753623,918.843862,761.578159,761.578159,155.940582,1,761.578159,761.578159,740.935092,726.503847,71.205999,89,625.753623,918.843862,737.533669,727.312954,92.335616,1924,532.461129,1101.199599,762.675140,736.260869,142.224309,2180.0,462.816323,1302.328094,577,98,80,14
88468,,Подрулевые лепестки DimSimRacing - это высокок...,Подрулевые лепестки переключатели 120мм для иг...,Руль для игровой приставки,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,701.129491,228,0,0,0,0,0,0,0,0,0,1170.922221,1322.804983,1433.398117,29.0,112.0,333.0,21.0,90.0,296.0,2.0,3.0,7.0,954.746742,966.167081,1067.270780,1.0,1.0,235.0,160777,5523,326632,3072.0x4096.0,0.134271,0,,5.000000,4.454545,4.425175,4.287973,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,701.129491,701.129491,155.940582,1,701.129491,701.129491,761.695964,701.129491,110.403646,3,694.831159,889.127241,947.051068,957.210253,163.433617,213,550.375789,1388.453328,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093,486,88,60,13
131771,,Модуль оперативной памяти 16GB DDR5 M425 M425R...,Оперативная память 1x16 ГБ (M425R2GA3BB0-CQK),Модуль памяти,,,,,,0.0,0.0,0.0,818.391886,2,0,0,0,0,0,0,0,0,0,984.200650,1194.535404,1292.292943,3.0,19.0,37.0,3.0,19.0,37.0,0.0,1.0,2.0,0.000000,812.476481,879.054116,1.0,1.0,596.0,36455,5719,73038,3735.0x3735.0,0.149487,1,,0.000000,0.000000,4.633882,4.287973,4,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,818.391886,818.391886,155.940582,1,818.391886,818.391886,818.391886,818.391886,155.940582,1,818.391886,818.391886,864.308441,871.723025,109.500496,655,579.711764,1128.757131,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093,688,45,65,5
7004,,"Сальник для стиральных машин S000LG (37*66*9,5...","Сальник для стиральных машин S000LG (37*66*9,5...","Запчасти для стиральной, сушильной машины",,,,,,0.0,0.0,0.0,670.529954,21,0,0,0,0,0,0,0,0,0,808.127484,879.379779,879.390811,3.0,9.0,9.0,3.0,7.0,7.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.0,2.0,26.0,69949,1959,142459,396.0x398.0,0.157287,0,,0.000000,0.000000,4.627197,4.308772,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,648.697834,646.335026,19.811680,15,609.176131,670.543804,712.808173,704.267681,47.344195,21,622.537691,835.409956,709.601867,708.270020,83.154388,6807,459.751628,1152.102823,751.290942,733.820842,140.632475,18731.0,301.052965,1412.830272,63,63,9,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171481,Мистер Гаджет,Ищете идеальное решение для работы с вашим уст...,"Стилус для телефона и для планшетов iPad, Андр...",Стилус,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,833.303788,1017,0,0,0,0,3,14,0,1,2,1021.311136,1164.326408,1278.373264,61.0,235.0,784.0,59.0,226.0,763.0,0.0,10.0,30.0,0.000000,882.181289,988.367800,1.0,1.0,1101.0,150616,10866,305955,900.0x1600.0,0.946659,0,4.333333,3.651376,4.333333,4.267513,4.287973,4,833.303788,833.303788,155.940582,1.0,833.303788,833.303788,770.562796,770.562796,88.729163,2,707.821803,833.303788,833.303788,833.303788,155.940582,1,833.303788,833.303788,734.155725,731.450900,111.867245,586,460.807050,1132.395579,756.608067,734.994800,149.994407,43017.0,231.411808,1816.563093,571,87,80,12
170125,,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Шлейф для Samsung Galaxy S20 Ultra (G988B/EU v...,Системные платы для телефонов,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,643.569970,4,0,0,0,0,0,0,0,0,0,1111.084379,1274.137969,1369.714201,74.0,355.0,915.0,72.0,349.0,906.0,8.0,27.0,65.0,922.015571,1029.923722,1109.522570,38.0,38.0,665.0,161869,119,328699,2048.0x2048.0,0.160844,0,,4.666667,3.918919,4.062284,4.222727,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,643.569970,643.569970,155.940582,1,643.569970,643.569970,642.038431,628.502096,60.745407,322,565.575660,827.293141,653.276606,641.202773,55.999167,530,507.501373,978.045818,741.236021,716.751469,142.667056,641.0,532.463950,1215.697772,88,88,12,12
91698,BaseMarket,Шлейф - это необходимая запчасть для вашего мо...,Шлейф для Apple iPhone 13 Pro Max на датчик пр...,Шлейфы для телефонов,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,644.870262,3,0,0,0,0,0,0,0,0,0,1249.630527,1368.811646,1482.039487,225.0,886.0,2853.0,222.0,865.0,2807.0,12.0,44.0,158.0,1006.245858,1125.218497,1247.695414,8.0,8.0,1305.0,132799,296,270102,900.0x1200.0,0.958781,0,3.775081,5.000000,3.780899,4.238067,4.184956,3,714.297011,698.142665,102.230078,1719.0,552.880188,1091.252996,644.870262,644.870262,155.940582,1,644.870262,644.870262,714.410127,698.166454,102.225124,1725,552.880188,1091.252996,614.456597,607.026717,58.193883,8442,323.477931,888.048074,753.690759,736.381127,142.270171,3489.0,301.048243,1267.132428,484,64,67,10
155054,,Мешки синтетические 3 шт для пылесоса SAMSUNG ...,Мешки SAMSUNG SC20M255AWB синтетические 3 шт д...,Пылесборник,,,,,,0.0,0.0,0.0,643.385534,816,0,0,0,0,0,0,0,0,0,1464.753935,1614.413751,1742.537550,1811.0,7986.0,33306.0,1609.0,7257.0,29922.0,39.0,206.0,758.0,1105.751115,1259.310746,1376.141242,352.0,352.0,972.0,119540,198,243406,1000.0x1000.0,0.157294,0,,0.000000,4.323129,4.615410,4.281690,3,758.971530,736.695421,155.940582,0.0,0.000000,1816.563093,643.385534,643.385534,155.940582,1,643.385534,643.385534,711.752769,713.888653,30.556919,1749,637.877157,792.033609,704.246587,713.875067,59.537722,3381,301.059328,1093.061551,709.382452,718.868015,29.863142,352.0,637.903510,791.455367,57,57,8,8


#### Восстановление временных признаков

In [9]:
# На основе пропорций попытаемся восстановить отсутствующие значения в признаках,
# где данные собираются по установленным интервалам, на 7 день, на 30-й день, на 90 день

temporal_features = [
    'GmvTotal', 'ExemplarAcceptedCountTotal', 'OrderAcceptedCountTotal',
    'ExemplarReturnedCountTotal', 'ExemplarReturnedValueTotal'
]

def restore_temporal_features(df_train, df_test, temporal_features=temporal_features):
        # Сохраняем индексы

    train_index = df_train.index
    test_index = df_test.index

    # Объединяем датафреймы
    df_train['dataset'] = 'train'
    df_test['dataset'] = 'test'
    combined_df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

    print('NaNs before restore_temporal_features():', combined_df.isna().sum().sum())

    for feature in temporal_features:
        for days in [7, 30, 90]:
            col_name = f"{feature}{days}"
            if col_name in combined_df.columns:
                # Восстанавливаем через средние пропорции
                for source_days in [d for d in [7, 30, 90] if d != days]:
                    source_col = f"{feature}{source_days}"
                    if source_col in combined_df.columns:
                        mask = combined_df[col_name].isna() & combined_df[source_col].notna()
                        if mask.any():
                            ratio = days / source_days
                            combined_df.loc[mask, col_name] = combined_df.loc[mask, source_col] * ratio
    # Разделяем обратно
    print('NaNs after restore_temporal_features():', combined_df.isna().sum().sum())
    df_train = combined_df[combined_df['dataset'] == 'train'].drop('dataset', axis=1)
    df_test = combined_df[combined_df['dataset'] == 'test'].drop('dataset', axis=1)
    df_train.index = train_index
    df_test.index = test_index
    return df_train, df_test


df_train1, df_test1 = restore_temporal_features(df_train1, df_test1)

(df_train1.shape, df_test1.shape)

NaNs before restore_temporal_features(): 1042268
NaNs after restore_temporal_features(): 1025513


((147898, 87), (49300, 87))

#### работа с медиа столбцами

In [10]:
# Соберём статистические агрегации по столбцам с информацией о медиа активности по товару

def add_media_feats(df_train,df_test):
    # Сохраняем исходные индексы
    train_index = df_train.index
    test_index = df_test.index

    # Объединяем датафреймы
    df_train['dataset'] = 'train'
    df_test['dataset'] = 'test'
    combined_df = pd.concat([df_train, df_test], axis=0, ignore_index=True)
    print('before add_media_feats()',(df_train.shape,df_test.shape))
    # Медиа-столбцы
    media_cols = ['comments_published_count', 'photos_published_count', 'videos_published_count']

    # Расширенные статистики по продавцу
    seller_media_agg = combined_df.groupby('SellerID').agg({
        'comments_published_count': ['sum', 'mean', 'std', 'max', 'min'],
        'photos_published_count': ['sum', 'mean', 'std', 'max', 'min'],
        'videos_published_count': ['sum', 'mean', 'std', 'max', 'min'],
        'ItemID': 'count'  # количество товаров продавца
    }).reset_index()

    # Выравниваем названия столбцов
    seller_media_agg.columns = [
        'SellerID',
        'seller_comments_sum', 'seller_comments_mean', 'seller_comments_std', 'seller_comments_max', 'seller_comments_min',
        'seller_photos_sum', 'seller_photos_mean', 'seller_photos_std', 'seller_photos_max', 'seller_photos_min',
        'seller_videos_sum', 'seller_videos_mean', 'seller_videos_std', 'seller_videos_max', 'seller_videos_min',
        'seller_items_count'
    ]

    # Дополнительные фичи
    seller_media_agg['seller_total_media'] = (
        seller_media_agg['seller_comments_sum'] +
        seller_media_agg['seller_photos_sum'] +
        seller_media_agg['seller_videos_sum']
    )

    seller_media_agg['seller_media_per_item'] = (
        seller_media_agg['seller_total_media'] / seller_media_agg['seller_items_count'].replace(0, 1)
    )

    seller_media_agg['seller_media_diversity'] = (
        (seller_media_agg['seller_comments_sum'] > 0).astype(int) +
        (seller_media_agg['seller_photos_sum'] > 0).astype(int) +
        (seller_media_agg['seller_videos_sum'] > 0).astype(int)
    )

    # Объединяем с основным датафреймом
    combined_df = combined_df.merge(seller_media_agg, on='SellerID', how='left')

    # Разделяем обратно
    df_train = combined_df[combined_df['dataset'] == 'train'].drop('dataset', axis=1)
    df_test = combined_df[combined_df['dataset'] == 'test'].drop('dataset', axis=1)
    df_train.index = train_index
    df_test.index = test_index

    print('after add_media_feats()',(df_train.shape,df_test.shape))
    return df_train, df_test



df_train1, df_test1 = add_media_feats(df_train1, df_test1)

before add_media_feats() ((147898, 88), (49300, 88))
after add_media_feats() ((147898, 106), (49300, 106))


#### работа с бинами

In [11]:
# выделим признак на основе обычной цены и цены со скидкой - принадлежность к бинам, на основе квантилей

def add_bins(df_train,df_test):

    # Сохраняем исходные индексы
    train_index = df_train.index
    test_index = df_test.index

    # Добавляем метку для разделения датафреймов
    df_train['dataset'] = 'train'
    df_test['dataset'] = 'test'

    # Объединяем датафреймы
    combined_df = pd.concat([df_train, df_test], axis=0, ignore_index=True)
    print('combined_df.shape before:', combined_df.shape)

    # Создаем бины для PriceDiscounted
    # Используем 10 квантилей для равномерного распределения
    combined_df['PriceDiscounted_bins'] = pd.qcut(
        combined_df['PriceDiscounted'],
        q=10,
        labels=['bin_1', 'bin_2', 'bin_3', 'bin_4', 'bin_5', 'bin_6', 'bin_7', 'bin_8', 'bin_9', 'bin_10'],
        duplicates='drop'
    )
    print('combined_df.shape after:', combined_df.shape)

    # Разделяем обратно на train и test
    df_train = combined_df[combined_df['dataset'] == 'train'].copy()
    df_test = combined_df[combined_df['dataset'] == 'test'].copy()

    # Удаляем служебный столбец
    df_train = df_train.drop('dataset', axis=1)
    df_test = df_test.drop('dataset', axis=1)

    # Восстанавливаем исходные индексы
    df_train.index = train_index
    df_test.index = test_index
    return df_train, df_test


df_train1,df_test1 = add_bins(df_train1,df_test1)

combined_df.shape before: (197198, 107)
combined_df.shape after: (197198, 108)


#### количество товаров по продавцу (агрегация A по В)

In [12]:
# на основе фактического ассортимента у определённого продавца, собранного через агрегацию itemID по SellerID
# восстановить отсутствующие значения ItemVarietyCount

def fill_item_variety_count(df_train,df_test):

    train_index = df_train.index
    test_index = df_test.index

    # Добавляем метку для разделения датафреймов
    df_train['dataset'] = 'train'
    df_test['dataset'] = 'test'

    # Объединяем датафреймы
    combined_df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

    print('na sum before fill_item_variety_count()', combined_df.isna().sum().sum())

    # Считаем количество товаров для каждого продавца
    seller_item_counts = combined_df.groupby('SellerID', as_index=False).agg({
        'ItemID': 'count'
    }).rename(columns={'ItemID': 'seller_item_count'})

    # Объединяем с основным датафреймом
    combined_df = combined_df.merge(seller_item_counts, on='SellerID', how='left')

    # Восстанавливаем пропуски в ItemVarietyCount
    # Если ItemVarietyCount пропущен, заменяем на количество товаров продавца
    combined_df['ItemVarietyCount'] = combined_df['ItemVarietyCount'].fillna(combined_df['seller_item_count'])

    # Удаляем временный столбец
    combined_df = combined_df.drop('seller_item_count', axis=1)

    print('na sum after fill_item_variety_count()', combined_df.isna().sum().sum())

    # Разделяем обратно на train и test
    df_train = combined_df[combined_df['dataset'] == 'train'].copy()
    df_test = combined_df[combined_df['dataset'] == 'test'].copy()

    # Удаляем служебный столбец
    df_train = df_train.drop('dataset', axis=1)
    df_test = df_test.drop('dataset', axis=1)

    # Восстанавливаем исходные индексы
    df_train.index = train_index
    df_test.index = test_index

    return df_train, df_test


df_train1,df_test1 = fill_item_variety_count(df_train1,df_test1)

na sum before fill_item_variety_count() 1038455
na sum after fill_item_variety_count() 1037458


#### Выявление степени положительности эмоционального окраса в описании товара

In [None]:
# Исходя из предположения о том, что продавцы контрафакта склонны больше приукрашивать, создать признак
# который бы при помощи BERT оценивал бы положительность окраса в столбце description

import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Загрузка токенизатора и модели
tokenizer = BertTokenizer.from_pretrained('blanchefort/rubert-base-cased-sentiment')
model = BertForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f'device is {device}')
model


In [None]:
# перед этим нужно очистить description от html символов

from bs4 import BeautifulSoup
import re

# Пример HTML-кода
html_content = "<div>Hello, <b>world!</b>\nThis is a test.</div>"

def text_clean(text):
  # Создание объекта BeautifulSoup
  soup = BeautifulSoup(text, 'html.parser')

  # Извлечение текста
  text = soup.get_text()

  # Удаление специальных символов
  cleaned_text = re.sub(r'\s+', ' ', text).strip()
  return cleaned_text


print(text_clean(html_content))

In [None]:
import re
from bs4 import BeautifulSoup

def text_special_symbols_count(text):
    # Создание объекта BeautifulSoup и извлечение текста
    text_len = len(text)
    soup = BeautifulSoup(text, 'html.parser')
    extracted = soup.get_text()
    spec_symb_len = len(text) - len(extracted)

    # Подсчёт специальных символов:
    # считаем символы, которые не буквы (латинские/кириллица), не цифры и не пробел
    #special_matches = re.findall(r'[^A-Za-zА-Яа-яЁё0-9\s]', extracted)
    #special_count = len(special_matches)


    return spec_symb_len

calculated_col = df_train1['description'].apply(text_special_symbols_count)

#df_train1['special_symb_count']
calculated_col

180164     30
43525       0
88468      10
131771    140
7004        0
         ... 
171481     10
170125      0
91698       0
155054      0
155493      0
Name: description, Length: 147898, dtype: int64

In [None]:
mini_df = df[['index', 'description', 'resolution']]
mini_df['description'] = mini_df['description'].fillna('').apply(text_clean).str[:500]
mini_df

In [None]:
import time

# Запись времени начала выполнения
start_time = time.time()


#pd.Series([predicted_class.item(), positive_score])

mini_df['description'] = mini_df['description'].fillna('')

print('fillna complete')

mini_df['positive_score'] = mini_df['description'].apply(predict_sentiment)

print('sentiment complete')

end_time = time.time()

# Вычисление времени выполнения
execution_time = end_time - start_time

print(f'time passed {execution_time}')
mini_df

# 56 sec - 100 - 2 outputs
# 66 sec - 100 - 1 output
# 115 sec - 200 - 1 output
# 178 sec - 300
# 4 sec - 300  cuda 1 out
# 13 sec - 1000 cuda
# 130 - 10 000 cuda

#RuntimeError: The size of tensor a (633) must match the size of tensor b (512) at non-singleton dimension 1

#### Добавить информацию о разрешении картинки в качестве признака

In [None]:
import os
from PIL import Image
import pandas as pd
from typing import Dict, Optional, Union

def add_image_dimensions_to_df(
    df: pd.DataFrame,
    image_folder: str,
    image_id_col: str = "ItemID",
    width_col: str = "width",
    height_col: str = "height",
) -> pd.DataFrame:
    file_info: Dict[str, Dict[str, Optional[int]]] = {}

    for filename in os.listdir(image_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
            try:
                filepath = os.path.join(image_folder, filename)

                with Image.open(filepath) as img:
                    width, height = img.size

                photo_id = os.path.splitext(filename)[0]
                file_info[photo_id] = {
                    'width': width,
                    'height': height
                }

            except Exception as e:
                print(f"Ошибка при обработке {filename}: {e}")

    df[width_col] = df[image_id_col].astype(str).map(lambda x: file_info.get(x, {}).get('width'))
    df[height_col] = df[image_id_col].astype(str).map(lambda x: file_info.get(x, {}).get('height'))
    df[width_col] = df[width_col].fillna(0)
    df[height_col] = df[height_col].fillna(0)
    df['res'] = df[width_col].astype('int').astype('str') + 'x' + df[height_col].astype('int').astype('str')

    return df



# создание класса для умного жадного поиска

Поиск будет происходить на основе модели CatBoost, но класс предусматривает использование и других моделей. Создаётся список признаков, обладающих наибольшей значимостью для модели k-признаков с наибольшими значимостями последовательно добавляются в модель, в порядке убывания значимости, проверяется рост качества при добавлении признака после проверки всех признаков, начинается проверка качества модели при удалении признаков дававших прирост. Процесс сопровождается информацией о прогрессе

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import warnings
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')



class SmartGreedySearch:
    def __init__(self, model, X, y, cv=5, feats_to_leave=5, random_state=42, model_params=None):
        self.model = model
        self.X = X
        self.y = y
        self.cv=5
        self.random_state=random_state
        self.feature_names = X.columns.tolist()
        self.history = []
        self.feats_to_leave = feats_to_leave

        if model.__class__.__name__ == 'Pipeline':
            self.model_params = pipeline.steps[-1][1].get_params()
        else:
            self.model_params = self.model.get_params()

    # № 2.5 вызывается на forward_selection, чтобы отдать ему список
    # если у модели нет метода feature_importance, тогда зовёт друга - _get_permutation_ranking
    def get_feature_ranking(self):
        model = clone(self.model)



        if hasattr(model, 'feature_importances_'):
            try:
                model.fit(self.X,self.y)
                importance_df = pd.DataFrame({
                    'feature': self.feature_names,
                    'importance': model.feature_importances_
                }).sort_values('importance', ascending=False)
                importance_df = importance_df.head(self.feats_to_leave)
                return importance_df['feature'].tolist()
            except:
                pass

        if hasattr(model, 'get_feature_importance'):
            try:
                model.fit(self.X,self.y)
                importance_df = pd.DataFrame({
                    'feature': self.feature_names,
                    'importance': model.feature_importances_
                }).sort_values('importance', ascending=False)
                importance_df = importance_df.head(self.feats_to_leave)
                return importance_df['feature'].tolist()
            except:
                pass


        return self._get_permutation_ranking()

    def _get_permutation_ranking(self):
        model = clone(self.model)

        model.fit(self.X,self.y)


        print('no feature_importance method found, \
              starting permutation_importance, may take a while, please wait :D')
        result = permutation_importance(
            model, self.X, self.y,
            n_repeats = 5,
            random_state= self.random_state,
            n_jobs=-1
        )

        importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': result.importances_mean
        }).sort_values('importance', ascending=False)
        importance_df = importance_df.head(self.feats_to_leave)

        return importance_df['feature'].to_list()


    def evaluate_features(self, features):


        model = clone(self.model)

        X_subset = self.X[features]
        params = self.model_params.copy()
        #print("features (incoming):", features)
        #print("X_subset.columns:", X_subset.columns.tolist())

        if model.__class__.__name__ == 'CatBoostClassifier':
            #print("params['cat_features'] (original):", params.get('cat_features'))

            current_cat_features = [f for f in params.get('cat_features', []) if f in features]
            #print("current_cat_features (kept):", current_cat_features)


            params_for_model = params.copy()
            params_for_model['cat_features'] = current_cat_features
            #print(f'model get params: {model.get_params()}')
            model.set_params(**params_for_model)  # можно оставить имена


        scores = cross_val_score(model, X_subset, self.y, cv=self.cv,
                         scoring='f1')

        return np.mean(scores)


    def forward_selection(self):
        """Жадный forward selection"""

        ranked_features = self.get_feature_ranking()
        current_features = []
        best_score = -1

        print("=== FORWARD SELECTION ===")
        i=0
        for feature in ranked_features:
            trial_features = current_features + [feature]
            score = self.evaluate_features(trial_features)
            i+=1
            self.history.append({
                'phase': 'forward',
                'feature': feature,
                'action': 'tested_add',
                'score': score,
                'features': trial_features.copy()
            })

            if score > best_score:
                current_features.append(feature)
                best_score = score
                print(f"{i}. ✓ Added {feature}, score: {score:.4f}")
            else:
                print(f"{i}. ✗ Skipped {feature}, no improvement")

        return current_features, best_score

    def backward_elimination(self, initial_features):
        """Жадный backward elimination"""
        #print(f'line 158 DEBUG: initial_features: {initial_features}')
        current_features = initial_features.copy()
        best_score = self.evaluate_features(current_features)
        i=0


        print("\n=== BACKWARD ELIMINATION ===")
        improved = True

        while improved and len(current_features) > 1:
            improved = False

            for feature in current_features.copy():
                i+=1
                print(f'{i}. trying to remove feature: {feature}')
                #print(f'line 170 DEBUG: feature: {feature}, current_features: {current_features}')
                trial_features = [f for f in current_features if f != feature]
                #print(f'line 172 DEBUG: trial_features (after list comp): {trial_features}')
                score = self.evaluate_features(trial_features)
                self.history.append({
                    'phase': 'backward',
                    'feature': feature,
                    'action': 'tested_remove',
                    'score': score,
                    'features': trial_features.copy()
                })

                if score >= best_score + 0.001:  # Небольшой запас
                    current_features.remove(feature)
                    best_score = score
                    improved = True
                    print(f"✓ Removed {feature}, score: {score:.4f}")
                    break

        return current_features, best_score

    def stepwise_selection(self):
        """Комбинированный stepwise_selection"""
        # Forward phase
        features_forward, score_forward = self.forward_selection()
        #print(f'line 194 DEBUG: features_forward: {features_forward}')

        # Backward phase
        self.features_final, score_final = self.backward_elimination(features_forward)

        print(f"\n=== FINAL RESULTS ===")
        print(f"Final features: {self.features_final}")
        print(f"Final score: {score_final:.4f}")

        return self.features_final, score_final

    def get_history_df(self):
        return pd.DataFrame(self.history)

    def get_best_features(self):
        return self.features_final

# первая попытка выбрать наилучшие признаки

Будут использованы все признаки, кроме 'itemID', 'SellerID', 'brand_name' которые могут привести к переобучению

In [17]:

category_cols = ['price_bin','PriceDiscounted_bins',
                  'description', 'name_rus', 'CommercialTypeName4', 'res']

exclude = [c for c in df_train1.columns if c not in {'itemID', 'SellerID', 'brand_name',
                                                     'resolution'}]

category_cols = list(set(category_cols) & set(exclude))


params = {
    'iterations': 100,          # Количество итераций
    'learning_rate': 0.1,       # Скорость обучения
    'depth': 4,                 # Глубина дерева
    'l2_leaf_reg': 3,           # Регуляризация L2
    'loss_function': 'Logloss',  # Функция потерь для бинарной классификации
    'eval_metric': 'F1',   # Метрика для оценки
    'random_seed': 42,          # Сид для воспроизводимости
    'verbose': 0,               # Отключение вывода
    'cat_features' : category_cols,
    'task_type' : 'GPU'
}

model=CatBoostClassifier(**params)


sgs=SmartGreedySearch(model=model,
                          X=df_train1[exclude],
                          y=df_train1['resolution'],
                          cv=5,
                          random_state=42,
                          feats_to_leave=20)

sgs.stepwise_selection()

=== FORWARD SELECTION ===
✓ Added CommercialTypeName4, score: 0.3643
✓ Added description, score: 0.5439
✓ Added brand_name_max_price, score: 0.6962
✓ Added name_rus, score: 0.7101
✓ Added item_time_alive, score: 0.7216
✓ Added res, score: 0.7228
✓ Added len_description, score: 0.7231
✗ Skipped brand_name_price_std, no improvement
✓ Added brand_name_weighted_rating, score: 0.7244
✓ Added brand_name_price_count, score: 0.7280
✓ Added ItemVarietyCount_median_price, score: 0.7301
✗ Skipped SellerID_price_std, no improvement
✗ Skipped len_name_rus, no improvement
✗ Skipped seller_items_count, no improvement
✗ Skipped SellerID_weighted_rating, no improvement
✗ Skipped CommercialTypeName4_price_std, no improvement
✗ Skipped SellerID_price_count, no improvement
✗ Skipped brand_name_min_price, no improvement
✗ Skipped GmvTotal30, no improvement
✗ Skipped count_words_description, no improvement

=== BACKWARD ELIMINATION ===
trying to remove feature: CommercialTypeName4
trying to remove feature: 

(['CommercialTypeName4',
  'description',
  'brand_name_max_price',
  'name_rus',
  'item_time_alive',
  'res',
  'len_description',
  'brand_name_weighted_rating',
  'brand_name_price_count',
  'ItemVarietyCount_median_price'],
 np.float64(0.7301149552622161))

Проверка качества модели на тестовой части выборки

In [25]:
category_cols = ['price_bin','PriceDiscounted_bins',
                  'description', 'name_rus', 'CommercialTypeName4', 'res']

final_features = ['CommercialTypeName4', 'description',  'brand_name_max_price',  'name_rus',  'item_time_alive',  'res',
  'len_description',  'brand_name_weighted_rating', 'brand_name_price_count', 'ItemVarietyCount_median_price']
category_cols = list(set(category_cols) & set(final_features))



params = {
    'iterations': 100,          # Количество итераций
    'learning_rate': 0.1,       # Скорость обучения
    'depth': 4,                 # Глубина дерева
    'l2_leaf_reg': 3,           # Регуляризация L2
    'loss_function': 'Logloss',  # Функция потерь для бинарной классификации
    'eval_metric': 'F1',   # Метрика для оценки
    'random_seed': 42,          # Сид для воспроизводимости
    'verbose': 0,               # Отключение вывода
    'cat_features' : category_cols
}

model=CatBoostClassifier(**params)

model.fit(df_train1[final_features], df_train1['resolution'])

X = df_test1[final_features]
y = df_test1['resolution']

y_pred = model.predict(X)

print('f1_score:', f1_score(y, y_pred))

f1_score: 0.735488238280589


# Вторая попытка выбрать наилучшие признаки

In [None]:
Исключаются признаки CommercialTypeName4 и description, будет проверено топ 95 признаков по значимости

In [28]:
category_cols = ['price_bin','PriceDiscounted_bins',
                  'description', 'name_rus', 'CommercialTypeName4', 'res']

exclude = [c for c in df_train1.columns if c not in {'itemID', 'SellerID', 'brand_name',
                                                     'resolution', 'CommercialTypeName4',
                                                     'description'}]

category_cols = list(set(category_cols) & set(exclude))


params = {
    'iterations': 100,          # Количество итераций
    'learning_rate': 0.1,       # Скорость обучения
    'depth': 4,                 # Глубина дерева
    'l2_leaf_reg': 3,           # Регуляризация L2
    'loss_function': 'Logloss',  # Функция потерь для бинарной классификации
    'eval_metric': 'F1',   # Метрика для оценки
    'random_seed': 42,          # Сид для воспроизводимости
    'verbose': 0,               # Отключение вывода
    'cat_features' : category_cols
}

model=CatBoostClassifier(**params)


sgs=SmartGreedySearch(model=model,
                          X=df_train1[exclude],
                          y=df_train1['resolution'],
                          cv=5,
                          random_state=42,
                          feats_to_leave=95)

sgs.stepwise_selection()

=== FORWARD SELECTION ===
1. ✓ Added CommercialTypeName4_price_count, score: 0.0000
2. ✓ Added name_rus, score: 0.4101
3. ✓ Added item_time_alive, score: 0.4288
4. ✓ Added CommercialTypeName4_max_price, score: 0.5270
5. ✓ Added CommercialTypeName4_mean_price, score: 0.5674
6. ✗ Skipped CommercialTypeName4_median_price, no improvement
7. ✓ Added brand_name_max_price, score: 0.6263
8. ✓ Added CommercialTypeName4_weighted_rating, score: 0.6299
9. ✓ Added CommercialTypeName4_price_std, score: 0.6306
10. ✗ Skipped SellerID_price_count, no improvement
11. ✓ Added res, score: 0.6433
12. ✓ Added CommercialTypeName4_min_price, score: 0.6445
13. ✓ Added brand_name_price_count, score: 0.6501
14. ✓ Added SellerID_weighted_rating, score: 0.6526
15. ✓ Added SellerID_median_price, score: 0.6538
16. ✓ Added brand_name_weighted_rating, score: 0.6590
17. ✓ Added ItemAvailableCount, score: 0.6613
18. ✗ Skipped seller_items_count, no improvement
19. ✗ Skipped count_words_name_rus, no improvement
20. ✗ Ski

(['CommercialTypeName4_price_count',
  'name_rus',
  'item_time_alive',
  'CommercialTypeName4_max_price',
  'CommercialTypeName4_mean_price',
  'brand_name_max_price',
  'CommercialTypeName4_weighted_rating',
  'CommercialTypeName4_price_std',
  'res',
  'CommercialTypeName4_min_price',
  'brand_name_price_count',
  'SellerID_weighted_rating',
  'SellerID_median_price',
  'brand_name_weighted_rating',
  'ItemAvailableCount',
  'ExemplarReturnedCountTotal90'],
 np.float64(0.6624330287116098))

In [29]:
category_cols = ['price_bin','PriceDiscounted_bins',
                  'description', 'name_rus', 'CommercialTypeName4', 'res']

final_features = ['CommercialTypeName4_price_count', 'name_rus', 'item_time_alive',
                  'CommercialTypeName4_median_price', 'CommercialTypeName4_max_price',
                  'brand_name_max_price', 'CommercialTypeName4_mean_price',
                  'CommercialTypeName4_price_std', 'res', 'CommercialTypeName4_min_price',
                  'brand_name_price_count', 'brand_name_price_std', 'brand_name_weighted_rating',
                  'count_words_name_rus', 'brand_name_median_price', 'SellerID_mean_price',
                 'brand_name_mean_price']


category_cols = list(set(category_cols) & set(final_features))



params = {
    'iterations': 100,          # Количество итераций
    'learning_rate': 0.1,       # Скорость обучения
    'depth': 4,                 # Глубина дерева
    'l2_leaf_reg': 3,           # Регуляризация L2
    'loss_function': 'Logloss',  # Функция потерь для бинарной классификации
    'eval_metric': 'F1',   # Метрика для оценки
    'random_seed': 42,          # Сид для воспроизводимости
    'verbose': 0,               # Отключение вывода
    'cat_features' : category_cols
}

model=CatBoostClassifier(**params)

model.fit(df_train1[final_features], df_train1['resolution'])

X = df_test1[final_features]
y = df_test1['resolution']

y_pred = model.predict(X)

print('f1_score:', f1_score(y, y_pred))

f1_score: 0.6764547896150402
