In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb


from path_setup import setup_paths
setup_paths()

In [2]:
from dataset_transformer import BaseDatasetTransform
from base_model_train import BaseModelTrain
from model_evaluator import ModelEvaluator

In [67]:
df = pd.read_excel('../datasets/splat.xlsx')

In [68]:
df

Unnamed: 0,PrimaryItemID,PrimaryItemCode,Article,PrimaryItemName,Поставщик,Бренд,ManufacturerName,BrandName,SubBrandName,SegmentName,CategoryName,SubCategoryName,ProductTypeName,AgeSegmentName,Weight,Volume,Quantity
0,394690.0,13879,,З/П РОМАШКИН ЛУГ УГОЛЬ СЕРЕБРО 100МЛ,,,,,,Oral care,Зубная паста,,,Для детей,,,
1,394674.0,13862,,З/П РОМАШКИН ЛУГ ЦЕЛЕБНЫЕ ТРАВЫ 100МЛ,,,,,,Oral care,Зубная паста,,,Для детей,,,
2,399746.0,13869,,З/Щ SILCAMED СРЕД ТРОЙНОЕ ДЕЙСТВИЕ,,,DENTAL KOSMETIK,SILCAMED,,Oral care,Зубные щетки,Мануальные,,,,,
3,399997.0,14417,,БОС ЖИДКИЙ1200МЛ.ОТБ ЖМС,,,AIST/SPB,,,Home care,LAUNDRY,,,,,,
4,383853.0,7765,,&.АМП.Бумага для выпечки 6м,,,,,,Home care,OTHERS,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17930,,37308,,Лонга Вита з/щ бамбуковая д/взр,Не присвоено,LONGA VITA,,,,,,,,,,,
17931,,37309,,"Лонга Вита з/щ Русские мотивы,",Не присвоено,LONGA VITA,,,,,,,,,,,
17932,,37311,,ЭЛМ.зуб.пас.детская 50мл,Не присвоено,ELMEX,,,,,,,,,,,
17933,,37312,,IN Зубн.паста.Sensodyne Мгновенный эффект 75мл,ГЛАКСОСМИТКЛЯЙН ХЕЛСКЕР ЗАО,SENSODYNE,,,,,,,,,,,


In [69]:
df = df.drop(['PrimaryItemID','PrimaryItemCode','Article','ProductTypeName'],axis=1)
for column in df.columns:
    df[column] = df[column].astype('object')
df.head()

Unnamed: 0,PrimaryItemName,Поставщик,Бренд,ManufacturerName,BrandName,SubBrandName,SegmentName,CategoryName,SubCategoryName,AgeSegmentName,Weight,Volume,Quantity
0,З/П РОМАШКИН ЛУГ УГОЛЬ СЕРЕБРО 100МЛ,,,,,,Oral care,Зубная паста,,Для детей,,,
1,З/П РОМАШКИН ЛУГ ЦЕЛЕБНЫЕ ТРАВЫ 100МЛ,,,,,,Oral care,Зубная паста,,Для детей,,,
2,З/Щ SILCAMED СРЕД ТРОЙНОЕ ДЕЙСТВИЕ,,,DENTAL KOSMETIK,SILCAMED,,Oral care,Зубные щетки,Мануальные,,,,
3,БОС ЖИДКИЙ1200МЛ.ОТБ ЖМС,,,AIST/SPB,,,Home care,LAUNDRY,,,,,
4,&.АМП.Бумага для выпечки 6м,,,,,,Home care,OTHERS,,,,,


In [70]:
columns_to_combne = df.drop(['SegmentName','Volume','Weight','Quantity'],axis=1).columns
df['combined_text'] = df[columns_to_combne].fillna('').agg(', '.join, axis=1)
df = df.drop(columns_to_combne,axis=1)
df.head()

Unnamed: 0,SegmentName,Weight,Volume,Quantity,combined_text
0,Oral care,,,,"З/П РОМАШКИН ЛУГ УГОЛЬ СЕРЕБРО 100МЛ, , , , ,..."
1,Oral care,,,,"З/П РОМАШКИН ЛУГ ЦЕЛЕБНЫЕ ТРАВЫ 100МЛ, , , , ..."
2,Oral care,,,,"З/Щ SILCAMED СРЕД ТРОЙНОЕ ДЕЙСТВИЕ, , , DENTA..."
3,Home care,,,,"БОС ЖИДКИЙ1200МЛ.ОТБ ЖМС, , , AIST/SPB, , , LA..."
4,Home care,,,,"&.АМП.Бумага для выпечки 6м, , , , , , OTHERS, ,"


In [72]:
df = df.drop(['Weight','Volume','Quantity'],axis=1)

In [73]:
df.head()

Unnamed: 0,SegmentName,combined_text
0,Oral care,"З/П РОМАШКИН ЛУГ УГОЛЬ СЕРЕБРО 100МЛ, , , , ,..."
1,Oral care,"З/П РОМАШКИН ЛУГ ЦЕЛЕБНЫЕ ТРАВЫ 100МЛ, , , , ..."
2,Oral care,"З/Щ SILCAMED СРЕД ТРОЙНОЕ ДЕЙСТВИЕ, , , DENTA..."
3,Home care,"БОС ЖИДКИЙ1200МЛ.ОТБ ЖМС, , , AIST/SPB, , , LA..."
4,Home care,"&.АМП.Бумага для выпечки 6м, , , , , , OTHERS, ,"


In [74]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd

from transformers import BertTokenizer, BertModel

model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)



In [75]:
from tqdm import tqdm

def get_embeddings(texts, batch_size):
    embeddings_list = []
    model.eval()  
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        
        inputs = {key: value.to(model.device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)

            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings_list.extend(batch_embeddings.cpu().numpy().tolist())
    
    return embeddings_list


In [None]:
#descriptions = df['combined_text'].tolist()
#embeddings = get_embeddings(descriptions, 2048)

In [78]:
splat_embeddings = np.load("splat_embeddings.npy")

In [79]:
col = [f'{i+1}_feature' for i in range(splat_embeddings.shape[1])]

df_body = pd.DataFrame(splat_embeddings, columns=col)

df = pd.concat([df, df_body], axis=1)

In [81]:
df = df.drop('combined_text',axis=1)

In [87]:
df = df[~df['SegmentName'].isna()]

In [88]:
bdf = BaseDatasetTransform(df,target='SegmentName')
X_train, X_test, y_train, y_test,categorical_features = bdf.fit_transform()

Series([], dtype: int64)

В колонке(ах) ['SegmentName'] нет пропущенных значений
В наборе данных нет пропущенных значений
-------------------------------------------
Информация о колонках в датасете

Категориальные колонки:
Index([], dtype='object')

Числовые колонки:
Index(['1_feature', '2_feature', '3_feature', '4_feature', '5_feature',
       '6_feature', '7_feature', '8_feature', '9_feature', '10_feature',
       ...
       '759_feature', '760_feature', '761_feature', '762_feature',
       '763_feature', '764_feature', '765_feature', '766_feature',
       '767_feature', '768_feature'],
      dtype='object', length=768)


In [89]:
lgbm_model = lgb.LGBMClassifier(verbose=-1)

columns_to_delete = []  

unique_classes = len(set(y_train))
print(unique_classes)

model_train = BaseModelTrain(
    model=lgbm_model,
    columns_to_delete=columns_to_delete,
    categorical_features=categorical_features,
    boosting_type='gbdt',
    objective='multiclass',  
    num_class=unique_classes, 
    metric='multi_error' 
)

1


In [90]:
metrics = ['accuracy', 'precision', 'recall', 'f1_macro']
evaluator = ModelEvaluator(model_trainer=model_train, metrics=metrics)

evaluator.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [91]:
evaluator.evaluate_to_dataframe(X_test, y_test)

Unnamed: 0,Metric,Score
0,accuracy,0.914427
1,precision,0.913345
2,recall,0.786639
3,f1,0.825841
