In [1]:
import pandas as pd

# Both classifier sheets live in the same workbook; each one is read into its own frame.
CLASSIFICATION_XLSX = '../data/train_Ametist/classification.xlsx'
classification1, classification2 = (
    pd.read_excel(CLASSIFICATION_XLSX, sheet_name=sheet)
    for sheet in ('Материалы, изд, констр и оборуд', 'Машины и механизмы')
)
# Stack the two sheets row-wise and drop every row that has a missing value in any column.
classification = pd.concat([classification1, classification2], axis=0).dropna()
train_data = pd.read_excel('../data/train_Ametist/train.xlsx')

In [2]:
def extract_group(text):
    """Return the group title stored in a classifier cell, or None for non-group cells.

    The value is converted to a string first. When it contains 'Группа', the text
    after the last ':' is taken (the whole string if there is no ':'), every
    'Группа ' occurrence is removed and surrounding whitespace is stripped.
    """
    cell = str(text)
    if 'Группа' not in cell:
        return None
    # rpartition yields the full string as the tail when ':' is absent.
    tail = cell.rpartition(':')[2]
    return tail.replace('Группа ', '').strip()


def _group_code(full_code):
    """Return the dot-joined components after the fourth one, taken from the part of
    ``full_code`` that precedes the first '-'."""
    return '.'.join(full_code.split('-')[0].split('.')[4:])


CODE_COLUMN = 'Классификатор строительных ресурсов'
CODE_CHARS = '0123456789.-'

classification_groups = classification.copy()
# Rows that mention 'Группа' carry a group title; all other rows get None here.
classification_groups['group'] = classification_groups[CODE_COLUMN].apply(extract_group)
# Propagate each group title down to the rows that follow it.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
classification_groups['group'] = classification_groups['group'].ffill()
classification_groups = classification_groups.dropna()
classification_groups = classification_groups.loc[classification_groups['group'].apply(len) > 0]
# Keep only rows whose code cell consists solely of digits, dots and dashes.
classification_groups = classification_groups.loc[
    classification_groups[CODE_COLUMN].apply(lambda code: all(char in CODE_CHARS for char in code))
]
# One row per group title; the last matching row wins.
classification_groups = classification_groups.drop_duplicates(subset='group', keep='last')
classification_groups = (
    classification_groups
    .drop(columns=['Unnamed: 1', 'Unnamed: 2'])
    .rename(columns={CODE_COLUMN: 'code'})
    .reset_index(drop=True)
)
# Apply the same code reduction to both tables so the codes are directly comparable.
classification_groups['code'] = classification_groups['code'].apply(_group_code)
train_data['group_code'] = train_data['ref_code'].apply(_group_code)

In [10]:
import spacy
import string
import re
import time


nlp = spacy.load('ru_core_news_sm')  # the model must be downloaded first: python -m spacy download ru_core_news_sm


def strip_features_and_normaize(text):
    """Reduce a text to the space-joined lemmas of its selected tokens.

    Steps:
      1. every '-' (with any surrounding whitespace) is replaced by '/';
      2. fragments enclosed in (), [] or {} are removed (non-greedy match);
      3. the text is parsed with the module-level ``nlp`` pipeline;
      4. a token is kept when its ``pos_`` and ``dep_`` are in the lists below
         and its text is longer than two characters;
      5. the stripped lemmas of the kept tokens are joined with single spaces.
    """
    kept_pos = ('NOUN', 'ADJ', 'VERB', 'PROPN')
    kept_deps = ('ROOT', 'amod', 'nsubj',
                 'advmod', 'nmod', 'nsubjpass',
                 'nmod:npmod', 'nmod:poss',
                 'nmod:tmod', 'obl')

    cleaned = re.sub(r'\s*-\s*', '/', text)
    cleaned = re.sub(r'\(.*?\)|\[.*?\]|\{.*?\}', '', cleaned)

    lemmas = []
    for token in nlp(cleaned):
        if token.pos_ not in kept_pos:
            continue
        if token.dep_ not in kept_deps:
            continue
        if len(token.text) <= 2:
            continue
        lemmas.append(token.lemma_.strip())

    return ' '.join(lemmas)


# Measure how long normalising all three text columns takes (seconds, printed below).
start = time.time()

stripped_columns = {
    'record_name': 'record_name_strip',
    'record_name_2': 'record_name_strip_2',
}
for source_column, stripped_column in stripped_columns.items():
    train_data[stripped_column] = train_data[source_column].apply(strip_features_and_normaize)
classification_groups['group_strip'] = classification_groups['group'].apply(strip_features_and_normaize)

elapsed_seconds = time.time() - start
print(elapsed_seconds)

23.764219760894775


In [4]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('DiTy/bi-encoder-russian-msmarco')

# Encode each normalised text column into a single tensor, in this order:
# group titles, first record name, second record name.
group_embeddings, wrong_embeddings1, wrong_embeddings2 = (
    model.encode(texts.to_list(), convert_to_tensor=True)
    for texts in (
        classification_groups.group_strip,
        train_data.record_name_strip,
        train_data.record_name_strip_2,
    )
)



In [5]:
# Distinct codes of all classifier groups, kept as a set for membership checks.
group_codes = set(classification_groups['code'])

# Number of training rows whose group code occurs among the classifier groups.
num_equal_codes = sum(1 for code in train_data['group_code'] if code in group_codes)
num_equal_codes

2128

## One-level matching

In [84]:
from tqdm.notebook import trange
import torch.nn.functional as F
import torch

# Running count of records whose best-scoring group code passes `check_matching`.
correct_matches = 0


def check_matching(match_from_code, match_to_codes, emb_distances, depth=-1):
    """Check whether the best-scoring candidate code matches the reference code.

    Args:
        match_from_code: reference code, a dot-separated string.
        match_to_codes: candidate codes, a list or a pandas Series; position ``k``
            must correspond to score ``k`` in ``emb_distances``.
        emb_distances: tensor of scores, one per candidate; the flattened argmax
            selects the predicted candidate.
        depth: when positive, only the first ``depth`` dot-separated components
            are compared; otherwise the full codes must be equal.

    Returns:
        True when the selected candidate matches, False otherwise.
    """
    best_position = torch.argmax(emb_distances).item()
    # Look the code up by position: plain [] on a Series is label-based and picks the
    # wrong row (or raises) when the index is not 0..n-1. Lists have no `.iloc`.
    by_position = getattr(match_to_codes, 'iloc', match_to_codes)
    best_code = by_position[best_position]

    if depth > 0:
        # Compare component lists, not concatenated strings: joining with '' would
        # make '1.11' and '11.1' indistinguishable.
        return match_from_code.split('.')[:depth] == best_code.split('.')[:depth]
    return match_from_code == best_code


# Score every `record_name_strip` embedding against all group embeddings and count
# the rows whose best-scoring group carries the row's own group code.
for i in trange(len(wrong_embeddings1)):
    query_emb = wrong_embeddings1[i].unsqueeze(0)
    # Absolute cosine similarity between each group and the current record.
    group_emb_distances = util.cos_sim(group_embeddings, query_emb).abs()

    is_hit = check_matching(train_data.group_code.iloc[i], classification_groups.code, group_emb_distances, depth=-1)
    correct_matches += int(is_hit)
# Hits relative to the number of rows whose code exists among the groups.
correct_matches / num_equal_codes

  0%|          | 0/2407 [00:00<?, ?it/s]

0.12077067669172932

In [85]:
# Same evaluation as for the first name column, now on the `record_name_strip_2` embeddings.
correct_matches = 0

for i in trange(len(wrong_embeddings2)):
    query_emb = wrong_embeddings2[i].unsqueeze(0)
    # Absolute cosine similarity between each group and the current record.
    group_emb_distances = util.cos_sim(group_embeddings, query_emb).abs()

    is_hit = check_matching(train_data.group_code.iloc[i], classification_groups.code, group_emb_distances, depth=-1)
    correct_matches += int(is_hit)
# Hits relative to the number of rows whose code exists among the groups.
correct_matches / num_equal_codes

  0%|          | 0/2407 [00:00<?, ?it/s]

0.11701127819548872

## Hierarchical matching

In [16]:
def _split_levels(code):
    """Return the four hierarchy components of a dot-separated classifier code.

    Components 2 and 3 are cut at their first '-'. A code without a fourth
    component uses the placeholder '01-0' for it.
    """
    parts = code.split('.')
    level3 = parts[3] if len(parts) > 3 else '01-0'
    return parts[0], parts[1], parts[2].split('-')[0], level3.split('-')[0]


hierarchial_classification = classification.rename(
    columns={'Классификатор строительных ресурсов': 'code', 'Unnamed: 1': 'name', 'Unnamed: 2': 'measure'}
)
# Keep rows whose code is made only of digits, dashes and dots ...
is_plain_code = hierarchial_classification['code'].apply(lambda code: all(char in '0123456789-.' for char in code))
hierarchial_classification = hierarchial_classification.loc[is_plain_code]
# ... and has fewer than five dot-separated components.
has_few_parts = hierarchial_classification['code'].apply(lambda code: len(code.split('.')) < 5)
hierarchial_classification = hierarchial_classification.loc[has_few_parts]
# One row per resource name; the first occurrence wins.
hierarchial_classification = hierarchial_classification.drop_duplicates(subset='name', keep='first')

# Derive one column per hierarchy level from the code.
level_parts = [_split_levels(code) for code in hierarchial_classification['code']]
for level, column in enumerate(['level0_code', 'level1_code', 'level2_code', 'level3_code']):
    hierarchial_classification[column] = [parts[level] for parts in level_parts]
print(hierarchial_classification.shape[0])
hierarchial_classification.head(10)

103139


Unnamed: 0,code,name,measure,level0_code,level1_code,level2_code,level3_code
5,01.1.01.01-0002,Детали фасонные коньковые к листам хризотилцем...,100 компл,1,1,1,1
8,01.1.01.02-0011,"Доска электротехническая дугостойкая (АЦЭИД), ...",т,1,1,1,2
11,01.1.01.04-1018,"Листы хризотилцементные волнистые, профиль 40/...",м2,1,1,1,4
13,01.1.01.04-1022,"Листы хризотилцементные волнистые, профиль 40/...",м2,1,1,1,4
15,01.1.01.04-1024,"Листы хризотилцементные волнистые, профиль 40/...",м2,1,1,1,4
17,01.1.01.04-1032,"Листы хризотилцементные волнистые, профиль 40/...",м2,1,1,1,4
19,01.1.01.04-1038,"Листы хризотилцементные волнистые, профиль 51/...",м2,1,1,1,4
23,01.1.01.04-1046,"Листы хризотилцементные волнистые, профиль 51/...",м2,1,1,1,4
27,01.1.01.04-1068,"Листы хризотилцементные волнистые, профиль 40/...",м2,1,1,1,4
30,01.1.01.05-0011,Листы хризотилцементные плоские непрессованные...,м2,1,1,1,5


In [66]:
%%time

# For every top-level code: encode all resource names in the group, average the
# embeddings over the group and move the result to the CPU.
level0_embs = hierarchial_classification.groupby('level0_code')['name'].apply(
    lambda names: model.encode(names.to_list(), convert_to_tensor=True).mean(dim=0).cpu()
)

CPU times: user 1min 7s, sys: 1.55 s, total: 1min 9s
Wall time: 55.2 s


In [None]:
from tqdm.notebook import trange

# It would be better to precompute the levelwise embeddings in advance!


def predict_hierachically(match_from_name, hierarchial_classification):
    """Greedily walk down the classifier hierarchy and return the predicted code prefix.

    At each of up to four levels, the resource names of every candidate group are
    encoded and averaged into one embedding per group. The group with the highest
    absolute cosine similarity to the query embedding is selected, and the search
    continues inside that group. Level 0 reuses the module-level ``level0_embs``;
    deeper levels are encoded on the fly with the module-level ``model``.

    Args:
        match_from_name: query text to classify.
        hierarchial_classification: DataFrame with a ``name`` column and the columns
            ``level0_code`` .. ``level3_code``.

    Returns:
        The selected level codes joined with '.'; fewer than four components when a
        level column of the current subset contains missing values.
    """
    match_from_emb = model.encode(match_from_name, convert_to_tensor=True)
    best_group = []
    # The frame is only grouped and filtered, never mutated, so no copy is needed.
    level_frame = hierarchial_classification

    for hierarchy_level in range(4):
        level_column = f'level{hierarchy_level}_code'
        if level_frame[level_column].isna().any():
            break

        if hierarchy_level == 0:
            level_embs = level0_embs
        else:
            level_embs = level_frame.groupby(level_column)['name'].apply(
                lambda names: model.encode(names.to_list(), convert_to_tensor=True).mean(dim=0).cpu()
            )
        # Follow the query embedding's device instead of hard-coding 'cuda:0', so the
        # function also works on CPU-only machines.
        level_matrix = torch.stack(level_embs.to_list()).to(match_from_emb.device)

        emb_distances = torch.abs(F.cosine_similarity(level_matrix, match_from_emb.unsqueeze(0), dim=1))

        # Read the key from the index of the Series that was scored, so the chosen key
        # always lines up with the chosen embedding row.
        best_group_on_level = level_embs.index[torch.argmax(emb_distances).item()]

        best_group.append(best_group_on_level)
        level_frame = level_frame.loc[level_frame[level_column] == best_group_on_level]

    return '.'.join(best_group)

# Predict a hierarchical code for every training record.
pred_groups = list()
for idx in trange(train_data.shape[0]):
    pred_groups.append(predict_hierachically(train_data.record_name_strip.iloc[idx], hierarchial_classification))

# Accuracy of the prediction truncated to the first 1, 2 and 3 code components.
for i in range(3):
    level_group_code_gt = train_data.group_code.apply(lambda x: '.'.join(x.split('.')[:i+1]))
    # The original passed `pred_groups` to str.join (misplaced parenthesis), leaving
    # map() with a single argument and raising TypeError.
    level_group_code_pred = ['.'.join(pred.split('.')[:i+1]) for pred in pred_groups]

    print(sum(level_group_code_gt == level_group_code_pred) / len(level_group_code_pred))
    
# match_from_name = train_data.record_name_strip.iloc[idx]
# match_from_emb = model.encode(match_from_name, convert_to_tensor=True)
# best_group = list()
# levelwise_groupby = hierarchial_classification.copy()

# print('taget:', match_from_name)
# print('target_group:', train_data.group_code.iloc[idx])

# for hierarchy_level in range(4):
#     if levelwise_groupby[f'level{hierarchy_level}_code'].isna().any():
#         break

#     levelwise_groupby = levelwise_groupby.groupby(f'level{hierarchy_level}_code')
#     if hierarchy_level == 0:
#         level_embs = level0_embs
#     else:
#         level_embs = levelwise_groupby.name.apply(lambda x: model.encode(x.to_list(), convert_to_tensor=True).mean(dim=0).cpu())
#     level_embs = torch.stack(level_embs.to_list()).to('cuda:0')

#     emb_distances = torch.abs(F.cosine_similarity(level_embs, match_from_emb.unsqueeze(0), dim=1))

#     best_group_on_level = list(levelwise_groupby.groups.keys())[torch.argmax(emb_distances).cpu().item()]

#     best_group.append(best_group_on_level)
#     levelwise_groupby = levelwise_groupby.get_group(best_group_on_level)

# print('predicted_group:', '.'.join(best_group))
# hierarchial_classification.loc[hierarchial_classification.code.apply(lambda x: x.startswith('.'.join(best_group)))]