In [1]:
# системные пакеты
import os
from glob import glob
from pymystem3 import Mystem
import pandas as pd
import json
import pickle
import joblib
from collections import Counter
from pymystem3 import Mystem

In [2]:
# пакеты машинного обучения
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Получение локального пути приложения
pp = os.getcwd()
PATH_DD = pp.split('group_mlada')[0] + 'group_mlada/'

In [4]:
# Читаем тренировочные данные
train_path = f'{PATH_DD}/data/train/train.parquet'
train = pd.read_parquet(train_path, engine='pyarrow')

In [5]:
# читаем файл с описаниями
text_and_bert_path = f'{PATH_DD}/data/train/text_and_bert.parquet'
text_ful = pd.read_parquet(text_and_bert_path, engine='pyarrow')

In [6]:
# Читаем файл с атрибутами
attributes_path = f'{PATH_DD}/data/train/attributes.parquet'
attributes = pd.read_parquet(attributes_path, engine='pyarrow')

In [7]:
stat_len_categories1 = Counter()
stat_len_categories2 = Counter()
stat_len_categories3 = Counter()
stat_len_categories4 = Counter()

In [8]:
for i in range(0, len(attributes)):
    dd = json.loads(attributes.iloc[i]['categories'])
    stat_len_categories1[dd['1']] +=1
    stat_len_categories2[dd['2']] +=1
    stat_len_categories3[dd['3']] +=1
    stat_len_categories4[dd['4']] +=1

In [9]:
characteristic = Counter()

In [10]:
for i in range(0, len(attributes)):
    dd = json.loads(attributes.iloc[i]['characteristic_attributes_mapping'])
    for key in dd:
        characteristic[key] += 1

In [12]:
def get_dit_param(ar_param, litera:str) ->dict:
    result = {}
    anum = 0
    for key, _ in ar_param.most_common():
        result[key] = litera + str(anum)
        anum += 1
    return result

In [13]:
d_characteristic = get_dit_param(characteristic, 'e')
d_categories2 = get_dit_param(stat_len_categories2, 'b')
d_categories3 = get_dit_param(stat_len_categories3, 'c')
d_categories4 = get_dit_param(stat_len_categories4, 'd')
d_all = {'d2': d_categories2,
         'd3': d_categories3,
         'd4': d_categories4,
         'dc': d_characteristic}

In [14]:
def get_desc_obj(d_all, row):
    '''
    Получение описание объекта в кодах-словах
    '''
    r_d = []
    dd = json.loads(row['categories'])
    r_d.append( d_all['d2'][dd['2']])
    r_d.append( d_all['d3'][dd['3']])
    r_d.append( d_all['d4'][dd['4']])
    for key in json.loads(row['characteristic_attributes_mapping']):
        r_d.append(d_all['dc'][key])
    return ' '.join(r_d)

In [15]:
desc_all_obj ={}
all_v = []
for i in range(0, len(attributes)):
    v = attributes.iloc[i]['variantid']
    all_v.append(str(v).strip())
    desc_all_obj[str(v).strip()] = get_desc_obj(d_all, attributes.iloc[i])

In [16]:
# Распечатка строки описания объектов в кодах категорий
def print_obj(desc_all_obj, nn: int):
    k = 0
    for key in desc_all_obj:
        if k == nn:
                print(key, desc_all_obj[key], sep=' ==> ')
                break
        k += 1

In [17]:
print_obj(desc_all_obj, 999)
print_obj(desc_all_obj, 1000)
print_obj(desc_all_obj, 1001)

628056916 ==> b9 c52 d108 e978 e494 e305 e423 e239 e230 e297 e580 e257 e19 e9 e518 e4 e14 e255 e0 e1 e452 e298 e405 e380 e68 e700 e394 e463 e638 e241 e363 e236 e359 e419 e849 e412 e672 e268 e783 e244 e273 e289 e238 e252 e616 e2343 e2 e418 e443 e685 e403 e3 e6 e275 e69 e295
628105125 ==> b4 c252 d1174 e4 e14 e100 e27 e0 e1 e63 e44 e182 e1780 e3 e1799 e6 e151 e2 e145 e5
628139361 ==> b4 c2 d0 e8 e27 e0 e2


In [18]:
m = Mystem()

In [24]:
import re
def get_diig(ss: str) -> str:
    '''
    удаление полностью чисел
    '''
    return re.sub(r'\d', '', ss)

In [19]:
def get_lem_words_three(ss: str) -> list:
    '''
    Получение слов из строки
    '''
    ww= []
    ss = ss.strip().lower()
    ddel = ['*', ',', '-', '/', '%', ';', ')', '(', '[', ']','+', '.', '..', ':']
    for i in ddel:
        ss = ss.replace(i, ' ')
    ssl = ss.split()
    for word in ssl:
        # print(word)
        wwd = get_diig(word)
        if len(wwd) > 1:
            lemmas = m.lemmatize(wwd)
            ww.append(lemmas[0])
    return ww

In [20]:
def get_sequence_three(row):
    '''
    Получение предложения описания объекта в кодах-словах
    '''
    all_word = []
    ar = json.loads(row['characteristic_attributes_mapping'])
    for key in ar:
        if isinstance(ar[key], list):
            # print(key)
            for ww in ar[key]:
                
                all_word = all_word + get_lem_words_three(ww)
            # print()
    return ' '.join(all_word)

In [25]:
X_words_three = []
pusto = 0
Y_var_three = []

for i in range(0, len(attributes)):
    v = attributes.iloc[i]['variantid']
    Y_var_three.append(str(v).strip())
    sequence_three = get_sequence_three(attributes.iloc[i])
    if sequence_three:
        X_words_three.append(sequence_three)
    else:
        X_words_three.append(['pusto'])
        pusto += 1
print(len(X_words_three))
print(len(X_words_three))

2252569
2252569


In [26]:
desc_all_obj4= {}
for key, sens in zip(Y_var_three, X_words_three):
    desc_all_obj4[key] = sens

In [27]:
X4=[]
Y4=[]
for row in train.iterrows():
    if str(row[1]['variantid1']).strip() in desc_all_obj and str(row[1]['variantid2']).strip() in desc_all_obj:
        # print(desc_all_obj[row[1]['variantid1']])
        a = ''
        b = ''
        if type(desc_all_obj4[str(row[1]['variantid1'])]) == list:
            a = 'pusto'
        else: a = desc_all_obj4[str(row[1]['variantid1']).strip()]
        if type(desc_all_obj4[str(row[1]['variantid2'])]) == list:
            b = 'pusto'
        else: b = desc_all_obj4[str(row[1]['variantid2']).strip()]
        X4.append(desc_all_obj[str(row[1]['variantid1']).strip()] + ' ' + a + ' ' +
                  desc_all_obj[str(row[1]['variantid2']).strip()] + ' ' + b)
        Y4.append(row[1]['target'])

In [3]:
# Классификатор Random Forest
def train_classifier(data, labels):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
    
    random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    
    y_pred_rf = random_forest_model.predict(X_test)
    print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
    precision = precision_score(y_test, y_pred_rf)
    recall = recall_score(y_test, y_pred_rf)
    auc = roc_auc_score(y_test, y_pred_rf)
    # print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'AUC: {auc:.4f}')
    
    return random_forest_model, vectorizer

In [5]:
pp = os.getcwd()
PATH_DD = pp.split('group_mlada')[0] + 'group_mlada/'
with open(PATH_DD + 'data/X4.pickle', 'br') as f:
    X4 = pickle.load(f)
with open(PATH_DD + 'data/Y4.pickle', 'br') as f:
    Y4 = pickle.load(f)

In [6]:
random_forest_model4, vectorizer4 = train_classifier(X4, Y4)

Random Forest Accuracy: 0.7838419539246226
Precision: 0.7728
Recall: 0.7794
AUC: 0.7837


In [8]:
joblib.dump(random_forest_model4, 'model_3.pkl', compress=9)
joblib.dump(vectorizer4, 'vectorizer_3.pkl')

['vectorizer_3.pkl']

In [9]:
joblib.dump(random_forest_model4, 'model_3_full.pkl')

['model_3_full.pkl']