In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
import scipy
import pandas as pd

In [2]:
# Load the cleaned dataset from disk; every later cell works on this frame.
df = pd.read_csv('../data/processed/cleaned_dataset.csv')

In [None]:
# # get texts from text cols
# a = df['QUESTION_clean']
# b = df['TITLE']
# c = df['ANSWER_clean']

# a.to_csv('../data/question_texts.csv', index=False)
# b.to_csv('../data/title_texts.csv', index=False)
# c.to_csv('../data/answer_texts.csv', index=False)

### Сплит данных

In [47]:
# Stratify on all targets at once: the key combines subcategory, priority and
# the incident flag into a single string per row.
df['stratify_key'] = df['SUBCATEGORY_clean'].astype(str) + '_' + df['PRIORITY'].astype(str) + df['AVARIYA'].astype(str)
# Target combinations with fewer than 20 rows are merged into one 'OTHER' bucket.
counts = df['stratify_key'].value_counts()
rare_classes = counts[counts < 20].index
df['stratify_key'] = df['stratify_key'].replace(rare_classes, 'OTHER')
# NOTE(review): this is not the last expression of the cell, so its result is discarded.
df['stratify_key'].value_counts()

# The splits are expressed as index labels of `df`.
all_ids = df.index.values
# Hold out 10% of all rows as the test set.
train_val_ids, test_ids = train_test_split(
    all_ids,
    test_size=0.10,
    random_state=42,
    stratify=df['stratify_key'],
    shuffle=True
)

# Subset stratify_key to the train_val_ids
train_val_stratify = df.loc[train_val_ids, 'stratify_key']

# Take 15% of the remaining rows as the validation set.
train_ids, val_ids = train_test_split(
    train_val_ids,
    test_size=0.15,
    random_state=42,
    stratify=train_val_stratify,
    shuffle=True
)

# Persist the three index arrays.
np.save("../data/splits/train.npy", train_ids)
np.save("../data/splits/val.npy", val_ids)
np.save("../data/splits/test.npy", test_ids)

Добавим выборку поменьше

In [48]:
def subsample_stratified(ids, stratify_column, fraction=1/3, random_state=42):
    """Return a stratified subset of ``ids``.

    Args:
        ids: Index labels to draw from.
        stratify_column: Label-indexed column; its values at ``ids`` are used
            as the stratification classes.
        fraction: Share of ``ids`` to keep. With a value of 1.0 or more,
            ``ids`` is returned untouched.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        The kept part of ``ids``.
    """
    # Nothing to drop: hand the input back as-is.
    if fraction >= 1.0:
        return ids

    # Stratification labels for exactly the requested ids.
    labels = stratify_column.loc[ids]

    # Only the "train" side of the split is of interest; the rest is discarded.
    kept_ids, _dropped_ids = train_test_split(
        ids,
        train_size=fraction,
        stratify=labels,
        shuffle=True,
        random_state=random_state,
    )
    return kept_ids

# Draw a one-third stratified subsample from each split, in train/val/test order.
train_ids_subsampled, val_ids_subsampled, test_ids_subsampled = (
    subsample_stratified(split_ids, df['stratify_key'], fraction=1/3)
    for split_ids in (train_ids, val_ids, test_ids)
)

# Report the sizes before and after subsampling.
print(f"Original sizes - Train: {len(train_ids)}, Val: {len(val_ids)}, Test: {len(test_ids)}")
print(f"Subsampled sizes - Train: {len(train_ids_subsampled)}, Val: {len(val_ids_subsampled)}, Test: {len(test_ids_subsampled)}")

# Persist the reduced splits next to the full ones.
np.save("../data/splits/train_small.npy", train_ids_subsampled)
np.save("../data/splits/val_small.npy", val_ids_subsampled)
np.save("../data/splits/test_small.npy", test_ids_subsampled)

Original sizes - Train: 539850, Val: 95268, Test: 70569
Subsampled sizes - Train: 179950, Val: 31756, Test: 23523


In [56]:
# The split arrays hold index labels (they come from df.index.values), so select
# by label with .loc, as the other cells do, rather than by position with .iloc.
df.loc[test_ids_subsampled, 'SUBCATEGORY_clean'].value_counts()

SUBCATEGORY_clean
Прочее                                       9713
Доступ к ИСОД                                8431
Запрос на администрирование                  3532
Настройка ПО и оборудования                   501
Авария                                        375
Коррекция данных                              236
ОШС                                           186
Консультация                                  147
Программное обеспечение. Региональные ПТК     126
Запрос статуса                                119
Программное обеспечение                        61
СПГУ                                           58
Запрос на доработку                            31
Оборудование                                    7
Name: count, dtype: int64

>Вроде все классы есть

## Feature extraction

### Текстовые фичи

In [None]:
# not using
# # TF-IDF
# tfidf = TfidfVectorizer(
#     max_features=8000,  # можно увеличить для точности
#     ngram_range=(1,2),
#     min_df=5
# )

# # Используем train split чтобы не было утечки
# tfidf.fit(df.loc[train_ids, 'QUESTION_FULL'])
# X_question_full = tfidf.transform(df['QUESTION_FULL'])

Эмбеддинги для всех текстовых фич вычислили отдельно на сервере (bge_m3)

In [None]:
# def build_sentence_embeddings(df, column, model_path, save_path=None):
#     model = SentenceTransformer(model_path)
#     sentences = df[column].tolist()
#     embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True)
    
#     if save_path:
#         np.savez_compressed(save_path, embeddings)
#     return embeddings

In [None]:
# build_sentence_embeddings(df, 'QUESTION_FULL', model_path='../models/embeddings/deepvk_USER-bge-m3', save_path='../data/features/question_bge_m3.npz')

### Категориальные фичи

Закодируем s_name, reg, source с OE.

Новые классы (которых не было в train) энкодер кодирует как -1 (`unknown_value=-1`); когда добавим новые классы, они будут маппиться в эмбеддинг с номером cardinality+1 

In [3]:
# Reload the saved split indices from disk.
train_ids, val_ids, test_ids = (
    np.load(split_path)
    for split_path in (
        '../data/splits/train.npy',
        '../data/splits/val.npy',
        '../data/splits/test.npy',
    )
)

In [4]:
# Raw categorical columns to be ordinal-encoded.
cat_cols = ['S_NAME', 'REG', 'SOURCE']

# cat features cardinalities for train configs (counted over the full frame)
print(df[cat_cols].nunique())

# Fitted encoder per raw column name.
ord_encoders = {}
for col in cat_cols:
    # Fit on the train rows only; categories not seen there are encoded as -1.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(df.loc[train_ids, [col]])
    # transform() returns an (n_rows, 1) array; collapse it to 1D for the new column.
    df[col + '_enc'] = encoder.transform(df[[col]]).ravel()
    ord_encoders[col] = encoder

S_NAME     22
REG       933
SOURCE      5
dtype: int64


Добавим категориальные фичи, выделенные плейсхолдерами из текста

In [5]:
# Placeholder name -> regex, copied from preprocessing.ipynb.
# In this notebook only the keys are used (as placeholder tokens and as the
# suffix of the HAS_* flag columns), so the key spelling, including "INCEDENT",
# is kept as is.
# NOTE(review): the INCEDENT quantifier was written as {8-12}, which the regex
# engine reads as literal text; it is corrected to {8,12} here. Confirm whether
# preprocessing.ipynb needs the same fix.
SENSITIVE_PATTERNS = {
    "EMAIL": r'\b[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,}\b|<".+?"@[^>]+>',
    "INN": r'(?i)инн\s*\d{5,12}',
    "PHONE": r'\b(?:\+7|8)?[\s-]?\(?\d{3,4}\)?[\s-]?\d{2,3}[\s-]?\d{2}[\s-]?\d{2}(?:\s*\(доб\.\s*\d+\))?\b',
    # "PHONE_EXT": r'\b8\s?\d{3,4}[\s-]?\d{2}[\s-]?\d{2}[\s-]?\d{2}(?:\s*\(доб\.\s*\d+\))?',
    "VIN": r'\b[A-HJ-NPR-Z0-9]{17}\b',
    "INCEDENT": r'\bIM\d{8,12}\b',
    "REG_NUMBER": r'\b[АВЕКМНОРСТУХ]{1,3}\d{3,4}[АВЕКМНОРСТУХ]{2}\b',
    "CASE_NO": r'\b(?:[CcSs]D|T)\d{6,10}\b',
    "APPEAL_NO": r'\b\d/\d{9,12}\b',
    "DOC_NO": r'№\s?\d{1,6}([/\\-]?\d{1,6})?([А-ЯA-Z])?',
    "LONG_ID": r'\b\d{9,}\b|\b[a-f0-9]{32,128}\b|(?:UID|GlndID|GUID)[: ]?[0-9a-fA-F\-]{16,128}',
    "IP": r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
    "DATE": r'\b\d{2}[./-]\d{2}[./-]\d{2,4}(?:\s?г\.?)?',
    "FIO": r'\b[А-ЯЁ][а-яё]+ [А-ЯЁ]\.[А-ЯЁ]\.(?=[\s,.)/]|$)',
    "USERNAME": r'\b[a-zA-Z][a-zA-Z0-9]{3,15}\d\b',
    "TOKEN": r'\b[a-f0-9]{16,64}\b|\b(?=.*[A-Z])(?=.*[a-z])(?=.*\d)[A-Za-z0-9]{12,64}\b',
    "URL": r'https?://[^\s]+',
}

In [6]:
def extract_sensitive_flags(text: str) -> dict:
    """Return 0/1 indicator features for a classifier.

    For every key of SENSITIVE_PATTERNS the result holds a ``HAS_<key>`` entry
    that is 1 when the key occurs as a substring of ``text``. ``HAS_TEXT`` is 1
    when ``text`` is a string that is not blank and not the ``"[NO_TEXT]"``
    marker; otherwise every flag, including ``HAS_TEXT``, is 0.
    """
    # A value counts as text only if it is a non-blank string other than the marker.
    has_text = isinstance(text, str) and text.strip() != "" and text != "[NO_TEXT]"

    # One flag per placeholder name, in the order the names are declared.
    flags = {
        f"HAS_{name}": int(has_text and name in text)
        for name in SENSITIVE_PATTERNS
    }

    # Text-presence flag always comes last.
    flags["HAS_TEXT"] = int(has_text)
    return flags

In [7]:
# Compute the flag dict for every question text; the result is a Series of dicts.
flags_df = df['QUESTION_clean'].apply(extract_sensitive_flags)

In [8]:
# Expand the Series of flag dicts into one column per flag.
# Building the frame from the list of dicts yields the same columns as
# `.apply(pd.Series)` without constructing a Series for every row. The
# isinstance guard keeps the cell re-runnable once `flags_df` is a DataFrame.
if isinstance(flags_df, pd.Series):
    flags_df = pd.DataFrame(flags_df.tolist(), index=flags_df.index)

In [9]:
# Eyeball five random rows of the flag frame (no seed, so the rows differ per run).
flags_df.sample(5)

Unnamed: 0,HAS_EMAIL,HAS_INN,HAS_PHONE,HAS_VIN,HAS_INCEDENT,HAS_REG_NUMBER,HAS_CASE_NO,HAS_APPEAL_NO,HAS_DOC_NO,HAS_LONG_ID,HAS_IP,HAS_DATE,HAS_FIO,HAS_USERNAME,HAS_TOKEN,HAS_URL,HAS_TEXT
604221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
573058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
157623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
610670,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
393292,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [10]:
# Attach the flag columns to the main frame. Flag columns left over from an
# earlier run of this cell are dropped first, so re-running does not produce
# duplicate HAS_* columns.
df = pd.concat([df.drop(columns=flags_df.columns, errors='ignore'), flags_df], axis=1)

In [11]:
# Eyeball five random rows of the combined frame (no seed, so the rows differ per run).
# NOTE(review): the rendered output shows raw ticket text; consider clearing it before sharing.
df.sample(5)

Unnamed: 0,S_NAME,QUESTION,ANSWER,OPEN_TIME_,RESOLVE_TIME_,CLOSE_TIME_,ATC_NEXT_BREACH_,TITLE,SUBCATEGORY,REG,...,HAS_APPEAL_NO,HAS_DOC_NO,HAS_LONG_ID,HAS_IP,HAS_DATE,HAS_FIO,HAS_USERNAME,HAS_TOKEN,HAS_URL,HAS_TEXT
404429,СУДИС,Отправка: УМВД Zayavka na sbros paroly Хасанов...,"Пароль сброшен, архив с данными направлен в ад...",2023-04-07 19:16:05,2023-03-11 11:05:52,2023-10-11 11:06:13,2023-04-24 09:00:21,Восстановление пароля УЗ,Доступ к ИСОД,Приволжский федеральный округ/Удмуртская Респу...,...,0,0,0,0,0,0,0,0,0,1
112444,СЭД,Изменение УЗ (Ч В) Описание пустое,"Изменения внесены, данные на ВИСП отобразятся ...",2022-05-11 09:06:01,2022-05-11 16:59:12,2022-06-11 10:04:40,2022-12-11 11:11:30,Изменение данных УЗ,Доступ к ИСОД,Приволжский федеральный округ/Самарская область,...,0,0,0,0,0,0,0,0,0,1
666949,СЭД,"Добрый день! Подскажите пожалуйста, налажена о...",Функциональность восстановлена,2023-12-04 15:55:32,2023-12-05 09:21:58,2023-12-07 13:32:08,2023-04-19 15:33:34,СЭД. Северо-Западный федеральный округ. Наруше...,Авария,Северо-Западный федеральный округ/Республика Коми,...,0,0,0,0,0,1,0,0,0,1
19720,СЭД,Просьба зарегистрировать обращение на Блокиров...,"Учетная запись заблокирована, изменение на ВИС...",2022-01-25 17:37:23,2022-01-26 13:50:53,2022-02-02 13:02:12,2022-01-09 17:37:23,Блокировка УЗ,Доступ к ИСОД,Южный федеральный округ/Волгоградская область,...,0,0,0,0,0,1,0,0,0,1
470698,СУДИС,Создание ЦКС,Учетные записи созданы,2023-06-06 08:02:12,2023-06-06 18:07:20,2023-06-13 18:00:54,2023-06-20 18:00:00,Создание УЗ,Доступ к ИСОД,Приволжский федеральный округ/Оренбургская обл...,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Names of the placeholder flag columns, one per pattern key (HAS_TEXT is not included).
placeholder_cat_cols = ["HAS_" + name for name in SENSITIVE_PATTERNS]

In [13]:
# Final categorical feature columns: the ordinal-encoded columns followed by the
# placeholder flags. The encoded names are derived from the keys of
# `ord_encoders` instead of rewriting `cat_cols` in terms of itself, so
# re-running this cell gives the same array rather than appending '_enc' again.
encoded_cat_cols = [f'{col}_enc' for col in ord_encoders]
cat_cols = np.concatenate((encoded_cat_cols, placeholder_cat_cols))
cat_cols

array(['S_NAME_enc', 'REG_enc', 'SOURCE_enc', 'HAS_EMAIL', 'HAS_INN',
       'HAS_PHONE', 'HAS_VIN', 'HAS_INCEDENT', 'HAS_REG_NUMBER',
       'HAS_CASE_NO', 'HAS_APPEAL_NO', 'HAS_DOC_NO', 'HAS_LONG_ID',
       'HAS_IP', 'HAS_DATE', 'HAS_FIO', 'HAS_USERNAME', 'HAS_TOKEN',
       'HAS_URL'], dtype='<U14')

In [14]:
# Categorical feature matrix: one row per record, one column per entry of cat_cols.
X_cat = df[cat_cols].to_numpy()

In [15]:
# Sanity check: (number of rows, number of categorical features).
X_cat.shape

(705687, 19)

### Временные фичи

In [16]:
# Parse the timestamp columns; values that cannot be parsed become NaT.
for time_col in ['OPEN_TIME_', 'RESOLVE_TIME_', 'CLOSE_TIME_', 'ATC_NEXT_BREACH_']:
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

# Encoding cyclical features: each component of the opening time is mapped to
# an angle on a circle and stored as its sine and cosine.
opened_at = df['OPEN_TIME_'].dt

# Hour of day, period 24.
hour = opened_at.hour
hour_angle = hour * (2. * np.pi / 24.)
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

# Day of week (0-6), period 7.
day_of_week = opened_at.dayofweek
day_of_week_angle = day_of_week * (2. * np.pi / 7.)
df['day_of_week_sin'] = np.sin(day_of_week_angle)
df['day_of_week_cos'] = np.cos(day_of_week_angle)

# Day-of-week values 5 and 6 are flagged as weekend.
df['is_weekend'] = (day_of_week >= 5).astype(int)

# Day of month, with a fixed period of 31.
day_of_month = opened_at.day
day_of_month_angle = day_of_month * (2. * np.pi / 31.)
df['day_of_month_sin'] = np.sin(day_of_month_angle)
df['day_of_month_cos'] = np.cos(day_of_month_angle)

In [17]:
# Time feature matrix; missing values (rows without a parsed opening time) are filled with 0.
time_feature_cols = [
    'day_of_week_sin',
    'day_of_week_cos',
    'hour_sin',
    'hour_cos',
    'day_of_month_sin',
    'day_of_month_cos',
    'is_weekend',
]
X_time = df[time_feature_cols].fillna(0).to_numpy()

### Таргеты

In [18]:
# Binary target: 'Да' (yes) -> 1, 'Нет' (no) -> 0. Any other value maps to NaN.
df['AVARIYA_enc'] = df['AVARIYA'].map({'Да':1, 'Нет':0})
# priority kept as it is (2,3)

In [20]:
# Fit the label encoder on the train split only.
le = LabelEncoder()
le.fit(df.loc[train_ids, 'SUBCATEGORY_clean'])
classes = le.classes_

# Position of each label in `classes`, with -1 for labels absent from the train
# split. A dict lookup replaces the per-row np.where scan over `classes`.
class_to_code = {label: code for code, label in enumerate(classes)}
df['SUBCATEGORY_clean_processed'] = (
    df['SUBCATEGORY_clean'].map(class_to_code).fillna(-1).astype('int64')
)
# NOTE(review): unlike the column above, LabelEncoder.transform raises on labels
# it was not fitted on, so this line fails if any label is missing from train.
df['SUBCATEGORY_clean_enc'] = le.transform(df['SUBCATEGORY_clean'])

In [23]:
# Targets: encoded subcategory, raw priority and the binary incident flag.
target_cols = ['SUBCATEGORY_clean_enc', 'PRIORITY', 'AVARIYA_enc']
y = df[target_cols]

### Сохраняем фичи, таргеты и векторизаторы

In [70]:
# Feature matrices are stored together with the row index they were built from.
# np.savez_compressed("../data/features/question_tfidf.npz", features=X_question_full, ids=df.index.values)
np.savez_compressed('../data/features/cat_features.npz', features=X_cat, ids=df.index.values)
np.savez_compressed('../data/features/time_features.npz', features=X_time, ids=df.index.values)

# Targets are written without the index column.
y.to_csv('../data/targets/targets.csv', index=False)

# Fitted encoders, so the same mappings can be reloaded later.
# joblib.dump(tfidf, '../models/vectorizers/tfidf_vectorizer.pkl')
# for col, le in label_encoders.items():
#     joblib.dump(le, f'../models/vectorizers/label_encoder_{col}.pkl')
joblib.dump(le, '../models/vectorizers/label_encoder_SUBCATEGORY_clean_enc.pkl')

for col_name, encoder in ord_encoders.items():
    joblib.dump(encoder, f'../models/vectorizers/ordinal_encoder_{col_name}.pkl')