In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Load the cleaned dataset; every later cell reads from (and adds columns to) this frame.
df = pd.read_csv('../data/processed/cleaned_dataset.csv')

In [None]:
# # get texts from text cols
# a = df['QUESTION_clean']
# b = df['TITLE']
# c = df['ANSWER_clean']

# a.to_csv('../data/question_texts.csv', index=False)
# b.to_csv('../data/title_texts.csv', index=False)
# c.to_csv('../data/answer_texts.csv', index=False)

### Сплит данных

In [47]:
# Stratify on all three targets at once by joining them into one string key.
combined_key = (
    df['SUBCATEGORY_clean'].astype(str)
    + '_'
    + df['PRIORITY'].astype(str)
    + df['AVARIYA'].astype(str)
)

# Target combinations with fewer than 20 rows are pooled into a single 'OTHER'
# bucket (presumably so every stratum has enough rows for both splits below).
counts = combined_key.value_counts()
rare_classes = counts[counts < 20].index
df['stratify_key'] = combined_key.where(~combined_key.isin(rare_classes), 'OTHER')

# Both splits share the same seed and shuffle setting.
split_kwargs = dict(random_state=42, shuffle=True)

# Split ids are index labels of df, not positions.
all_ids = df.index.values

# First cut: hold out 10% of all rows as the test set.
train_val_ids, test_ids = train_test_split(
    all_ids,
    test_size=0.10,
    stratify=df['stratify_key'],
    **split_kwargs,
)

# Second cut: 15% of the remaining rows become validation. The stratification
# labels have to be restricted to the rows that are still in play.
train_val_stratify = df.loc[train_val_ids, 'stratify_key']
train_ids, val_ids = train_test_split(
    train_val_ids,
    test_size=0.15,
    stratify=train_val_stratify,
    **split_kwargs,
)

# Persist the id arrays so other notebooks/scripts reuse exactly this split.
for split_path, split_ids in (
    ("../data/splits/train.npy", train_ids),
    ("../data/splits/val.npy", val_ids),
    ("../data/splits/test.npy", test_ids),
):
    np.save(split_path, split_ids)

Добавим выборку поменьше

In [48]:
def subsample_stratified(ids, stratify_column, fraction=1/3, random_state=42):
    """Return a stratified random subset of ``ids``.

    Args:
        ids: Index labels to sample from; they are looked up with
            ``stratify_column.loc``.
        stratify_column: Label-indexed Series holding the stratification
            value of each id.
        fraction: Share of ``ids`` to keep. Values >= 1.0 return ``ids``
            unchanged without sampling.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The kept ids, i.e. the "train" part of a stratified
        ``train_test_split`` with ``train_size=fraction``.
    """
    if fraction >= 1.0:
        # Nothing to drop: hand the input back untouched.
        return ids

    # Only the kept part of the split is needed; the remainder is discarded.
    kept_ids = train_test_split(
        ids,
        train_size=fraction,
        stratify=stratify_column.loc[ids],
        shuffle=True,
        random_state=random_state,
    )[0]
    return kept_ids

# Build a one-third stratified subset of every split (same stratification key
# as the full split).
train_ids_subsampled, val_ids_subsampled, test_ids_subsampled = (
    subsample_stratified(split_ids, df['stratify_key'], fraction=1/3)
    for split_ids in (train_ids, val_ids, test_ids)
)

print(f"Original sizes - Train: {len(train_ids)}, Val: {len(val_ids)}, Test: {len(test_ids)}")
print(f"Subsampled sizes - Train: {len(train_ids_subsampled)}, Val: {len(val_ids_subsampled)}, Test: {len(test_ids_subsampled)}")

# Store the small splits next to the full ones.
for small_path, small_ids in (
    ("../data/splits/train_small.npy", train_ids_subsampled),
    ("../data/splits/val_small.npy", val_ids_subsampled),
    ("../data/splits/test_small.npy", test_ids_subsampled),
):
    np.save(small_path, small_ids)

Original sizes - Train: 539850, Val: 95268, Test: 70569
Subsampled sizes - Train: 179950, Val: 31756, Test: 23523


In [6]:
# Sanity check: class balance of AVARIYA inside the train split.
# The split ids are index labels (they come from df.index.values), so select
# them with .loc like every other lookup in this notebook; .iloc is positional
# and only agrees with the labels while df keeps its default RangeIndex.
df.loc[train_ids, 'AVARIYA'].value_counts()

AVARIYA
Нет    531302
Да       8548
Name: count, dtype: int64

>Вроде все классы есть

## Feature extraction

### Текстовые фичи

In [None]:
# not using
# # TF-IDF
# tfidf = TfidfVectorizer(
#     max_features=8000,  # можно увеличить для точности
#     ngram_range=(1,2),
#     min_df=5
# )

# # Используем train split чтобы не было утечки
# tfidf.fit(df.loc[train_ids, 'QUESTION_FULL'])
# X_question_full = tfidf.transform(df['QUESTION_FULL'])

Эмбеддинги для всех текстовых фич вычислили отдельно на сервере (bge_m3)

In [None]:
# def build_sentence_embeddings(df, column, model_path, save_path=None):
#     model = SentenceTransformer(model_path)
#     sentences = df[column].tolist()
#     embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True)
    
#     if save_path:
#         np.savez_compressed(save_path, embeddings)
#     return embeddings

In [None]:
# build_sentence_embeddings(df, 'QUESTION_FULL', model_path='../models/embeddings/deepvk_USER-bge-m3', save_path='../data/features/question_bge_m3.npz')

### Категориальные фичи

Закодируем S_NAME, REG и SOURCE с помощью OrdinalEncoder (OE), обученного только на train.

Категории, которых не было в train, OrdinalEncoder кодирует значением -1 (`unknown_value=-1`). Когда добавим новые классы, они будут маппиться в эмбеддинг с номером cardinality+1.

In [3]:
# Reload the saved split ids so the cells below also work without re-running
# the split cells.
train_ids, val_ids, test_ids = (
    np.load(split_path)
    for split_path in (
        '../data/splits/train.npy',
        '../data/splits/val.npy',
        '../data/splits/test.npy',
    )
)

In [71]:
import json

In [None]:

cat_cols = ['S_NAME', 'REG', 'SOURCE']
# cat_cardinalities = {}

# Number of distinct values per categorical column, printed for use in the
# training configs. Note it is computed on the full frame, not on train only.
print(df[cat_cols].nunique())

# One encoder per column, fitted on the train rows only; categories not seen
# during fit are encoded as -1.
# NOTE(review): -1 is not a valid embedding index — confirm the model shifts or
# remaps it before the lookup.
ord_encoders = {
    col: OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
        df.loc[train_ids, [col]]
    )
    for col in cat_cols
}

# Encode every row of each column. transform() returns a 2-D single-column
# array, so it is flattened to 1-D before being stored as '<col>_enc'.
for col, encoder in ord_encoders.items():
    df[f'{col}_enc'] = encoder.transform(df[[col]]).ravel()
    
    # cardinality = len(oe.categories_[0])+1
    # cat_cardinalities[col] = cardinality

Добавим категориальные фичи, выделенные плейсхолдерами из текста

In [230]:
# Patterns copied from preprocessing.ipynb: placeholder name -> regex for the
# sensitive value it stands for.
# In this notebook only the KEYS are used (extract_sensitive_flags looks for the
# key text inside the cleaned question), so the key spelling — including
# "INCEDENT" — must stay exactly as it appears in the cleaned text.
# The INCEDENT regex previously used `\d{8-12}`, which `re` treats as a literal
# "{8-12}" and therefore never matched an incident number; the intended
# quantifier is `{8,12}`. NOTE(review): apply the same fix in
# preprocessing.ipynb if the pattern is still written that way there.
SENSITIVE_PATTERNS = {
    "EMAIL": r'\b[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,}\b|<".+?"@[^>]+>',
    "INN": r'(?i)инн\s*\d{5,12}',
    "PHONE": r'\b(?:\+7|8)?[\s-]?\(?\d{3,4}\)?[\s-]?\d{2,3}[\s-]?\d{2}[\s-]?\d{2}(?:\s*\(доб\.\s*\d+\))?\b',
    # "PHONE_EXT": r'\b8\s?\d{3,4}[\s-]?\d{2}[\s-]?\d{2}[\s-]?\d{2}(?:\s*\(доб\.\s*\d+\))?',
    "VIN": r'\b[A-HJ-NPR-Z0-9]{17}\b',
    "INCEDENT": r'\bIM\d{8,12}\b',
    "REG_NUMBER": r'\b[АВЕКМНОРСТУХ]{1,3}\d{3,4}[АВЕКМНОРСТУХ]{2}\b',
    "CASE_NO": r'\b(?:[CcSs]D|T)\d{6,10}\b',
    "APPEAL_NO": r'\b\d/\d{9,12}\b',
    "DOC_NO": r'№\s?\d{1,6}([/\\-]?\d{1,6})?([А-ЯA-Z])?',
    "LONG_ID": r'\b\d{9,}\b|\b[a-f0-9]{32,128}\b|(?:UID|GlndID|GUID)[: ]?[0-9a-fA-F\-]{16,128}',
    "IP": r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
    "DATE": r'\b\d{2}[./-]\d{2}[./-]\d{2,4}(?:\s?г\.?)?',
    "FIO": r'\b[А-ЯЁ][а-яё]+ [А-ЯЁ]\.[А-ЯЁ]\.(?=[\s,.)/]|$)',
    "USERNAME": r'\b[a-zA-Z][a-zA-Z0-9]{3,15}\d\b',
    "TOKEN": r'\b[a-f0-9]{16,64}\b|\b(?=.*[A-Z])(?=.*[a-z])(?=.*\d)[A-Za-z0-9]{12,64}\b',
    "URL": r'https?://[^\s]+',
}

In [231]:
def extract_sensitive_flags(text: str) -> dict:
    """Build 0/1 placeholder-presence flags for one text.

    Args:
        text: Cleaned text. Anything that is not a string, is blank after
            stripping, or equals "[NO_TEXT]" counts as "no text".

    Returns:
        Dict with one ``HAS_<NAME>`` entry per key of ``SENSITIVE_PATTERNS``
        (1 if the key name occurs in the text, else 0), followed by
        ``HAS_TEXT`` (1 if the text is usable, else 0).
    """
    has_text = isinstance(text, str) and text.strip() != "" and text != "[NO_TEXT]"

    # NOTE(review): this is a plain substring test on the key name (the regex
    # values are not used here), so e.g. any text containing "IP" sets HAS_IP —
    # confirm the placeholder format in the cleaned text makes this safe.
    flags = {
        f"HAS_{name}": int(has_text and name in text)
        for name in SENSITIVE_PATTERNS
    }

    # Text-presence flag always goes last.
    flags["HAS_TEXT"] = int(has_text)
    return flags

In [232]:
# Expand the per-row flag dicts into one 0/1 column per flag.
# Building the frame once from a list of dicts replaces .apply(pd.Series),
# which constructs a Series object for every row and is very slow on a frame
# of this size; columns, values and index are the same.
flags_df = pd.DataFrame(
    df['QUESTION_clean'].map(extract_sensitive_flags).tolist(),
    index=df.index,
)

In [80]:
# Names of the binary flag columns (HAS_*), in flags_df column order.
flag_cols = flags_df.columns

In [89]:
# Names of the ordinal-encoded categorical columns ('<col>_enc' for each entry of cat_cols).
cat_cols_enc = [f'{col}_enc' for col in cat_cols]
# final_cat_cols = np.concatenate((cat_cols_enc, flag_cols))
# final_cat_cols

array(['S_NAME_enc', 'REG_enc', 'SOURCE_enc', 'HAS_EMAIL', 'HAS_INN',
       'HAS_PHONE', 'HAS_VIN', 'HAS_INCEDENT', 'HAS_REG_NUMBER',
       'HAS_CASE_NO', 'HAS_APPEAL_NO', 'HAS_DOC_NO', 'HAS_LONG_ID',
       'HAS_IP', 'HAS_DATE', 'HAS_FIO', 'HAS_USERNAME', 'HAS_TOKEN',
       'HAS_URL', 'HAS_TEXT'], dtype=object)

In [None]:
# Feature matrices as plain NumPy arrays: ordinal-encoded categoricals and the
# binary flags are kept as two separate blocks.
X_cat_oe = df[cat_cols_enc].to_numpy()
X_cat_bin = flags_df.to_numpy()

In [144]:
# Shapes of the two categorical feature matrices; row counts must match.
X_cat_oe.shape, X_cat_bin.shape

((705687, 3), (705687, 17))

### Временные фичи

In [16]:
def _sin_cos(values, period):
    """Map periodic values onto a circle of the given period; returns (sin, cos)."""
    angle = values * (2. * np.pi / period)
    return np.sin(angle), np.cos(angle)


# Parse the timestamp columns; values that cannot be parsed become NaT.
datetime_cols = ['OPEN_TIME_', 'RESOLVE_TIME_', 'CLOSE_TIME_', 'ATC_NEXT_BREACH_']
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# All time features below are derived from the ticket opening time.
opened = df['OPEN_TIME_'].dt

# Cyclical encoding keeps neighbouring values close across the wrap-around
# (e.g. hour 23 and hour 0).
hour = opened.hour
df['hour_sin'], df['hour_cos'] = _sin_cos(hour, 24.)

day_of_week = opened.dayofweek
df['day_of_week_sin'], df['day_of_week_cos'] = _sin_cos(day_of_week, 7.)

# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = (day_of_week >= 5).astype(int)

day_of_month = opened.day
df['day_of_month_sin'], df['day_of_month_cos'] = _sin_cos(day_of_month, 31.)

In [17]:
# Column order of the time feature matrix.
time_feature_cols = [
    'day_of_week_sin',
    'day_of_week_cos',
    'hour_sin',
    'hour_cos',
    'day_of_month_sin',
    'day_of_month_cos',
    'is_weekend',
]
# Missing values (e.g. from an unparseable OPEN_TIME_) are replaced with 0.
X_time = df[time_feature_cols].fillna(0).to_numpy()

### Таргеты

In [14]:
# Binary targets as 0/1 integers.
# The explicit int64 cast is applied to both columns so that a label outside
# the mapping fails loudly here; without it, map() silently yields NaN and the
# column becomes float (previously AVARIYA_enc had no such cast).
df['AVARIYA_enc'] = df['AVARIYA'].map({'Да': 1, 'Нет': 0}).astype(np.int64)
df['PRIORITY_enc'] = df['PRIORITY'].map({2: 0, 3: 1}).astype(np.int64)

In [20]:
# Fit the label encoder on the train split only.
le = LabelEncoder()
le.fit(df.loc[train_ids, 'SUBCATEGORY_clean'])
classes = le.classes_

# Tolerant encoding: position of the label in `classes`, or -1 for a label that
# was not seen in train. A vectorised lookup replaces the per-row np.where
# search; values and dtype (int64) are the same.
class_to_index = pd.Series(np.arange(len(classes)), index=classes)
df['SUBCATEGORY_clean_processed'] = (
    df['SUBCATEGORY_clean'].map(class_to_index).fillna(-1).astype(np.int64)
)

# Strict encoding used as the training target: raises if any row carries a
# label that was not seen in train.
df['SUBCATEGORY_clean_enc'] = le.transform(df['SUBCATEGORY_clean'])

In [16]:
# The three encoded training targets; y keeps df's index and this column order.
target_cols = ['SUBCATEGORY_clean_enc', 'PRIORITY_enc', 'AVARIYA_enc']
y = df[target_cols]

### Сохраняем метаданные
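Сохраним размерности категориальных фич. Бинарные флаги будут подаваться в модель отдельно. Ниже для каждого таргета собираем список классов, их число (cardinality) и веса классов (`pos_weight` для бинарных таргетов).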

In [22]:
# Per-target metadata for the training configs: class list, number of classes
# and class-imbalance weights.
# NOTE(review): the counts and weights are computed over every row of `y`
# (train, val and test together) — confirm this is intended rather than
# train-only statistics.
targets_meta = {}
for target in target_cols:
    y_target = y[target].values
    # Named target_classes so the loop does not overwrite the `classes` array
    # of the subcategory label encoder.
    target_classes = np.unique(y_target)
    cardinality = len(target_classes)

    if cardinality < 2:
        # A constant target has no imbalance to describe; it gets no entry.
        continue

    meta = {
        "classes": target_classes.tolist(),
        "cardinality": cardinality,
    }

    if cardinality == 2:
        # Binary target: weight of the positive class = negatives / positives.
        count0 = int((y_target == 0).sum())
        count1 = int((y_target == 1).sum())
        if count1 == 0:
            # Two distinct values but none equal to 1: the target is not 0/1
            # encoded, so a pos_weight would be meaningless (division by zero).
            raise ValueError(
                f"Target {target!r} has no samples of class 1; cannot compute pos_weight"
            )
        meta["pos_weight"] = count0 / count1
    else:
        # Multi-class target: sklearn's "balanced" weights,
        # n_samples / (n_classes * count_per_class).
        weights = compute_class_weight(
            class_weight="balanced",
            classes=target_classes,
            y=y_target,
        )
        meta["class_weights"] = weights.tolist()

    targets_meta[target] = meta

In [23]:
# Show the collected target metadata.
targets_meta

{'SUBCATEGORY_clean_enc': {'classes': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13],
  'cardinality': 14,
  'class_weights': [4.478561908992829,
   0.19927264286646143,
   0.47579515282765206,
   54.493204633204634,
   14.09964035964036,
   11.429980563654032,
   7.123546394250182,
   3.3532606629666235,
   9.020439206462829,
   245.88397212543555,
   27.74144979951254,
   13.349103359564165,
   0.1729794587704677,
   29.035837722185647]},
 'PRIORITY_enc': {'classes': [0, 1],
  'cardinality': 2,
  'pos_weight': 0.1210527319114325},
 'AVARIYA_enc': {'classes': [0, 1],
  'cardinality': 2,
  'pos_weight': 62.16002864047257}}

In [148]:
# Class frequencies of the subcategory target over the full dataset.
df['SUBCATEGORY_clean'].value_counts()

SUBCATEGORY_clean
Прочее                                       291400
Доступ к ИСОД                                252951
Запрос на администрирование                  105941
Настройка ПО и оборудования                   15032
Авария                                        11255
Коррекция данных                               7076
ОШС                                            5588
Консультация                                   4410
Программное обеспечение. Региональные ПТК      3776
Запрос статуса                                 3575
Программное обеспечение                        1817
СПГУ                                           1736
Запрос на доработку                             925
Оборудование                                    205
Name: count, dtype: int64

### Сохраняем фичи, таргеты и векторизаторы

In [None]:
# Row ids are stored next to every feature matrix so rows can be matched back
# to the dataframe.
row_ids = df.index.values

# np.savez_compressed("../data/features/question_tfidf.npz", features=X_question_full, ids=df.index.values)
feature_files = (
    ('../data/features/oe_cat_features.npz', X_cat_oe),
    ('../data/features/bin_cat_features.npz', X_cat_bin),
    ('../data/features/time_features.npz', X_time),
)
for feature_path, feature_matrix in feature_files:
    np.savez_compressed(feature_path, features=feature_matrix, ids=row_ids)

# Targets are written without the index column, so they line up with the
# feature matrices by row order only.
y.to_csv('../data/targets/targets.csv', index=False)

# joblib.dump(tfidf, '../models/vectorizers/tfidf_vectorizer.pkl')
# for col, le in label_encoders.items():
#     joblib.dump(le, f'../models/vectorizers/label_encoder_{col}.pkl')

# Fitted encoders are saved so inference can reuse the same mappings.
joblib.dump(le, '../models/vectorizers/SUBCATEGORY_clean_lenc.pkl')

for column, encoder in ord_encoders.items():
    joblib.dump(encoder, f'../models/vectorizers/{column}_oenc.pkl')