In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
import scipy

In [58]:
# Read the cleaned dataset CSV into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_dataset.csv")
# Trailing expression: display the column names as the cell output.
df.columns

Index(['S_NAME', 'QUESTION', 'ANSWER', 'OPEN_TIME_', 'RESOLVE_TIME_',
       'CLOSE_TIME_', 'ATC_NEXT_BREACH_', 'TITLE', 'SUBCATEGORY', 'REG',
       'PRIORITY', 'AVARIYA', 'SOURCE', 'QUESTION_FULL', 'SUBCATEGORY_CLEAN'],
      dtype='object')

### Сплит данных

In [59]:
# Stratify on all three targets at once by fusing them into one composite key.
# NOTE(review): PRIORITY and AVARIYA are concatenated without a separator. The key
# format is deliberately left unchanged so the seeded split below stays identical
# to the ids already saved on disk — confirm the two columns cannot produce
# ambiguous concatenations.
df['stratify_key'] = df['SUBCATEGORY_CLEAN'].astype(str) + '_' + df['PRIORITY'].astype(str) + df['AVARIYA'].astype(str)

# Composite classes with fewer than 20 rows are pooled into a single 'OTHER'
# bucket so that tiny strata do not break the stratified splits.
counts = df['stratify_key'].value_counts()
rare_classes = counts[counts < 20].index
# Membership mask instead of a list-valued replace(): same result, one pass.
df['stratify_key'] = df['stratify_key'].mask(df['stratify_key'].isin(rare_classes), 'OTHER')

# First split: hold out 15% of all rows as the test set.
all_ids = df.index.values
train_val_ids, test_ids = train_test_split(
    all_ids,
    test_size=0.15,
    random_state=42,
    stratify=df['stratify_key'],
    shuffle=True
)

# Subset stratify_key to the train_val_ids (label-based lookup, same order as the ids)
train_val_stratify = df.loc[train_val_ids, 'stratify_key']

# Second split: 15% of the remaining rows become validation, i.e. roughly
# 72.25% train / 12.75% val / 15% test of the whole frame.
train_ids, val_ids = train_test_split(
    train_val_ids,
    test_size=0.15,
    random_state=42,
    stratify=train_val_stratify,
    shuffle=True
)

# Sanity check before persisting: the three id sets must partition df.index.
split_ids = np.concatenate([train_ids, val_ids, test_ids])
assert split_ids.size == len(df), "splits must cover every row exactly once"
assert np.unique(split_ids).size == split_ids.size, "splits must not overlap"

np.save("../data/splits/train.npy", train_ids)
np.save("../data/splits/val.npy", val_ids)
np.save("../data/splits/test.npy", test_ids)

In [73]:
# train_val_ids holds index labels (derived from df.index.values), so select rows
# by label with .loc, consistent with the other cells, rather than by position.
df.loc[train_val_ids, 'AVARIYA'].value_counts()

AVARIYA
Нет    768884
Да      11820
Name: count, dtype: int64

## Feature extraction

### Текстовые фичи

In [None]:
# TF-IDF over the full question text
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_features=8000,  # can be raised for better accuracy
)

# Fit on the train split only so no statistics leak from val/test,
# then vectorize every row of the frame with the fitted vocabulary.
X_question_full = tfidf.fit(df.loc[train_ids, 'QUESTION_FULL']).transform(df['QUESTION_FULL'])

In [None]:
# def build_sentence_embeddings(df, column, model_path, save_path=None):
#     model = SentenceTransformer(model_path)
#     sentences = df[column].tolist()
#     embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True)
    
#     if save_path:
#         np.savez_compressed(save_path, embeddings)
#     return embeddings

In [None]:
# build_sentence_embeddings(df, 'QUESTION_FULL', model_path='../models/embeddings/deepvk_USER-bge-m3', save_path='../data/features/question_bge_m3.npz')

### Категориальные фичи

In [23]:
cat_cols = ['S_NAME', 'REG', 'SOURCE']
ord_encoders = {}

for col in cat_cols:
    # One encoder per column, fitted on train rows only; categories that were
    # not seen during fit are encoded as -1 at transform time.
    oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    ord_encoders[col] = oe.fit(df.loc[train_ids, [col]])
    # transform() returns one column per input column; collapse it to 1D.
    df[col + '_enc'] = oe.transform(df[[col]]).ravel()

# Stack the encoded columns, in cat_cols order, into the categorical feature matrix.
enc_cols = [col + '_enc' for col in cat_cols]
X_cat = df[enc_cols].to_numpy()

### Временные фичи

In [24]:
# Parse every timestamp column; values that cannot be parsed become NaT.
for col in ('OPEN_TIME_', 'RESOLVE_TIME_', 'CLOSE_TIME_', 'ATC_NEXT_BREACH_'):
    df[col] = pd.to_datetime(df[col], errors='coerce')

# All calendar features are derived from the ticket opening time.
opened_at = df['OPEN_TIME_'].dt
hour = opened_at.hour
day_of_week = opened_at.dayofweek
day_of_month = opened_at.day

# Cyclical encoding: map each calendar unit onto a circle (sin/cos pair) so the
# end of a cycle sits next to its start.
hour_angle = hour * (2. * np.pi / 24.)
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

weekday_angle = day_of_week * (2. * np.pi / 7.)
df['day_of_week_sin'] = np.sin(weekday_angle)
df['day_of_week_cos'] = np.cos(weekday_angle)

# dayofweek values 5 and above count as weekend; rows with a missing opening
# time compare False and therefore get 0.
df['is_weekend'] = day_of_week.ge(5).astype(int)

monthday_angle = day_of_month * (2. * np.pi / 31.)
df['day_of_month_sin'] = np.sin(monthday_angle)
df['day_of_month_cos'] = np.cos(monthday_angle)

# Assemble the feature matrix; features of rows without an opening time are zero-filled.
time_feature_cols = ['day_of_week_sin', 'day_of_week_cos', 'hour_sin', 'hour_cos', 'day_of_month_sin', 'day_of_month_cos', 'is_weekend']
X_time = df[time_feature_cols].fillna(0).to_numpy()

### Таргеты

In [27]:
target_cols = ['SUBCATEGORY_CLEAN', 'PRIORITY', 'AVARIYA']
targets = df[target_cols]

label_encoders = {}

for col in target_cols:
    # Fit on the train split only, so class ids are defined by training data.
    le = LabelEncoder()
    le.fit(df.loc[train_ids, col])
    classes = le.classes_

    # Tolerant encoding: position of each value in `classes`, or -1 when the value
    # is not among the train classes (missing values also map to -1).
    # Done as one vectorized categorical lookup instead of a per-row lambda that
    # scanned `classes` twice for every row of the frame.
    known_classes = classes[~pd.isna(classes)]
    df[f'{col}_processed'] = pd.Categorical(df[col], categories=known_classes).codes.astype('int64')

    # Strict encoding used for the saved targets: LabelEncoder.transform raises
    # on labels it was not fitted on, so an unseen class fails loudly here.
    df[col + '_enc'] = le.transform(df[col])
    label_encoders[col] = le

# Encoded targets, one column per entry of target_cols.
y = df[[c + '_enc' for c in target_cols]]

### Сохраняем фичи, таргеты и векторизаторы

In [None]:

# Store each feature matrix together with the row ids of df it was computed from.
# NOTE(review): X_question_full comes from TfidfVectorizer.transform, which
# presumably returns a SciPy sparse matrix; numpy stores such an object as a
# pickled 0-d object array, so readers would need allow_pickle=True. Confirm the
# loader handles this, or switch to scipy.sparse.save_npz.
feature_files = {
    "../data/features/question_tfidf.npz": X_question_full,
    '../data/features/cat_features.npz': X_cat,
    '../data/features/time_features.npz': X_time,
}
for npz_path, matrix in feature_files.items():
    np.savez_compressed(npz_path, features=matrix, ids=df.index.values)

# Targets are written without the index, so rows are identified by position only.
y.to_csv('../data/targets/targets.csv', index=False)

# Persist the fitted vectorizer and encoders so the same transformations can be
# re-applied later.
joblib.dump(tfidf, '../models/vectorizers/tfidf_vectorizer.pkl')
for col in label_encoders:
    joblib.dump(label_encoders[col], f'../models/vectorizers/label_encoder_{col}.pkl')
for col in ord_encoders:
    joblib.dump(ord_encoders[col], f'../models/vectorizers/ordinal_encoder_{col}.pkl')

Сохраним текстовые фичи для получения эмбеддингов позднее:

In [None]:
# Pull out the raw text columns that embeddings are computed from.
question_fulls, titles, anwers = df['QUESTION_FULL'], df['TITLE'], df['ANSWER']

# Only the answer texts are exported in this run; the question and title exports
# are kept below but disabled.
# question_fulls.to_csv('../torch_container/data/input/question_texts.csv', index=False)
# titles.to_csv('../torch_container/data/input/title_texts.csv', index=False)
anwers.to_csv(path_or_buf='../torch_container/data/input/answer_texts.csv', index=False)

Проиндексируем эмбеддинги:

In [46]:
# Load the three precomputed embedding matrices (question, title, answer) from
# their .npy files.
question_bge, title_bge, answer_bge = map(np.load, (
    '../data/features/question_bge_m3.npy',
    '../data/features/title_bge_m3.npy',
    '../data/features/answer_bge_m3.npy',
))

In [47]:
# Re-save each embedding matrix next to the row ids of df, using the same
# features/ids layout as the other feature files.
row_ids = df.index.values
for npz_path, emb in (
    ('../data/features/question_bge_m3.npz', question_bge),
    ('../data/features/title_bge_m3.npz', title_bge),
    ('../data/features/answer_bge_m3.npz', answer_bge),
):
    # Rows are paired with ids purely by position (presumably how consumers read
    # them), so a length mismatch would silently mis-align every row. Fail
    # loudly instead of writing a bad file.
    if len(emb) != len(row_ids):
        raise ValueError(
            f"{npz_path}: {len(emb)} embedding rows do not match {len(row_ids)} rows in df"
        )
    np.savez(npz_path, features=emb, ids=row_ids)

In [29]:
# Re-open the saved time and categorical feature archives to inspect them.
timef, catf = (
    np.load(npz_path)
    for npz_path in ('../data/features/time_features.npz', '../data/features/cat_features.npz')
)

In [31]:
# Take every 1000th row of the stored time features and show the sample's shape.
time_sample = timef['features'][::1000]
# catf['features']
time_sample.shape

(919, 7)

In [5]:
# Open the saved TF-IDF archive; np.load on an .npz is lazy, so nothing is
# decoded until a key is accessed.
# NOTE(review): if 'features' was written from a SciPy sparse matrix it is a
# pickled object array, and reading tfidfn['features'] would presumably need
# allow_pickle=True — confirm.
tfidfn = np.load(file="../data/features/question_tfidf.npz")

In [13]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the local `src`
# package resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_loader import load_features