In [3]:
import warnings
# Suppress all warnings for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to every warning raised.
warnings.filterwarnings('ignore')

In [10]:
import numpy as np
import pandas as pd
from scipy.integrate import trapezoid
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    root_mean_squared_error, mean_absolute_percentage_error
)

DATA_FOLDER_PATH = 'data' # folder with the time-series files


def load_data(data_path: str) -> pd.DataFrame:
    """Read the CSV file located at ``data_path``.

    Args:
        data_path: Path to the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        The parsed file as a DataFrame.
    """
    frame = pd.read_csv(data_path)
    return frame


def preprocess_data(work_file, data_folder=None):
    """Drop rows whose time-series file is missing or empty.

    Each value of ``work_file['file_name']`` is resolved inside the data
    folder and read as a tab-separated file. A row is kept only when its
    file exists and parses to at least one row.

    Args:
        work_file: DataFrame with a ``file_name`` column.
        data_folder: Folder that contains the files. When ``None`` (the
            default) the module-level ``DATA_FOLDER_PATH`` is used, looked
            up at call time.

    Returns:
        The kept rows of ``work_file`` in their original order, with the
        index reset to 0..n-1.
    """
    folder = DATA_FOLDER_PATH if data_folder is None else data_folder

    # Collect row *positions* rather than index labels, so the selection is
    # correct even when ``work_file`` does not carry a default 0..n-1 index.
    kept_positions = []
    for position, file_name in enumerate(work_file['file_name']):
        try:
            txt_file = pd.read_csv(f'{folder}/{file_name}', delimiter='\t', names=['t', 'delta_p', 'dp'])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            continue
        if not txt_file.empty:
            kept_positions.append(position)
    return work_file.iloc[kept_positions].reset_index(drop=True)


def _segment_slope(t, values):
    """Return the least-squares slope of ``values`` over ``t``, or NaN when fewer than two points are given."""
    if len(t) < 2:
        # A single point has no slope; report NaN explicitly instead of
        # handing scipy a one-point sample.
        return float('nan')
    return linregress(t, values)[0]


def calculate_file_stats(dataframe, data_folder=None):
    """Compute per-file summary statistics of the time series.

    Every file named in ``dataframe['file_name']`` is read as a tab-separated
    table with columns ``t``, ``deltaP`` and ``dP``. The peak is the row
    where ``dP`` is largest.

    Args:
        dataframe: DataFrame with a ``file_name`` column.
        data_folder: Folder that contains the files. When ``None`` (the
            default) the module-level ``DATA_FOLDER_PATH`` is used, looked
            up at call time.

    Returns:
        A DataFrame with one row per file and the columns ``file_name``,
        min/max/mean of ``deltaP`` and ``dP``, ``std_dP``, ``peak_time``,
        the regression slopes of ``dP`` over ``t`` up to and from the peak
        (both segments include the peak row; NaN when a segment has a single
        point), and the trapezoidal integral of ``dP`` over ``t``. The
        columns are present even when ``dataframe`` has no rows.
    """
    folder = DATA_FOLDER_PATH if data_folder is None else data_folder
    columns = [
        'file_name',
        'max_deltaP', 'min_deltaP',
        'max_dP', 'min_dP',
        'mean_deltaP', 'mean_dP', 'std_dP',
        'peak_time',
        'slope_dP_before_peak', 'slope_dP_after_peak',
        'integral_dP',
    ]

    stats = []
    for file_name in dataframe['file_name']:
        data = pd.read_csv(f'{folder}/{file_name}', delimiter='\t', names=['t', 'deltaP', 'dP'])
        peak_idx = data["dP"].idxmax()
        t, dp = data['t'], data['dP']
        stats.append({
            'file_name': file_name,
            'max_deltaP': data['deltaP'].max(),
            'min_deltaP': data['deltaP'].min(),
            'max_dP': dp.max(),
            'min_dP': dp.min(),
            'mean_deltaP': data['deltaP'].mean(),
            'mean_dP': dp.mean(),
            'std_dP': dp.std(),
            'peak_time': data.loc[peak_idx, 't'],
            'slope_dP_before_peak': _segment_slope(t.iloc[:peak_idx + 1], dp.iloc[:peak_idx + 1]),
            'slope_dP_after_peak': _segment_slope(t.iloc[peak_idx:], dp.iloc[peak_idx:]),
            'integral_dP': trapezoid(dp, t),
        })
    # Passing ``columns`` keeps the schema stable when ``stats`` is empty.
    return pd.DataFrame(stats, columns=columns)


def train_pipeline(data_path, binary_target, numeric_target):
    """Train a classifier for the binary target and a regressor for the numeric target.

    The markup CSV is loaded, rows without a usable time-series file are
    removed, and per-file statistics are used as features. The classifier is
    fitted on all rows; the regressor only on rows whose binary target
    equals 1.

    Args:
        data_path: Path to the markup CSV file.
        binary_target: Name of the column used as the classification target.
        numeric_target: Name of the column used as the regression target.

    Returns:
        A dict with the keys ``binary_model``, ``regression_model``,
        ``stats_columns`` (feature column names), ``clf_metrics`` and
        ``reg_metrics``. ``regression_model`` and ``reg_metrics`` are both
        ``None`` when no row has the binary target equal to 1.
    """
    data_csv = load_data(data_path)
    work_file = preprocess_data(data_csv)
    stats_by_file = calculate_file_stats(work_file)

    # Attach the targets; the assignment aligns rows on the index.
    stats_by_file[binary_target] = work_file[binary_target]
    stats_by_file[numeric_target] = work_file[numeric_target]

    # Train the classifier.
    X_clf = stats_by_file.drop(columns=[binary_target, numeric_target, 'file_name'])
    y_clf = stats_by_file[binary_target]
    X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
        X_clf, y_clf, test_size=0.2, random_state=42
    )

    clf_model = RandomForestClassifier(random_state=42, max_depth=5, n_jobs=-1, class_weight='balanced')
    clf_model.fit(X_train_clf, y_train_clf)

    # Predict once per split and reuse the result for both metrics.
    train_pred_clf = clf_model.predict(X_train_clf)
    test_pred_clf = clf_model.predict(X_test_clf)
    clf_metrics = {
        'train_accuracy': accuracy_score(y_train_clf, train_pred_clf),
        'test_accuracy': accuracy_score(y_test_clf, test_pred_clf),
        'train_b_accuracy': balanced_accuracy_score(y_train_clf, train_pred_clf),
        'test_b_accuracy': balanced_accuracy_score(y_test_clf, test_pred_clf),
    }

    # Train the regressor on the positive-class rows only.
    regression_data = stats_by_file[stats_by_file[binary_target] == 1]
    X_reg = regression_data.drop(columns=[binary_target, numeric_target, 'file_name'])
    y_reg = regression_data[numeric_target]

    # Both stay None when there is nothing to fit, so the return statement
    # below never touches an unassigned name.
    reg_model = None
    reg_metrics = None
    if not X_reg.empty:
        X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
            X_reg, y_reg, test_size=0.2, random_state=42
        )
        reg_model = RandomForestRegressor(random_state=42, max_depth=5, n_jobs=-1)
        reg_model.fit(X_train_reg, y_train_reg)

        train_pred_reg = reg_model.predict(X_train_reg)
        test_pred_reg = reg_model.predict(X_test_reg)
        reg_metrics = {
            'train_rmse': root_mean_squared_error(y_train_reg, train_pred_reg),
            'test_rmse': root_mean_squared_error(y_test_reg, test_pred_reg),
            'train_mape': mean_absolute_percentage_error(y_train_reg, train_pred_reg),
            'test_mape': mean_absolute_percentage_error(y_test_reg, test_pred_reg),
        }

    return {
        'binary_model': clf_model,
        'regression_model': reg_model,
        'stats_columns': X_clf.columns.tolist(),
        'clf_metrics': clf_metrics,
        'reg_metrics': reg_metrics
    }


def predict_pipeline(models, data_path, binary_target, numeric_target):
    """Predict the binary and numeric targets for the files listed in a CSV.

    Args:
        models: Dict with the keys ``binary_model`` and ``regression_model``
            (the latter may be ``None``). When it also has ``stats_columns``,
            the features are selected and ordered by that list.
        data_path: Path to the CSV file that lists the files to score.
        binary_target: Name of the output column for the classifier result.
        numeric_target: Name of the output column for the regressor result.

    Returns:
        A DataFrame with the columns ``file``, ``binary_target`` and
        ``numeric_target``. The numeric column is NaN wherever the
        classifier did not predict 1 or no regressor is available. The frame
        has no rows when no usable file remains after preprocessing.

    Raises:
        KeyError: If a column named in ``stats_columns`` is missing from the
            computed statistics.
    """
    new_data = load_data(data_path)
    processed_data = preprocess_data(new_data)
    stats = calculate_file_stats(processed_data)

    if stats.empty:
        # Nothing to score: return the expected columns without calling the
        # models on a zero-row input.
        return pd.DataFrame(columns=['file', binary_target, numeric_target])

    # Binary target. Use the column list stored at training time, when
    # present, so the features match what the models were fitted on.
    features = stats.drop(columns=['file_name'], errors='ignore')
    feature_columns = models.get('stats_columns')
    if feature_columns is not None:
        features = features[feature_columns]
    binary_preds = models['binary_model'].predict(features)

    result_df = pd.DataFrame({
        'file': processed_data['file_name'],
        binary_target: binary_preds
    })

    # Numeric target: only rows predicted as class 1 are passed to the regressor.
    numeric_preds = np.full(len(result_df), np.nan)
    if models['regression_model'] is not None:
        mask = binary_preds == 1
        if mask.any():
            numeric_preds[mask] = models['regression_model'].predict(features[mask])

    result_df[numeric_target] = numeric_preds
    return result_df

In [14]:
DATA_PATH = 'hq_markup_train.csv'  # markup CSV passed to both pipelines below
BINARY_TARGET = 'Линейный режим'  # column used as the classification target
NUMERIC_TARGET = 'Линейный режим_details'  # column used as the regression target

# Fit the classifier and the regressor on the markup file.
models = train_pipeline(DATA_PATH, BINARY_TARGET, NUMERIC_TARGET)

# NOTE(review): prediction runs on the same file that was used for training,
# so this output is not an out-of-sample check.
predictions = predict_pipeline(models, DATA_PATH, BINARY_TARGET, NUMERIC_TARGET)

In [16]:
models['clf_metrics']

{'train_accuracy': 0.83,
 'test_accuracy': 0.73,
 'train_b_accuracy': np.float64(0.8626034177535316),
 'test_b_accuracy': np.float64(0.6485411140583555)}

In [17]:
models['reg_metrics']

{'train_rmse': 0.22646365920211953,
 'test_rmse': 0.3360334110313135,
 'train_mape': 4.019804751771536,
 'test_mape': 0.5362102182913818}

In [18]:
predictions

Unnamed: 0,file,Линейный режим,Линейный режим_details
0,e9c7e07f-b723-4da3-918e-7dbcb360c830,0,
1,726565c9-e05b-4a28-9079-22d94f9bb9e2,1,1.005519
2,c2a4d43d-5a93-4c37-81b6-d4fdc5b0280b,0,
3,15e04219-9fa0-401d-8b42-833b71ccafb3,0,
4,d43f459d-97fa-4ab0-bacc-43e2cb73e1b9,1,0.412346
...,...,...,...
495,a0448188-be5d-4ad6-929f-1870710e224b,0,
496,c295e7be-009c-4893-94c0-194d449077ed,0,
497,ac06114e-e466-497a-a0b7-5166d784140e,1,0.017599
498,0fd9d92c-a6d6-44c2-80b2-7f3cfcbea3da,0,
