In [1]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# numerical RuntimeWarnings and deprecation notices raised by the libraries below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from scipy.integrate import trapezoid
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    root_mean_squared_error, mean_absolute_percentage_error
)

DATA_FOLDER_PATH = '../data' # folder containing the time-series files


In [3]:
def load_data(data_path: str):
    """Read the CSV file located at ``data_path`` and return it as a DataFrame."""
    frame = pd.read_csv(data_path)
    return frame

Детектим шум

In [4]:
def calculate_noise_stats(data, window=5, sigma=3):
    """Summarise spread, outliers and short-scale noise of ``deltaP`` and ``dP``.

    Args:
        data: DataFrame with numeric ``deltaP`` and ``dP`` columns.
        window: Length of the centred rolling-mean window used as the smooth
            reference signal (default 5, the value that used to be hard-coded).
        sigma: A sample is counted as an outlier when it lies more than
            ``sigma`` standard deviations from the column mean (default 3).

    Returns:
        dict with, for each of the two columns: the standard deviation of the
        raw column (keys ``std_*_noise``), the outlier count (``outliers_*``)
        and the mean / median absolute deviation from the rolling mean
        (``mean_noise_*`` / ``median_noise_*``). Key order is kept stable
        because the dict becomes feature columns downstream.
    """
    signals = {name: data[name] for name in ('deltaP', 'dP')}

    # NOTE: despite the ``*_noise`` suffix in the output keys, this is the
    # standard deviation of the raw column, not of the residual.
    spread = {name: series.std() for name, series in signals.items()}

    # Vectorised count instead of the Python-level ``sum`` over a Series;
    # ``int`` keeps the result a plain Python integer.
    outliers = {
        name: int(((series - series.mean()).abs() > sigma * spread[name]).sum())
        for name, series in signals.items()
    }

    # Absolute residual against a centred rolling mean; the window edges are
    # NaN and are skipped by ``mean`` / ``median``.
    residual = {
        name: (series - series.rolling(window=window, center=True).mean()).abs()
        for name, series in signals.items()
    }

    return {
        'std_deltaP_noise': spread['deltaP'],
        'std_dP_noise': spread['dP'],
        'outliers_deltaP': outliers['deltaP'],
        'outliers_dP': outliers['dP'],
        'mean_noise_deltaP': residual['deltaP'].mean(),
        'median_noise_deltaP': residual['deltaP'].median(),
        'mean_noise_dP': residual['dP'].mean(),
        'median_noise_dP': residual['dP'].median()
    }

Детектим скачки

In [5]:
def calculate_transition_stats(data):
    """Describe sample-to-sample jumps in the ``deltaP`` and ``dP`` columns.

    For each column the absolute first difference is taken; a "jump" is a
    difference larger than the mean difference plus three standard deviations.

    Returns:
        dict with the jump count, the largest difference and the mean
        difference for each of the two columns.
    """
    def _abs_increments(column):
        # Absolute change between consecutive samples (first entry is NaN).
        return data[column].diff().abs()

    def _count_above_cutoff(increments):
        cutoff = increments.mean() + 3 * increments.std()
        return int((increments > cutoff).sum())

    increments_deltaP = _abs_increments('deltaP')
    increments_dP = _abs_increments('dP')

    summary = {}
    summary['jumps_deltaP'] = _count_above_cutoff(increments_deltaP)
    summary['jumps_dP'] = _count_above_cutoff(increments_dP)
    summary['max_jump_deltaP'] = increments_deltaP.max()
    summary['max_jump_dP'] = increments_dP.max()
    summary['mean_jump_deltaP'] = increments_deltaP.mean()
    summary['mean_jump_dP'] = increments_dP.mean()
    return summary

In [6]:
def preprocess_data(work_file, data_folder=None):
    """Drop rows whose time-series file is missing or contains no data.

    Args:
        work_file: DataFrame with a ``file_name`` column; each value is joined
            to the data folder to locate a tab-separated file.
        data_folder: Folder that holds the files. Defaults to the module-level
            ``DATA_FOLDER_PATH`` when omitted.

    Returns:
        A copy of ``work_file`` restricted to rows whose file could be read
        and is non-empty, with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``work_file`` has no ``file_name`` column.
    """
    if 'file_name' not in work_file.columns:
        raise ValueError("Столбец 'file_name' отсутствует в данных.")

    folder = DATA_FOLDER_PATH if data_folder is None else data_folder

    def _has_data(file_name):
        try:
            # Only emptiness matters here, so a single row is enough.
            head = pd.read_csv(
                f'{folder}/{file_name}',
                delimiter='\t',
                names=['t', 'delta_p', 'dp'],
                nrows=1,
            )
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return False
        return not head.empty

    # Select by position: ``DataFrame.drop`` works on index *labels*, so
    # feeding it enumerate() positions was only correct for a default
    # RangeIndex and failed (or dropped the wrong rows) otherwise.
    keep_positions = [
        position
        for position, file_name in enumerate(work_file['file_name'])
        if _has_data(file_name)
    ]
    return work_file.iloc[keep_positions].reset_index(drop=True)

Детектим ступенчатость

In [7]:
def calculate_discreteness_stats(data):
    """Measure the sampling step of ``t`` and count step-like changes in ``deltaP``.

    Returns:
        dict with ``mean_time_step`` (mean difference between consecutive
        ``t`` values) and ``steps_deltaP`` (number of absolute ``deltaP``
        differences above mean + 3 * std of those differences).
    """
    sampling_intervals = data['t'].diff().dropna()

    increments = data['deltaP'].diff().abs()
    cutoff = increments.mean() + 3 * increments.std()
    # NOTE(review): this is the same criterion that produces 'jumps_deltaP'
    # in calculate_transition_stats, so the two features carry equal values.
    step_count = len(increments[increments > cutoff])

    return {
        'mean_time_step': sampling_intervals.mean(),
        'steps_deltaP': step_count
    }

In [8]:
def calculate_base_stats(data):
    """Compute basic descriptive features of one ``t`` / ``deltaP`` / ``dP`` record.

    Args:
        data: DataFrame with numeric ``t``, ``deltaP`` and ``dP`` columns.

    Returns:
        dict with min / max / mean of ``deltaP`` and ``dP``, the standard
        deviation of ``dP``, the ``t`` value at the ``dP`` maximum
        (``peak_time``), the least-squares slope of ``dP`` over ``t`` up to and
        including the peak and from the peak onward, and the trapezoidal
        integral of ``dP`` over ``t``. A slope is NaN when its segment has
        fewer than two points or a constant ``t``.
    """
    t_values = data['t'].to_numpy()
    dP_values = data['dP'].to_numpy()

    # Positional index of the maximum. The previous code took the index
    # *label* from idxmax() and used it in positional slices, which is only
    # correct when the frame carries a default RangeIndex.
    peak_pos = int(data['dP'].argmax())

    return {
        'max_deltaP': data['deltaP'].max(),
        'min_deltaP': data['deltaP'].min(),
        'max_dP': data['dP'].max(),
        'min_dP': data['dP'].min(),
        'mean_deltaP': data['deltaP'].mean(),
        'mean_dP': data['dP'].mean(),
        'std_dP': data['dP'].std(),
        'peak_time': t_values[peak_pos],
        'slope_dP_before_peak': _segment_slope(t_values[:peak_pos + 1], dP_values[:peak_pos + 1]),
        'slope_dP_after_peak': _segment_slope(t_values[peak_pos:], dP_values[peak_pos:]),
        'integral_dP': trapezoid(dP_values, t_values)
    }


def _segment_slope(t_segment, y_segment):
    """Return the linear-regression slope of ``y`` over ``t``, or NaN if undefined."""
    # A peak at the first / last sample leaves a one-point segment, and a
    # constant ``t`` has no defined slope; report NaN explicitly rather than
    # relying on how linregress reacts to such input.
    if len(t_segment) < 2 or np.ptp(t_segment) == 0:
        return float('nan')
    return linregress(t_segment, y_segment)[0]

In [9]:
def calculate_file_stats(dataframe):
    """Build one row of summary features for every file in ``dataframe['file_name']``.

    Each file is read from ``DATA_FOLDER_PATH`` as a tab-separated table with
    columns ``t``, ``deltaP`` and ``dP``; the base, noise, transition and
    discreteness statistics are merged, in that order, into a single record.

    Returns:
        DataFrame with a ``file_name`` column followed by the feature columns.
    """
    feature_builders = (
        calculate_base_stats,
        calculate_noise_stats,
        calculate_transition_stats,
        calculate_discreteness_stats,
    )

    def _describe(file_name):
        series = pd.read_csv(f'{DATA_FOLDER_PATH}/{file_name}', delimiter='\t', names=['t', 'deltaP', 'dP'])
        record = {'file_name': file_name}
        for build in feature_builders:
            record.update(build(series))
        return record

    records = [_describe(file_name) for file_name in dataframe['file_name']]
    return pd.DataFrame(records)

In [10]:
def train_pipeline(data_path, binary_target, numeric_target):
    """Train a classifier for the binary target and a regressor for the numeric one.

    The markup CSV at ``data_path`` is loaded, rows without a usable
    time-series file are removed, per-file features are computed, and two
    random-forest models are fitted on an 80/20 train/test split each.

    Args:
        data_path: Path to the markup CSV (must contain ``file_name`` and both
            target columns).
        binary_target: Name of the column used as the classification target.
        numeric_target: Name of the column used as the regression target. The
            regressor is fitted only on rows where ``binary_target == 1``.

    Returns:
        dict with keys ``binary_model``, ``regression_model`` (``None`` when
        fewer than two positive rows are available), ``stats_columns`` (feature
        names in training order), ``clf_metrics`` and ``reg_metrics`` (empty
        dict when no regressor was trained).
    """
    work_file = preprocess_data(load_data(data_path))
    stats_by_file = calculate_file_stats(work_file)

    # Attach the targets; both frames share a fresh 0..n-1 index, so the
    # assignment aligns row by row.
    stats_by_file[binary_target] = work_file[binary_target]
    stats_by_file[numeric_target] = work_file[numeric_target]
    non_feature_columns = [binary_target, numeric_target, 'file_name']

    # --- classifier ---
    X_clf = stats_by_file.drop(columns=non_feature_columns)
    y_clf = stats_by_file[binary_target]
    X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
        X_clf, y_clf, test_size=0.2, random_state=42
    )

    clf_model = RandomForestClassifier(
        random_state=42,
        max_depth=5,
        n_jobs=-1,
        class_weight='balanced',
        n_estimators=300,
        min_samples_split=5,
        min_samples_leaf=4,
    )
    clf_model.fit(X_train_clf, y_train_clf)

    # Predict once per split and reuse the result for both metrics.
    train_pred_clf = clf_model.predict(X_train_clf)
    test_pred_clf = clf_model.predict(X_test_clf)
    clf_metrics = {
        'train_accuracy': accuracy_score(y_train_clf, train_pred_clf),
        'test_accuracy': accuracy_score(y_test_clf, test_pred_clf),
        'train_b_accuracy': balanced_accuracy_score(y_train_clf, train_pred_clf),
        'test_b_accuracy': balanced_accuracy_score(y_test_clf, test_pred_clf),
    }

    # --- regressor (positive rows only) ---
    regression_data = stats_by_file[stats_by_file[binary_target] == 1]
    X_reg = regression_data.drop(columns=non_feature_columns)
    y_reg = regression_data[numeric_target]

    # Defaults for the "nothing to train on" case. Previously ``reg_metrics``
    # was left unbound here and the return statement raised UnboundLocalError.
    reg_model = None
    reg_metrics = {}

    # At least two rows are needed so that both sides of the split are non-empty.
    if not X_reg.empty and len(X_reg) >= 2:
        X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
            X_reg, y_reg, test_size=0.2, random_state=42
        )
        reg_model = RandomForestRegressor(random_state=42, max_depth=5, n_jobs=-1)
        reg_model.fit(X_train_reg, y_train_reg)

        train_pred_reg = reg_model.predict(X_train_reg)
        test_pred_reg = reg_model.predict(X_test_reg)
        reg_metrics = {
            'train_rmse': root_mean_squared_error(y_train_reg, train_pred_reg),
            'test_rmse': root_mean_squared_error(y_test_reg, test_pred_reg),
            'train_mape': mean_absolute_percentage_error(y_train_reg, train_pred_reg),
            'test_mape': mean_absolute_percentage_error(y_test_reg, test_pred_reg),
        }

    return {
        'binary_model': clf_model,
        'regression_model': reg_model,
        'stats_columns': X_clf.columns.tolist(),
        'clf_metrics': clf_metrics,
        'reg_metrics': reg_metrics
    }

In [11]:
DATA_PATH = 'hq_markup_train.csv'
BINARY_TARGET = 'Некачественное ГДИС'
# NOTE(review): NUMERIC_TARGET names the same column as BINARY_TARGET.
# train_pipeline fits the regressor only on rows where the binary target
# equals 1, so with this setting the regression target is constant.
# Presumably a different numeric column was intended — confirm against the
# markup file.
NUMERIC_TARGET = 'Некачественное ГДИС'

# Train both models; the returned dict holds the fitted models, the feature
# column names and the train/test metrics.
models = train_pipeline(DATA_PATH, BINARY_TARGET, NUMERIC_TARGET)

# NOTE(review): predict_pipeline is not defined in the cells visible here.
# predictions = predict_pipeline(models, DATA_PATH, BINARY_TARGET, NUMERIC_TARGET)

In [12]:
# Show the classifier's train/test accuracy and balanced accuracy from train_pipeline.
models['clf_metrics']

{'train_accuracy': 0.9775,
 'test_accuracy': 0.92,
 'train_b_accuracy': np.float64(0.9582251082251083),
 'test_b_accuracy': np.float64(0.8751902587519026)}

In [13]:
# NOTE(review): NUMERIC_TARGET is set to the same column as BINARY_TARGET in the
# configuration cell, so the regressor is fitted on a constant target. The
# all-zero errors below are therefore expected and say nothing about model quality.
models['reg_metrics']

{'train_rmse': 0.0, 'test_rmse': 0.0, 'train_mape': 0.0, 'test_mape': 0.0}