In [86]:
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and numeric
# warnings are hidden as well — confirm that is intended.
warnings.filterwarnings('ignore')

In [87]:
import os
import glob
import numpy as np
import pandas as pd
from scipy.integrate import trapezoid
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    root_mean_squared_error, mean_absolute_percentage_error
)

# Folder holding the per-record data files; preprocess_data and
# calculate_file_stats prepend it to every `file_name` value they read.
DATA_FOLDER_PATH = 'data'

# Names of the binary label columns. train_pipeline fits one classifier for
# each entry that is present in the markup CSV; every entry except
# 'Некачественное ГДИС' is additionally paired with a "<name>_details" column
# that is used as a regression target.
BINARY_TARGETS = [
    'Некачественное ГДИС',
    'Влияние ствола скважины',
    'Радиальный режим',
    'Линейный режим',
    'Билинейный режим',
    'Сферический режим',
    'Граница постоянного давления',
    'Граница непроницаемый разлом'
]


def load_data(data_path: str):
    """Read the CSV at ``data_path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(data_path)
    return frame


def preprocess_data(work_file):
    """Drop the rows whose data file is missing or empty.

    Every value of ``work_file['file_name']`` is looked up under
    ``DATA_FOLDER_PATH`` and parsed as a tab-separated file with the columns
    ``t``, ``delta_p`` and ``dp``. A row is kept only when that file exists
    and parses to at least one record.

    Args:
        work_file: DataFrame with a ``file_name`` column. Its index may be
            arbitrary (non-default or with duplicate labels).

    Returns:
        A new DataFrame containing only the usable rows, re-indexed from 0.
    """
    # Track rows by position rather than by index label: the previous
    # label-based ``drop`` received positions from ``enumerate`` and therefore
    # only worked when the index happened to be 0..n-1.
    keep_mask = np.ones(len(work_file), dtype=bool)
    for position, file_name in enumerate(work_file['file_name']):
        try:
            txt_file = pd.read_csv(
                f'{DATA_FOLDER_PATH}/{file_name}',
                delimiter='\t',
                names=['t', 'delta_p', 'dp'],
            )
        except (FileNotFoundError, pd.errors.EmptyDataError):
            keep_mask[position] = False
            continue
        if txt_file.empty:
            keep_mask[position] = False
    return work_file.loc[keep_mask].reset_index(drop=True)


def _segment_slope(t, values):
    """Return the least-squares slope of ``values`` over ``t``, or NaN if undefined."""
    # A slope needs at least two samples taken at distinct times; linregress
    # rejects segments whose t values are all identical, so answer NaN here
    # instead of letting one degenerate file abort the whole extraction.
    if len(t) < 2 or t.min() == t.max():
        return np.nan
    return linregress(t, values)[0]


def calculate_file_stats(dataframe):
    """Compute summary features for every file listed in ``dataframe``.

    Each ``file_name`` is read from ``DATA_FOLDER_PATH`` as a tab-separated
    file with the columns ``t``, ``deltaP`` and ``dP``.

    Args:
        dataframe: DataFrame with a ``file_name`` column. Every listed file is
            expected to exist and to be non-empty.

    Returns:
        DataFrame with one row per file, in input order, with the columns
        ``file_name``, ``max_deltaP``, ``min_deltaP``, ``max_dP``, ``min_dP``,
        ``mean_deltaP``, ``mean_dP``, ``std_dP``, ``peak_time`` (``t`` at the
        maximum of ``dP``), ``slope_dP_before_peak`` / ``slope_dP_after_peak``
        (linear-regression slopes of ``dP`` over ``t`` up to and from the peak,
        both segments including the peak row; NaN when a segment has no
        defined slope) and ``integral_dP`` (trapezoidal integral of ``dP``
        over ``t``).
    """
    stats = []
    for file_name in dataframe['file_name']:
        data = pd.read_csv(
            f'{DATA_FOLDER_PATH}/{file_name}',
            delimiter='\t',
            names=['t', 'deltaP', 'dP'],
        )
        peak_idx = data["dP"].idxmax()
        # Integer slices on a Series are positional; with the default
        # 0..n-1 index produced by read_csv the peak label equals its position.
        t_before, dp_before = data['t'][:peak_idx + 1], data["dP"][:peak_idx + 1]
        t_after, dp_after = data['t'][peak_idx:], data["dP"][peak_idx:]
        stats.append({
            'file_name': file_name,
            'max_deltaP': data['deltaP'].max(),
            'min_deltaP': data['deltaP'].min(),
            'max_dP': data['dP'].max(),
            'min_dP': data['dP'].min(),
            'mean_deltaP': data['deltaP'].mean(),
            'mean_dP': data['dP'].mean(),
            'std_dP': data['dP'].std(),
            'peak_time': data.loc[peak_idx, 't'],
            'slope_dP_before_peak': _segment_slope(t_before, dp_before),
            'slope_dP_after_peak': _segment_slope(t_after, dp_after),
            'integral_dP': trapezoid(data["dP"], data['t'])
        })
    return pd.DataFrame(stats)


def train_pipeline(data_path):
    """Train one classifier, and where applicable one regressor, per target.

    The markup CSV is loaded with ``load_data``, rows without a usable data
    file are removed by ``preprocess_data`` and per-file features are computed
    by ``calculate_file_stats``. For each name in ``BINARY_TARGETS`` that is a
    column of the markup:

    * a ``RandomForestClassifier`` is fitted on a stratified 80/20 split of
      the features against the binary column;
    * for every target except 'Некачественное ГДИС', if the markup has a
      ``"<target>_details"`` column, a ``RandomForestRegressor`` is fitted on
      an 80/20 split of the rows whose flag equals 1 and whose details value
      is present.

    Targets that cannot be split with stratification (fewer than two classes,
    or a class with a single sample) are skipped.

    Args:
        data_path: Path of the markup CSV. It must contain a ``file_name``
            column naming files under ``DATA_FOLDER_PATH``.

    Returns:
        ``{'models': {...}, 'base_features': [...]}``. ``models`` maps each
        trained target name to a dict with the keys ``classifier``,
        ``regressor`` (``None`` when no regressor was trained), ``features``,
        ``clf_metrics``, ``reg_metrics`` (empty when no regressor was trained)
        and ``numeric_target``.
    """
    data_csv = load_data(data_path)
    work_file = preprocess_data(data_csv)
    # Row i of stats_by_file describes row i of work_file: preprocess_data
    # re-indexes from 0 and the stats are built in the same order.
    stats_by_file = calculate_file_stats(work_file)

    models = {}

    base_features = [
        col for col in stats_by_file.columns
        if col != 'file_name'
           and not col.endswith('_details')
           and col not in BINARY_TARGETS
    ]

    for binary_target in BINARY_TARGETS:
        if binary_target not in work_file.columns:
            continue

        numeric_target = f"{binary_target}_details" if binary_target != 'Некачественное ГДИС' else None

        # --- Classifier ---
        X_clf = stats_by_file[base_features]
        y_clf = work_file[binary_target]

        # A stratified split needs at least two classes with at least two
        # samples each; train_test_split raises otherwise, so skip the target.
        class_counts = y_clf.value_counts()
        if len(class_counts) < 2 or class_counts.min() < 2:
            continue

        X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
            X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
        )

        clf_model = RandomForestClassifier(
            random_state=42,
            max_depth=5,
            n_jobs=-1,
            class_weight='balanced'
        )
        clf_model.fit(X_train_clf, y_train_clf)

        # Predict once per split and reuse the result for both metrics.
        train_pred_clf = clf_model.predict(X_train_clf)
        test_pred_clf = clf_model.predict(X_test_clf)
        clf_metrics = {
            'train_accuracy': accuracy_score(y_train_clf, train_pred_clf),
            'test_accuracy': accuracy_score(y_test_clf, test_pred_clf),
            'train_b_accuracy': balanced_accuracy_score(y_train_clf, train_pred_clf),
            'test_b_accuracy': balanced_accuracy_score(y_test_clf, test_pred_clf),
        }

        # --- Regressor (only on rows where the flag is set) ---
        reg_model = None
        reg_metrics = {}
        if numeric_target and numeric_target in work_file.columns:
            # Rows without a details value cannot serve as regression targets.
            regression_data = work_file[work_file[binary_target] == 1].dropna(
                subset=[numeric_target]
            )
            if len(regression_data) >= 2:
                X_reg = stats_by_file.loc[regression_data.index, base_features]
                y_reg = regression_data[numeric_target]

                X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
                    X_reg, y_reg, test_size=0.2, random_state=42
                )

                reg_model = RandomForestRegressor(
                    random_state=42,
                    max_depth=5,
                    n_jobs=-1
                )
                reg_model.fit(X_train_reg, y_train_reg)

                train_pred_reg = reg_model.predict(X_train_reg)
                test_pred_reg = reg_model.predict(X_test_reg)
                reg_metrics = {
                    'train_rmse': root_mean_squared_error(y_train_reg, train_pred_reg),
                    'test_rmse': root_mean_squared_error(y_test_reg, test_pred_reg),
                    'train_mape': mean_absolute_percentage_error(y_train_reg, train_pred_reg),
                    'test_mape': mean_absolute_percentage_error(y_test_reg, test_pred_reg),
                }

        models[binary_target] = {
            'classifier': clf_model,
            'regressor': reg_model,
            'features': base_features,
            'clf_metrics': clf_metrics,
            'reg_metrics': reg_metrics,
            'numeric_target': numeric_target
        }

    return {'models': models, 'base_features': base_features}


def predict_pipeline(models, val_data_folder_path):
    """Predict every target for each data file in a folder.

    Every regular file directly inside ``val_data_folder_path`` is parsed as a
    tab-separated file with the columns ``t``, ``delta_p`` and ``dp`` and
    reduced to the same summary features that ``calculate_file_stats``
    produces. Files are processed in sorted path order. Empty files are
    reported on stdout, counted and left out of the result.

    Args:
        models: Dict with a ``'models'`` entry, as returned by
            ``train_pipeline``: target name -> dict with the keys
            ``classifier``, ``regressor``, ``features`` and ``numeric_target``.
        val_data_folder_path: Folder containing the files to predict on.

    Returns:
        DataFrame with one row per non-empty file: a ``file`` column, one
        column per entry of ``BINARY_TARGETS`` and one ``"<target>_details"``
        column for every target except 'Некачественное ГДИС'. Columns without
        a trained model, and details of rows predicted as 0, stay NaN. When no
        usable file is found the frame has these columns and no rows.
    """

    def slope_or_nan(t, values):
        # A slope needs at least two samples at distinct times; linregress
        # rejects segments whose t values are all identical.
        if len(t) < 2 or t.min() == t.max():
            return np.nan
        return linregress(t, values)[0]

    column_names = ['t', 'delta_p', 'dp']
    empty_count = 0  # number of empty files encountered
    stats = []

    # Sort for a reproducible row order; glob itself guarantees none.
    for file_path in sorted(glob.glob(os.path.join(val_data_folder_path, '*'))):
        if not os.path.isfile(file_path):
            continue  # sub-directories match '*' too but cannot be parsed
        file_name = os.path.basename(file_path)
        try:
            txt_file = pd.read_csv(file_path, delimiter='\t', names=column_names)
        except pd.errors.EmptyDataError:
            # Treated as an empty file, like preprocess_data does in training.
            txt_file = pd.DataFrame(columns=column_names)

        if txt_file.empty:
            print(f'Пустой файл: {file_name}')
            empty_count += 1
            continue

        peak_idx = txt_file["dp"].idxmax()
        stats.append({
            'file_name': file_name,
            'max_deltaP': txt_file['delta_p'].max(),
            'min_deltaP': txt_file['delta_p'].min(),
            'max_dP': txt_file['dp'].max(),
            'min_dP': txt_file['dp'].min(),
            'mean_deltaP': txt_file['delta_p'].mean(),
            'mean_dP': txt_file['dp'].mean(),
            'std_dP': txt_file['dp'].std(),
            'peak_time': txt_file.loc[peak_idx, 't'],
            'slope_dP_before_peak': slope_or_nan(txt_file['t'][:peak_idx + 1], txt_file["dp"][:peak_idx + 1]),
            'slope_dP_after_peak': slope_or_nan(txt_file['t'][peak_idx:], txt_file["dp"][peak_idx:]),
            'integral_dP': trapezoid(txt_file["dp"], txt_file['t'])
        })
    print(f'empty files count: {empty_count}')

    # Passing the column list keeps the frame well-formed when no file was usable.
    stat_columns = [
        'file_name', 'max_deltaP', 'min_deltaP', 'max_dP', 'min_dP',
        'mean_deltaP', 'mean_dP', 'std_dP', 'peak_time',
        'slope_dP_before_peak', 'slope_dP_after_peak', 'integral_dP',
    ]
    stats = pd.DataFrame(stats, columns=stat_columns)

    all_columns = ['file'] + BINARY_TARGETS + [f"{t}_details" for t in BINARY_TARGETS if t != 'Некачественное ГДИС']

    result_df = pd.DataFrame({col: np.nan for col in all_columns}, index=stats.index)

    result_df['file'] = stats['file_name']

    if stats.empty:
        # Nothing to predict on; the estimators would reject a 0-row input.
        return result_df

    # Predictions for every trained target
    for target, model_data in models['models'].items():
        features = model_data['features']
        numeric_target = model_data['numeric_target']
        regressor = model_data['regressor']

        # Binary prediction
        X = stats[features]
        result_df[target] = model_data['classifier'].predict(X)

        # Numeric prediction, only for the rows predicted as positive.
        # Compare with None explicitly instead of relying on the truthiness
        # of the estimator object.
        if regressor is not None and numeric_target:
            mask = result_df[target] == 1
            if mask.any():
                result_df.loc[mask, numeric_target] = regressor.predict(
                    stats.loc[mask, features]
                )

    return result_df

In [88]:
# Markup CSV passed to train_pipeline.
DATA_PATH = 'hq_markup_train.csv'

# Train the models
trained_models = train_pipeline(DATA_PATH)

In [89]:
# Run the trained models on every file in the 'validation 1' folder.
predictions = predict_pipeline(trained_models, 'validation 1')

Пустой файл: 846291a2-6475-47ac-8f10-08eac6b93fcb
empty files count: 1


In [90]:
# Display the prediction table returned by predict_pipeline.
predictions

Unnamed: 0,file,Некачественное ГДИС,Влияние ствола скважины,Радиальный режим,Линейный режим,Билинейный режим,Сферический режим,Граница постоянного давления,Граница непроницаемый разлом,Влияние ствола скважины_details,Радиальный режим_details,Линейный режим_details,Билинейный режим_details,Сферический режим_details,Граница постоянного давления_details,Граница непроницаемый разлом_details
0,00950953-2e3b-4e38-91cb-5bbdd5dbda0e,0,1,0,0,1,0,0,1,1.415261,,,1.115903,,,83.981770
1,01177e76-c463-423e-8b03-c4386c432d1b,0,1,1,0,1,0,0,0,1.493934,1.512206,,1.017995,,,
2,018d2fc4-3078-4ec5-89a3-5e8d4222ca15,0,1,0,0,0,1,1,0,1.490442,,,,1.400838,28.243110,
3,0194dbe5-a531-43a1-8111-5b2afa3f1e93,1,0,0,0,0,0,0,0,,,,,,,
4,01a8e17c-a75c-4e61-bd0f-580a3ea626ea,0,1,0,1,1,0,0,1,1.426424,,0.801447,0.837299,,,233.951526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,fd66f729-25ce-4564-8536-03566030b414,0,0,0,0,0,0,0,0,,,,,,,
495,fdaa271c-aa61-42c4-a5ba-ecaae371302e,0,1,1,0,0,1,0,0,1.960127,0.224393,,,0.541941,,
496,fe8ca18d-0914-45dc-9ef1-710e323ddf7f,1,0,0,0,0,0,0,0,,,,,,,
497,fef9e492-2e53-4041-959b-1a22710aaca5,0,1,0,0,0,1,1,0,2.478278,,,,0.952073,22.718777,


In [100]:
# Show the top-level keys of the dict returned by train_pipeline.
trained_models.keys()

dict_keys(['models', 'base_features'])

In [99]:
# Classifier metrics stored for the 'Билинейный режим' target.
trained_models['models']['Билинейный режим']['clf_metrics']

{'train_accuracy': 0.8625,
 'test_accuracy': 0.75,
 'train_b_accuracy': np.float64(0.8838685586258401),
 'test_b_accuracy': np.float64(0.7004517221908526)}

In [101]:
# Regressor metrics stored for the 'Билинейный режим' target.
trained_models['models']['Билинейный режим']['reg_metrics']

{'train_rmse': 0.16406662742195838,
 'test_rmse': 0.25617057470700194,
 'train_mape': 1.13820319689858,
 'test_mape': 1.3689997810080081}