In [161]:
import numpy as np
import pandas as pd
from scipy.integrate import trapezoid
from scipy.optimize import curve_fit
from scipy.stats import linregress

In [162]:
# Load the markup tables from CSV files in the current working directory.
# Later cells expect `data_csv` to contain a 'file_name' column plus the
# columns named in the configuration cell.
# NOTE(review): the "hq_" prefix presumably marks a curated subset — confirm.
data_csv = pd.read_csv("hq_markup_train.csv")
# Second markup table; it is not referenced by the cells visible in this part
# of the notebook.
bq_data_csv = pd.read_csv("markup_train.csv")
# Uncomment the next line to display the loaded table.
# data_csv

Здесь нужно указать, какой файл и какие столбцы будут обрабатываться.

In [163]:
# Table processed by the cells below. This is a plain assignment, so
# `work_file` initially refers to the same object as `data_csv` (no copy).
work_file = data_csv
# Indicator column: rows where it equals 1 are selected for the regression.
bin_col_name = 'Билинейный режим'
# Column used as the regression target for the selected rows.
digit_col_name = 'Билинейный режим_details'

In [164]:
# col_name = 'Билинейный режим_details'
# # col_name.replace('_details', '')
# data_csv[data_csv[col_name.replace('_details', '')]==1][['Билинейный режим', 'Билинейный режим_details']].isna().sum()

In [165]:
# Output columns, in the order they appear in the returned DataFrame.
_STATS_COLUMNS = [
    'file_name',
    'max_deltaP',
    'min_deltaP',
    'max_dP',
    'min_dP',
    'mean_deltaP',
    'mean_dP',
    'std_dP',
    'peak_time',
    'slope_dP_before_peak',
    'slope_dP_after_peak',
    'integral_dP',
]


def _regression_slope(t, values) -> float:
    """Return the least-squares slope of ``values`` against ``t``.

    A slope is undefined for fewer than two points (this happens when the dP
    peak is the first or the last sample), so NaN is returned in that case
    instead of running ``linregress`` on a degenerate sample.
    """
    if len(t) < 2:
        return np.nan
    slope, *_ = linregress(t, values)
    return slope


def _curve_features(file_name, data: pd.DataFrame) -> dict:
    """Build the feature record for one loaded table with 't', 'deltaP', 'dP' columns."""
    t = data['t'].to_numpy()
    dp = data['dP'].to_numpy()

    # Position (not index label) of the largest dP value, so the slices below
    # stay correct however ``data`` happens to be indexed.
    peak_pos = data['dP'].argmax()

    return {
        'file_name': file_name,
        'max_deltaP': data['deltaP'].max(),
        'min_deltaP': data['deltaP'].min(),
        'max_dP': data['dP'].max(),
        'min_dP': data['dP'].min(),
        'mean_deltaP': data['deltaP'].mean(),
        'mean_dP': data['dP'].mean(),
        'std_dP': data['dP'].std(),
        'peak_time': t[peak_pos],
        # Both segments include the peak sample itself.
        'slope_dP_before_peak': _regression_slope(t[:peak_pos + 1], dp[:peak_pos + 1]),
        'slope_dP_after_peak': _regression_slope(t[peak_pos:], dp[peak_pos:]),
        'integral_dP': trapezoid(dp, t),
    }


def calculate_file_stats(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Compute summary statistics for every file listed in ``dataframe``.

    Each file is read from ``data/<file_name>`` as a tab-separated table
    without a header row; its three columns are named 't', 'deltaP' and 'dP'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        DataFrame with a 'file_name' column holding the names of the files
        to process. One output row is produced per entry, in the same order.

    Returns
    -------
    stats_by_file : pandas.DataFrame
        One row per file with the following columns:
        - file_name: name of the file
        - max_deltaP: maximum of deltaP
        - min_deltaP: minimum of deltaP
        - max_dP: maximum of dP
        - min_dP: minimum of dP
        - mean_deltaP: mean of deltaP
        - mean_dP: mean of dP
        - std_dP: standard deviation of dP (pandas default, sample std)
        - peak_time: value of t at the maximum of dP
        - slope_dP_before_peak: linear-regression slope of dP over t from the
          first sample up to and including the peak (NaN if fewer than two
          points)
        - slope_dP_after_peak: linear-regression slope of dP over t from the
          peak to the last sample (NaN if fewer than two points)
        - integral_dP: trapezoidal integral of dP over t

        If ``dataframe`` has no rows, an empty DataFrame with these columns
        is returned.

    Raises
    ------
    ValueError
        If a file contains no rows (the peak position cannot be determined).
    """
    # Collect plain records and build the DataFrame once: growing a frame with
    # pd.concat inside the loop is quadratic and depends on how pandas treats
    # empty frames when inferring column dtypes.
    records = []
    for file_name in dataframe['file_name']:
        data = pd.read_csv(f'data/{file_name}', delimiter='\t', names=['t', 'deltaP', 'dP'])
        records.append(_curve_features(file_name, data))

    return pd.DataFrame(records, columns=_STATS_COLUMNS)

Удалим все строки, которые указывают на пустой файл

In [166]:
print(work_file.shape)

# Flag every row whose data file contains no rows. Reading a single row
# (nrows=1) is enough to tell an empty file from a non-empty one, so the
# files are not loaded in full just for this check.
is_empty_file = work_file['file_name'].map(
    lambda file_name: pd.read_csv(
        f'data/{file_name}', delimiter='\t', names=['t', 'delta_p', 'dp'], nrows=1
    ).empty
).astype(bool)

# Index labels of the flagged rows. They are taken from the frame's own index
# rather than from enumerate() positions, because label-based dropping and
# positional counters only agree for a default 0..n-1 index.
indices_to_drop = work_file.index[is_empty_file.to_numpy()].tolist()

# Keep the rows that point to non-empty files and renumber them from zero.
work_file = work_file[~is_empty_file].reset_index(drop=True)
work_file.shape

(500, 18)


(500, 18)

In [167]:
# Select the rows whose indicator column equals 1 and renumber them from zero.
regime_mask = work_file[bin_col_name].eq(1)
regression_data = work_file.loc[regime_mask].reset_index(drop=True)

In [None]:
# Per-file feature table for the selected rows, indexed from zero.
stats_by_file = (
    calculate_file_stats(regression_data)
    .reset_index(drop=True)
)

In [169]:
# Attach the regression target to the feature table. The values are aligned
# by index label, so this relies on `stats_by_file` and `regression_data`
# sharing the same index.
stats_by_file = stats_by_file.assign(**{digit_col_name: regression_data[digit_col_name]})

In [170]:
from sklearn.model_selection import train_test_split

# The target and the file identifier are not model inputs.
non_feature_columns = [digit_col_name, 'file_name']
y = stats_by_file[digit_col_name]
X = stats_by_file.drop(columns=non_feature_columns)

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Summary statistics of the regression target, for context when reading the
# error metrics reported further down.
y.describe()

In [172]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest with a fixed seed so the fit is reproducible.
model = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=5)
model.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric instead of
# calling model.predict again on each print line.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# NOTE(review): scikit-learn reports MAPE as a fraction, not a percentage —
# confirm against the library documentation when interpreting the values.
print(f'train rmse: {root_mean_squared_error(y_train, y_pred_train)}')
print(f'test rmse: {root_mean_squared_error(y_test, y_pred)}')
print(f'train mape: {mean_absolute_percentage_error(y_train, y_pred_train)}')
print(f'test mape: {mean_absolute_percentage_error(y_test, y_pred)}')

train rmse: 0.16406662742195838
test rmse: 0.2561705747070019
train mape: 1.1382031968985804
test mape: 1.3689997810080077


In [173]:
# Importance of each input feature in the fitted forest; the values are in
# the same order as the columns of X (see X.columns).
model.feature_importances_

array([0.20101269, 0.05083238, 0.10935935, 0.1075651 , 0.09905236,
       0.14935978, 0.09594139, 0.00440744, 0.01054387, 0.04356602,
       0.12835962])