In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Обработка результатов

## **Изучение полученных результатов из УМ**

In [None]:
def get_result(config:str,
               way_type:str,
               fault:str,
               speed:int,
               profile:str = "gost",
               force:str = "vertical",
               base_dir:str = "C:\\Users\\Daniil\\Desktop\\simulation_results") ->pd.DataFrame:
    """
    Load one simulation result selected by keywords.

    The file is looked up as
    ``<base_dir>/<force folder>/<config>/<config>_<way_type>_<fault>_<speed>_<profile>.csv``.

    Parameters
    ----------
    config : ``"empty"`` or ``"loaded"``.
    way_type : ``"straight"``, ``"curve_350"`` or ``"curve_650"``.
    fault : ``"normal"``, ``"polzun15"`` or ``"ellips10"``.
    speed : speed in km/h, used verbatim in the file name.
    profile : wheel profile: ``"gost"``, ``"newwagonw"``, ``"greb_24"``,
        ``"greb_26"``, ``"greb_28"`` or ``"greb_30"``.
    force : ``"vertical"`` or ``"side"``; selects the results sub-folder.
    base_dir : root folder of the exported results. The default keeps the
        original hard-coded location.

    Returns
    -------
    DataFrame indexed by ``time_step`` with a single column named after the
    run (the file name without ``.csv``).

    Raises
    ------
    ValueError
        If ``config`` or ``force`` is not one of the supported values.
    """
    # Upper time bound (presumably seconds) kept for the 350 m curve at each speed.
    time_limits = {"curve_350":{20:55,
                                30:39,
                                40:28,
                                50:24,
                                60:19},
                  }
    force_dirs = {"vertical": "Vertical force", "side": "Side force"}

    if force not in force_dirs:
        raise ValueError(f"force must be 'vertical' or 'side', got {force!r}")
    if config not in ("empty", "loaded"):
        raise ValueError(f"config must be 'empty' or 'loaded', got {config!r}")

    run_name = config+"_"+way_type+"_"+fault+"_"+str(speed)+"_"+profile
    file_name = run_name+".csv"
    full_path = os.path.join(base_dir, force_dirs[force], config, file_name)

    # Progress output kept byte-for-byte (the original printed a leading backslash).
    print("\\"+file_name)

    file = pd.read_csv(full_path,encoding='latin-1')

    # The export is expected to have exactly two columns: time and the measured value.
    file.columns = ["time_step",run_name]
    file = file.set_index("time_step")

    # The original compared `config` with "curve_350", which can never be true,
    # so the truncation was dead code. Speeds without an entry are left untouched.
    limit = time_limits.get(way_type, {}).get(speed)
    if limit is not None:
        file = file[file.index<limit]

    return file


In [None]:
def get_profile_results(config:str,
                        way_type:str,
                        fault:str,
                        speed:int,
                        force:str = "vertical"):
    """Load the same run for every wheel profile.

    Parameters are forwarded to ``get_result``; ``force`` must be
    ``"vertical"`` or ``"side"``.

    Returns a 6-tuple of DataFrames in the fixed order
    gost, greb_26, greb_28, greb_30, newwagonw, greb_24.

    Raises ``ValueError`` for an unsupported ``force`` (the original failed
    with an unbound-local error on the return statement).
    """
    if force not in ("vertical", "side"):
        raise ValueError(f"force must be 'vertical' or 'side', got {force!r}")

    # Order matters: callers label the columns positionally.
    profiles = ("gost", "greb_26", "greb_28", "greb_30", "newwagonw", "greb_24")

    return tuple(get_result(config, way_type, fault, speed, profile, force=force)
                 for profile in profiles)

In [None]:
def plot_profile_results(config:str,way_type:str,fault:str,speed:int, force:str ="vertical"):
    """Plot one run for all six wheel profiles on a single chart.

    The title is built from Russian captions for ``config``, ``way_type`` and
    ``fault``; the y-axis label depends on ``force`` and is left unset for any
    value other than ``"vertical"`` / ``"side"``.
    """
    captions = {"loaded":"Груженый",
                "empty":"Порожний",
                "straight":"прямая",
                "curve_350":"кривая 350 м",
                "curve_650":"кривая 650 м",
                "normal":"без неисправностей",
                "polzun15":"ползун",
                "ellips10":"неравномерный прокат"}
    y_labels = {"vertical":"Вертикальная сила, Н",
                "side":"Боковая сила, Н"}

    # One column per profile, in the order returned by get_profile_results.
    per_profile = get_profile_results(config, way_type,fault,speed, force=force)
    combined = pd.concat(per_profile,axis=1)
    combined.columns = ["gost","greb_26","greb_28","greb_30","newwagonw","greb_24"]

    plt.figure(figsize=(12,8))
    plt.grid(True)
    sns.lineplot(combined)
    plt.title(f"{captions[config]} вагон, {captions[way_type]}, {captions[fault]}, скорость {speed} км/ч ")
    plt.xlabel("Время, с")
    if force in y_labels:
        plt.ylabel(y_labels[force])
    plt.show()
    



In [None]:
# Example: side force for a loaded wagon on the 350 m curve, "ellips10" fault, 60 km/h.
plot_profile_results("loaded","curve_350","ellips10",60,force="side")

In [None]:
def get_speed_results(config:str,
                      way_type:str,
                      fault:str,
                      profile:str = "gost",
                      force:str ="vertical") -> list[pd.DataFrame]:
    """Load one configuration at every simulated speed.

    Speeds run from 10 to 120 km/h in steps of 10; any way type containing
    ``"curve"`` stops at 80 km/h. A ``force`` other than ``"vertical"`` or
    ``"side"`` yields an empty list.

    Returns the DataFrames in ascending speed order.
    """
    top_speed = 80 if "curve" in way_type else 120

    if force not in ("vertical", "side"):
        return []

    return [get_result(config,way_type,fault,profile=profile,speed=v,force=force)
            for v in range(10, top_speed + 10, 10)]

In [None]:
# Smoke check: load all speeds for one configuration.
# NOTE(review): as the last expression this renders every loaded frame in the cell output.
get_speed_results("loaded","straight","normal","greb_24","vertical")

In [None]:
from IPython.display import clear_output

# Value grids enumerated when loading every simulation result:
# wagon load state, track geometry, wheel profile and wheel fault.
wagon_cfg = ["empty","loaded"]
way_cfg = ["straight","curve_350","curve_650"]
wheel_cfg = ["gost", "greb_26","greb_28","greb_30","newwagonw", "greb_24"]
fault_cfg = ["normal","polzun15","ellips10"]

def get_full_calculations(wagon_cfg:list,
                          way_cfg:list,
                          wheel_cfg:list,
                          fault_cfg:list,
                          force:str = "vertical") -> dict:
    """Load every combination into a nested dictionary.

    The result is indexed as ``result[wagon][way][fault][wheel]`` and each
    leaf is the list returned by ``get_speed_results``. The notebook output
    is cleared and the current combination printed before each load.
    """
    everything = {}

    for wagon in wagon_cfg:
        per_way = everything[wagon] = {}
        for way in way_cfg:
            per_fault = per_way[way] = {}
            for fault in fault_cfg:
                per_wheel = per_fault[fault] = {}
                for wheel in wheel_cfg:
                    clear_output(True)
                    print(f"{wagon}\n{way}\n{fault}\n{wheel}\n------")
                    per_wheel[wheel] = get_speed_results(wagon,way,fault,wheel,force=force)

    return everything

In [None]:
# Load every vertical-force result; reads one CSV per combination and speed.
ALL_CALCULATIONS_VERTICAL = get_full_calculations(wagon_cfg,way_cfg,wheel_cfg,fault_cfg,force="vertical")

In [None]:
# Load every side-force result; reads one CSV per combination and speed.
ALL_CALCULATIONS_SIDE = get_full_calculations(wagon_cfg,way_cfg,wheel_cfg,fault_cfg,force="side")

## **Подготовка инфы для разделения на кастомные фолды**

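Так как колесо по ГОСТ имеет радиус $r = 475$ мм, длина пути за один полный оборот колеса равна $fold = 2\pi r = 2\pi \cdot 0{,}475$ м.

Тогда время одного полного оборота колеса при скорости $v$ (в м/с, т.е. скорость в км/ч, делённая на 3,6) равно $t = fold / v$.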

In [None]:
def time_split(v:float, radius:float = 0.475) -> float:
    """Time of one full wheel revolution.

    Parameters
    ----------
    v : train speed in km/h; must be positive.
    radius : wheel radius in metres (0.475 by default, the value used
        throughout this notebook).

    Returns
    -------
    float: circumference ``2*pi*radius`` divided by the speed in m/s.

    Raises
    ------
    ValueError
        If ``v`` is not positive.
    """
    if v <= 0:
        raise ValueError(f"speed must be positive, got {v!r}")

    speed_m_s = v/3.6                    # km/h -> m/s
    circumference = 2*np.pi*radius
    return circumference/speed_m_s

In [None]:
# Quick look at the first frame in the speed list for the empty wagon / straight track / no fault / gost profile.
plt.grid(True)
sns.lineplot(ALL_CALCULATIONS_VERTICAL["empty"]["straight"]["normal"]["gost"][0])
plt.show()

In [None]:
def get_time_splits(data:pd.DataFrame) -> list:
    """Return the time points at which one run is cut into wheel revolutions.

    The speed is parsed from the first column name, which must look like
    ``<config>_straight_<fault>_<speed>_<profile>`` or
    ``<config>_curve_<radius>_<fault>_<speed>_<profile>``. Cut points start
    at 1.4 and advance by the revolution time returned by ``time_split``.

    Raises
    ------
    ValueError
        If the column name does not describe a straight or curve run
        (the original failed with an unbound-local error instead).
    """
    start_point = 1.4

    time_max_point = data.index.max()               # latest time in the run

    name_parts = data.columns[0].split("_")
    way = name_parts[1] if len(name_parts) > 1 else None

    # Curve names carry an extra radius token, which shifts the speed by one position.
    speed_position = {"straight": 3, "curve": 4}.get(way)
    if speed_position is None or len(name_parts) <= speed_position:
        raise ValueError(f"cannot parse speed from column name {data.columns[0]!r}")

    wheel_rotate_time = time_split(int(name_parts[speed_position]))

    # NOTE(review): the fold count subtracts 1 although cutting starts at 1.4,
    # so the last cut point can lie past the end of the data - confirm intent.
    num_folds = int((time_max_point-1)//wheel_rotate_time)

    res = [start_point]
    for _ in range(num_folds):
        start_point += wheel_rotate_time
        res.append(start_point)

    return res

In [None]:
def time_indexes(frames:list[pd.DataFrame]) -> dict:
    """Map each run's speed (as a string) to its list of cut points.

    The speed token is taken by position from the first column name:
    index 3 for ``<config>_straight_<fault>_<speed>_<profile>`` and index 4
    for ``<config>_curve_<radius>_<fault>_<speed>_<profile>``. The original
    scanned for any numeric token and excluded 350/650/24 by hand, which also
    registered flange thicknesses 26/28/30 as bogus "speed" keys.

    Frames whose speed token is missing or not numeric are skipped.
    """
    res = {}

    for frame in frames:
        name_parts = frame.columns[0].split("_")
        on_curve = len(name_parts) > 1 and name_parts[1] == "curve"
        speed_position = 4 if on_curve else 3

        if len(name_parts) > speed_position and name_parts[speed_position].isdigit():
            res[name_parts[speed_position]] = get_time_splits(frame)

    return res

In [None]:
# Inspect the column name of the second frame in the speed list for the greb_24 profile.
ALL_CALCULATIONS_VERTICAL["empty"]["straight"]["normal"]["greb_24"][1].columns

In [None]:
# Check the cut points produced for one configuration; keys are speed strings.
time_indexes(ALL_CALCULATIONS_VERTICAL["empty"]["straight"]["normal"]["greb_24"])

In [None]:
def get_all_time_indexes(calculations:dict[str, dict[str, dict[str, dict[str, list[pd.DataFrame]]]]]) -> dict:
    """Compute cut points for every loaded configuration.

    ``calculations`` is indexed as ``[wagon][way][fault][wheel]`` with a list
    of frames at each leaf. The result has the same four levels, with each
    leaf replaced by the dictionary returned by ``time_indexes``.

    Every level is walked as it actually is, so the input no longer needs
    "empty", "straight" and "normal" entries, nor identical keys in every branch.
    """
    gen_dict = {}

    for wagon, ways in calculations.items():
        gen_dict[wagon] = {}

        for way, faults in ways.items():
            gen_dict[wagon][way] = {}

            for fault, wheels in faults.items():
                gen_dict[wagon][way][fault] = {}

                for wheel, frames in wheels.items():
                    clear_output(True)
                    print(f"{wagon}\n{way}\n{fault}\n{wheel}\n------")
                    gen_dict[wagon][way][fault][wheel] = time_indexes(frames)

    return gen_dict

# Cut points are derived from the vertical-force runs only; later cells reuse
# them for the side-force runs as well.
ALL_TIME_INDEXES = get_all_time_indexes(ALL_CALCULATIONS_VERTICAL)

In [None]:
def get_splitted_dataframe(data:pd.DataFrame,indexes:list) -> pd.DataFrame:
    """Cut one run into per-revolution segments placed side by side.

    Parameters
    ----------
    data : run indexed by time.
    indexes : ascending cut points. Segment ``i`` keeps the rows with
        ``indexes[i] <= t <= indexes[i+1]``; the last segment keeps the rows
        with ``t > indexes[-1]``.

    Returns
    -------
    DataFrame with one column per segment column, aligned on the time index
    (values outside a segment are NaN). Columns form a two-level index:
    the name of the first column of ``data`` and a running number from 0.

    The original seeded the result with a 214-row zero frame, which left
    unrelated row labels 0..213 in the output; segments are now collected and
    concatenated once, so only time stamps from ``data`` appear.
    """
    segments = []
    last = len(indexes) - 1

    for i, left in enumerate(indexes):
        if i < last:
            # Both bounds inclusive: a boundary row belongs to two adjacent segments.
            mask = (data.index>=left) & (data.index<=indexes[i+1])
        else:
            mask = data.index>left
        segments.append(data[mask])

    if not segments:
        return pd.DataFrame(columns=pd.MultiIndex.from_arrays([[], []]))

    df = pd.concat(segments, axis=1)
    num_cols = len(df.columns)

    df.columns = pd.MultiIndex.from_arrays([[data.columns[0]] * num_cols,
                                            list(range(num_cols))])

    return df

In [None]:
# Reference plot: the second frame in the speed list (loaded wagon, straight
# track, no fault, gost profile) before it is cut into wheel revolutions.
plt.figure().set_size_inches(12,8)
plt.grid(True)
sns.lineplot(ALL_CALCULATIONS_VERTICAL["loaded"]["straight"]["normal"]["gost"][1])
plt.title("Не разделенный расчет")
plt.ylabel("Вертикальная сила, Н")
plt.xlabel("Время, с")
plt.show()

**Пример разделения расчета на несколько мелких по обороту колеса**

In [None]:
# Cut the 20 km/h run (list index 1, cut points under key "20") into wheel revolutions.
devided_result = get_splitted_dataframe(ALL_CALCULATIONS_VERTICAL["loaded"]["straight"]["normal"]["gost"][1],
                                       ALL_TIME_INDEXES["loaded"]["straight"]["normal"]["gost"]["20"])

# Same split for the side force. NOTE(review): this frame is not used in the plot below.
devided_result_ = get_splitted_dataframe(ALL_CALCULATIONS_SIDE["loaded"]["straight"]["normal"]["gost"][1],
                                       ALL_TIME_INDEXES["loaded"]["straight"]["normal"]["gost"]["20"])

# Each segment is drawn as its own line; axis limits zoom into the first seconds.
plt.figure().set_size_inches(12,8)
plt.grid(True)
plt.plot(devided_result)
plt.ylabel("Вертикальная сила, Н")
plt.xlabel("Время, с")
plt.title("Разделение одного расчета по полному обороту колеса")  # typo "Раздедение" fixed
plt.xlim(0,5)
plt.ylim(0,225000)
plt.show()

In [None]:
from scipy.stats import skew, kurtosis

def get_skew_kurt(data:pd.DataFrame) -> pd.DataFrame:
    """Skewness and kurtosis of every column, ignoring NaN values.

    Returns a frame with rows ``"skew"`` and ``"kurt"`` and the same columns
    as ``data``. Values come from ``scipy.stats.skew`` and
    ``scipy.stats.kurtosis`` with their default settings.
    """
    cols = data.columns

    # Missing values are dropped per column before the statistics are taken.
    samples = [data[col].dropna().to_numpy() for col in cols]

    stats = {"skew": [skew(sample) for sample in samples],
             "kurt": [kurtosis(sample) for sample in samples]}

    return pd.DataFrame(stats, index=cols).T

In [None]:
def get_description(data:pd.DataFrame) -> pd.DataFrame:
    """Per-column feature table: ``describe()`` plus variance, sum, skew and kurtosis.

    Rows are the ``describe()`` statistics followed by ``"var"``, ``"sum"``,
    ``"skew"`` and ``"kurt"``; columns are those of ``data``.
    """
    # Variance and sum as two extra rows labelled "var" and "sum".
    extra_rows = pd.concat({"var": data.var(), "sum": data.sum()}, axis=1).T

    return pd.concat([data.describe(), extra_rows, get_skew_kurt(data)], axis=0)

In [None]:
from IPython.display import clear_output

def make_frame_from_splits(calculations:dict,time_indexes:dict) -> pd.DataFrame:
    """Cut every run into wheel revolutions and collect their features side by side.

    ``calculations`` and ``time_indexes`` are the nested dictionaries indexed
    as ``[wagon][way][fault][wheel]``; the leaves are a list of frames and a
    speed -> cut points mapping respectively.

    Returns one wide frame: rows are the statistics from ``get_description``,
    columns are the (run name, segment number) pairs of every run.
    NOTE(review): the seed columns labelled 1..11 are still present in the
    result (only label 0 is dropped here); ``make_pretty_df`` removes them.
    """
    
    # The key sets of every level are read from one reference branch, so all
    # branches are assumed to share the same keys.
    wagon_cfg = calculations.keys()
    way_cfg = calculations["empty"].keys()
    fault_cfg = calculations["empty"]["straight"].keys()
    wheel_cfg = calculations["empty"]["straight"]["normal"].keys()
    speed_cfg = time_indexes["empty"]["straight"]["normal"]["gost"].keys()
    lenght = len(calculations["empty"]["straight"]["normal"]["gost"])
    
    # 1 x 12 zero frame used as the seed for the concatenation below.
    zeros = np.zeros((1,12))
    common_df = pd.DataFrame(zeros)

    n = 0

    for wagon in wagon_cfg:
        for way in way_cfg:
            for fault in fault_cfg:
                for wheel in wheel_cfg:
                    # `l` is the position in the speed list, `speed` the matching key.
                    for l,speed in zip(range(lenght),speed_cfg):
                        
                        # Skip list positions above 7 with speeds above 80 on curves.
                        if "curve" in way:
                            if l > 7 and int(speed) > 80:
                                continue
                        
                        splitted_df = get_splitted_dataframe(calculations[wagon][way][fault][wheel][l],
                                                            time_indexes[wagon][way][fault][wheel][speed])
                        
                        feats = get_description(splitted_df)

                        # NOTE(review): concatenating inside the loop re-copies the
                        # accumulated frame on every iteration.
                        common_df = pd.concat([common_df,feats], axis=1)

                        clear_output(wait=True)
                        print(f"Сделано: {n}")
                        n+=1
                       
    
    # Remove the seed row (label 0) and the first seed column (label 0).
    df = common_df.drop(0, axis=0).drop(0,axis=1)

    return df                   

In [None]:
# RUN THESE CELLS

# Build the per-revolution feature table for the vertical force.
df_vertical = make_frame_from_splits(ALL_CALCULATIONS_VERTICAL,ALL_TIME_INDEXES)


In [None]:
def make_pretty_df(data:pd.DataFrame, file_name:str, save:bool):
    """Tidy the wide feature frame and optionally save it as parquet.

    Works on a copy of ``data``: drops the columns labelled 1..11, turns the
    remaining tuple labels into a ``MultiIndex`` and transposes, so each row
    is one (run name, segment number) pair and each column one statistic.

    When ``save`` is true the transposed frame is written to ``file_name``.
    Returns the transposed frame.
    """
    placeholder_labels = list(range(1, 12))

    trimmed = data.copy().drop(placeholder_labels, axis=1)
    trimmed.columns = pd.MultiIndex.from_tuples(trimmed.columns)

    result = trimmed.T

    if save:
        result.to_parquet(f"{file_name}")

    return result

In [None]:
# RUN THESE CELLS; SAVING IS ALREADY DONE INSIDE THE FUNCTION

DF_VERTICAL = make_pretty_df(df_vertical,"data_new_raw_vertical.parquet",True)


In [None]:
# Side-force feature table, cut with the cut points stored in ALL_TIME_INDEXES,
# then tidied and written to parquet.
df_side = make_frame_from_splits(ALL_CALCULATIONS_SIDE,ALL_TIME_INDEXES)
DF_SIDE = make_pretty_df(df_side,"data_new_raw_side.parquet",True)

## Подгрузка сохраненных результатов

In [2]:
# Reload the saved feature tables so the cells below do not need the raw simulation files.
# (leftover from an earlier version: DF = df_)
DF_SIDE = pd.read_parquet("data_new_raw_side.parquet")
DF_VERTICAL = pd.read_parquet("data_new_raw_vertical.parquet")

In [3]:
# Preview the side-force features: one row per (run name, segment number).
DF_SIDE.head()

Unnamed: 0,Unnamed: 1,25%,50%,75%,count,kurt,max,mean,min,skew,std,sum,var
empty_straight_normal_10_gost,0,-11091.2945,-7695.0806,-5090.5459,215.0,-0.233897,4572.9741,-7727.353586,-16595.238,0.312691,4381.332844,-1661381.0,19196080.0
empty_straight_normal_10_gost,1,-11988.8725,-8809.9502,-6249.2637,215.0,-0.960363,-2141.2126,-9108.305539,-16471.59,-0.236408,3683.999205,-1958286.0,13571850.0
empty_straight_normal_10_gost,2,-14041.421,-10742.788,-7864.0198,215.0,-0.813141,-1299.6753,-11051.330162,-20610.469,-0.120031,3959.602067,-2376036.0,15678450.0
empty_straight_normal_10_gost,3,-12863.902,-9595.0107,-6922.1643,215.0,-0.622037,139.41061,-9705.199496,-19541.955,0.042085,3892.656564,-2086618.0,15152780.0
empty_straight_normal_10_gost,4,-12899.497,-10761.991,-8761.3857,215.0,-0.390614,-3988.2886,-11032.573618,-18987.34,-0.260762,3119.499045,-2372003.0,9731274.0


In [4]:
# Preview the vertical-force features: one row per (run name, segment number).
DF_VERTICAL.head()

Unnamed: 0,Unnamed: 1,25%,50%,75%,count,kurt,max,mean,min,skew,std,sum,var
empty_straight_normal_10_gost,0,126248.195,127004.93,127457.27,215.0,0.712095,128489.06,126760.678,123032.87,-0.803691,916.289112,27253545.77,839585.7
empty_straight_normal_10_gost,1,127451.105,127666.25,128031.975,215.0,-0.154321,128716.65,127687.467349,126435.99,-0.283763,439.892909,27452805.48,193505.8
empty_straight_normal_10_gost,2,127705.36,128518.85,130617.665,215.0,-1.200553,132194.3,129000.508791,126678.78,0.375457,1505.297634,27735109.39,2265921.0
empty_straight_normal_10_gost,3,126194.69,127737.66,128519.41,215.0,-0.551838,131944.66,127570.362279,124110.58,0.091421,1621.369411,27427627.89,2628839.0
empty_straight_normal_10_gost,4,126548.895,126879.16,127545.775,215.0,0.913185,129705.06,127062.812,125834.34,1.052079,714.953224,27318504.58,511158.1


In [5]:
def delete_unvalid_cols(data:pd.DataFrame) -> pd.DataFrame:
    """Drop degenerate rows: those with fewer than three distinct non-NaN values.

    Despite the name, the filter applies to the rows of ``data``. The
    original transposed the frame, looped over its columns in Python and
    transposed back, which also turned mixed dtypes into ``object``.
    Counting distinct values per row directly keeps the column dtypes.

    Returns a new frame; ``data`` is not modified.
    """
    distinct_per_row = data.nunique(axis=1)

    # Positional boolean mask, so duplicate index labels cannot misalign it.
    keep = (distinct_per_row >= 3).to_numpy()

    return data.loc[keep].copy()


# Filter both feature tables in place of the loaded ones.
# NOTE(review): the names are rebound, so the unfiltered frames are only
# recoverable by re-running the loading cell.
DF_SIDE = delete_unvalid_cols(DF_SIDE)
DF_VERTICAL = delete_unvalid_cols(DF_VERTICAL)

DF_VERTICAL.head(15)
# print(df.shape)

Unnamed: 0,Unnamed: 1,25%,50%,75%,count,kurt,max,mean,min,skew,std,sum,var
empty_straight_normal_10_gost,0,126248.195,127004.93,127457.27,215.0,0.712095,128489.06,126760.678,123032.87,-0.803691,916.289112,27253550.0,839585.7
empty_straight_normal_10_gost,1,127451.105,127666.25,128031.975,215.0,-0.154321,128716.65,127687.467349,126435.99,-0.283763,439.892909,27452810.0,193505.8
empty_straight_normal_10_gost,2,127705.36,128518.85,130617.665,215.0,-1.200553,132194.3,129000.508791,126678.78,0.375457,1505.297634,27735110.0,2265921.0
empty_straight_normal_10_gost,3,126194.69,127737.66,128519.41,215.0,-0.551838,131944.66,127570.362279,124110.58,0.091421,1621.369411,27427630.0,2628839.0
empty_straight_normal_10_gost,4,126548.895,126879.16,127545.775,215.0,0.913185,129705.06,127062.812,125834.34,1.052079,714.953224,27318500.0,511158.1
empty_straight_normal_10_gost,5,126508.655,127269.22,128522.04,215.0,-0.424134,131159.64,127612.129488,125634.27,0.714055,1345.966395,27436610.0,1811626.0
empty_straight_normal_10_gost,6,127270.655,131287.77,132180.03,215.0,-0.917315,134591.02,130195.758093,124305.92,-0.573006,2765.339186,27992090.0,7647101.0
empty_straight_normal_10_gost,7,123335.995,124113.06,124728.555,215.0,0.19034,127695.05,124161.085581,121648.42,0.594783,1204.612356,26694630.0,1451091.0
empty_straight_normal_10_gost,8,122561.4125,124238.14,124826.245,214.0,-0.317501,128177.18,123816.053645,119859.16,-0.378228,1596.119046,26496640.0,2547596.0
empty_straight_normal_10_gost,9,122923.54,124045.73,125428.21,215.0,6.953195,134962.42,124085.049907,114752.05,0.364874,2101.677845,26678290.0,4417050.0


# Пайплайн для обучения первой модельки 

*Предварительные результаты*

In [None]:
def new_str(value:str):
    """Glue the two-token parts of a curve run name into single tokens.

    For names containing ``"curve"`` the radius is merged into the way type
    (``curve_650`` -> ``curve650``). If the name also contains ``"greb"``,
    the flange thickness is merged into the profile (``greb_30`` ->
    ``greb30``), e.g. ``loaded_curve_650_normal_30_greb_30`` ->
    ``loaded_curve650_normal_30_greb30``.

    Names without ``"curve"`` are returned unchanged, including straight-track
    names with a ``greb_NN`` profile.
    """
    has_greb = "greb" in value

    if "curve" not in value:
        return value

    tokens = value.split("_")

    # Merge "<way>_<radius>": remove the radius text everywhere, then re-attach it to the way type.
    way, radius = tokens[1], tokens[2]
    value = value.replace(radius,"").replace(way+"_",way+radius)

    if not has_greb:
        return value

    # Same trick for "<greb>_<thickness>".
    greb, thickness = tokens[5], tokens[6]
    value = value.replace(thickness,"").replace(greb+"_",greb+thickness)

    # Removing "30" also wiped a speed of 30; put that token back in place.
    if thickness == "30" and tokens[4] == "30":
        pieces = value.split("_")
        if "" in pieces:
            pieces.remove("")
        pieces.insert(3,"30")
        value = "_".join(pieces)

    return value

# Example call; the radius and the flange thickness get merged into their neighbouring tokens.
new_str("loaded_curve_650_normal_30_greb_30")

In [None]:
# Move the run name out of the index into a "level_0" column and drop the segment number.
# NOTE(review): `df` and `df_` are not assigned in any cell above (only DF_SIDE /
# DF_VERTICAL are), so this fails on a fresh kernel - confirm which table each comes from.
# NOTE(review): the cell rebinds its own inputs, so running it twice fails.
df = df.reset_index().drop(["level_1"], axis=1)
df_ = df_.reset_index().drop(["level_1"], axis=1)

df_.head(15)

In [None]:
# Normalise run names so that a later split on "_" gives one token per attribute for curve runs.
df["level_0"] = df["level_0"].map(new_str)
df_["level_0"] = df_["level_0"].map(new_str)

In [None]:
# Inspect the result. NOTE(review): shows the whole frame; .head() would keep the output short.
df_

In [None]:
# Split the run name into up to six tokens. Straight-track "greb_NN" names are
# left untouched by new_str, so they are the ones that produce a sixth token.
new_cols = [f"new_col_{i}" for i in range(6)]

df[new_cols] = df['level_0'].str.split("_", expand=True)
df_[new_cols] = df_['level_0'].str.split("_", expand=True)

In [None]:
# Inspect the new token columns.
df_

In [None]:
# The run name is now redundant with the token columns.
df = df.drop("level_0", axis=1)
df_ = df_.drop("level_0", axis=1)
df_

In [None]:
# Drop the sixth token.
# NOTE(review): for straight-track "greb_NN" runs this token is the flange
# thickness, so those rows keep only "greb" as their profile - confirm this is intended.
df = df.drop("new_col_5", axis=1)
df_ = df_.drop("new_col_5", axis=1)
df_

In [None]:
# The first twelve columns are the statistics; their order feeds the hard-coded list in the next cell.
df.columns[:12]

In [None]:
# Final column names: twelve statistics followed by the five run attributes.
# NOTE(review): names are assigned by position, so the order must match the frame
# exactly - the fault token is the one that becomes "target".
columns = ['25%', '50%', '75%', 'count', 'kurt', 'max', 'mean', 'min', 'skew','std', 'sum', 'var'] + ["wagon_cfg","way_cfg","target","speed","profile"]

df.columns=columns
df_.columns=columns

df_

In [None]:
# One-hot encode the categorical attributes, dropping the first level of each.
# `df` also encodes the profile; `df_` keeps "profile" as a column because it
# becomes the target of the second model.
df = pd.get_dummies(df,columns=["wagon_cfg","way_cfg","profile"],drop_first=True)
df_ = pd.get_dummies(df_,columns=["wagon_cfg","way_cfg"],drop_first=True)

In [None]:
# Inspect the encoded frame.
df_

In [None]:
# The speed token came out of a string split; convert it to an integer feature.
df["speed"] = df["speed"].astype(int)
df_["speed"] = df_["speed"].astype(int)

In [None]:
def binarize_target(string:str):
    """Return 0 for labels treated as healthy and 1 for everything else.

    Healthy labels: ``normal``, ``newwagonw``, ``gost``, ``greb30``,
    ``greb28`` and ``greb``.
    """
    healthy_labels = ("normal", "newwagonw", "gost", "greb30", "greb28", "greb")
    return 0 if string in healthy_labels else 1

def encode_target(string:str):
    """Map a fault or profile label to a class number 0, 1 or 2.

    0: ``normal``, ``newwagonw``, ``gost``
    1: ``polzun15``, ``greb28``, ``greb30``, ``greb``
    2: ``ellips10``, ``greb26``, ``greb24``

    Any other label returns ``None``.
    """
    classes = (
        (0, ("normal", "newwagonw", "gost")),
        (1, ("polzun15", "greb28", "greb30", "greb")),
        (2, ("ellips10", "greb26", "greb24")),
    )

    for code, labels in classes:
        if string in labels:
            return code

    return None

In [None]:
# Fault model: three-class target (0 / 1 / 2) from the fault label.
df["target"] = df["target"].map(encode_target)
# df_["target"] = df_["target"].map(encode_target)
# Profile model: binary target from the wheel profile label.
df_["profile"] = df_["profile"].map(binarize_target)
df_

In [None]:
# Check the column dtypes before modelling.
df_.dtypes

In [None]:
# Correlation heatmap over all columns of the fault-model frame.
plt.figure().set_size_inches(12,8)
sns.heatmap(df.corr(),annot=True)
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def save_results(estimator,X_test,y_test, path:str = "stat_results.csv"):
    """Append the test metrics of a fitted pipeline to the results CSV.

    Parameters
    ----------
    estimator : fitted pipeline with at least two steps; the names of the
        first two steps are recorded.
    X_test, y_test : hold-out features and labels.
    path : CSV file to update (defaults to the original ``stat_results.csv``).

    Returns
    -------
    DataFrame with all rows now stored in the file. The original returned
    the result of ``to_csv``, i.e. always ``None``.

    The file is created if it does not exist yet, and stale index columns
    ("Unnamed: ...") from earlier saves are discarded instead of requiring
    exactly one "Unnamed: 0" column.
    """
    ave = "micro"
    y_pred = estimator.predict(X_test)

    # "presicion" is misspelled on purpose: it is the column name already
    # present in previously saved files.
    line = pd.DataFrame({"step1":[estimator.steps[0][0]],
                         "step2":[estimator.steps[1][0]],
                         "test_size":[X_test.shape[0]],
                         "presicion":[precision_score(y_test,y_pred,average=ave)],
                         "recall":[recall_score(y_test,y_pred,average=ave)],
                         "f1_score":[f1_score(y_test,y_pred,average=ave)]})

    if os.path.exists(path):
        previous = pd.read_csv(path)
        previous = previous.loc[:, ~previous.columns.str.startswith("Unnamed")]
        updated_stats = pd.concat([previous,line],axis=0,ignore_index=True)
    else:
        updated_stats = line

    # Written with the index, as before, so the file layout stays the same.
    updated_stats.to_csv(path)

    return updated_stats

# Обучение моделей диагностирования неисправностей

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator; train_test_split below is called without
# random_state, so it draws from this generator.
np.random.seed(32)

# Features for the fault model: drop the target, the wagon-load dummy, every
# wheel-profile dummy, and the "count" / "sum" statistics.
X = df.drop(["target",
             "wagon_cfg_loaded",
             "profile_greb",
             "profile_greb24",
             "profile_greb26",
             "profile_greb28",
             "profile_greb30",
             "profile_newwagonw",
             "count","sum"],axis=1)

y = df["target"]

# NOTE(review): rows are wheel-revolution segments of the same runs; a random
# split puts segments of one run in both train and test - confirm this is acceptable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=True)

## XGBoost

In [None]:
# Standardised features into XGBoost with the linear booster.
xgb_pipe = Pipeline([("StandardScaler",StandardScaler()),("XGBClassifier",XGBClassifier(booster="gblinear"))])

xgb_pipe.fit(X_train,y_train)

# Appends a row to stat_results.csv on every run of this cell.
save_results(xgb_pipe,X_test,y_test)

print(classification_report(y_test,xgb_pipe.predict(X_test)))

In [None]:
import catboost
import lightgbm
import xgboost
import sklearn

def _plot_importance_bar(values, feature_names, title):
    """Draw one bar chart of absolute importances for a single coefficient vector."""
    df = pd.DataFrame(values).T
    df.columns = feature_names
    df.index = ["Степень важности"]
    plt.figure().set_size_inches(12,2)
    plt.title(title)
    sns.barplot(abs(df))
    plt.xticks(rotation=45)
    plt.show()


def plot_feature_importance(estimator):
    """Plot feature importances of the final model of a fitted pipeline.

    ``estimator`` is a fitted pipeline whose last step is the model and whose
    preceding steps expose ``get_feature_names_out``.

    * XGBoost models use ``coef_`` (linear booster) and fall back to
      ``feature_importances_`` when coefficients are not defined.
    * Any other model uses ``feature_importances_`` if present, else ``coef_``.

    A 2-D array with two or three rows gives one chart per class; a 1-D array
    gives a single chart; any other shape only prints its length.

    Fixes over the original: the CatBoost/LightGBM test was
    ``type(...) == A or B``, which is always true, so the ``coef_`` branch for
    other models could never run; 1-D vectors with three or fewer features were
    treated as per-class rows; and every 1-D chart was titled "Catboost".

    Raises ``TypeError`` if the model exposes neither attribute.
    """
    model = estimator[-1]

    if isinstance(model, xgboost.sklearn.XGBClassifier):
        try:
            coefs = model.coef_
        except AttributeError:
            # coef_ is only defined for the linear booster.
            coefs = model.feature_importances_
    elif hasattr(model, "feature_importances_"):
        coefs = model.feature_importances_
    elif hasattr(model, "coef_"):
        coefs = model.coef_
    else:
        raise TypeError(f"{type(model).__name__} exposes neither feature_importances_ nor coef_")

    coefs = np.asarray(coefs)
    feature_names = estimator[:-1].get_feature_names_out()

    # Russian captions (genitive case) used in the per-class titles.
    class_captions = {3: {0:"исправного колеса",
                          1:"ползуна",
                          2:"неравномерного проката"},
                      2: {0:"исправного вагона",
                          1:"неисправного вагона"}}

    if coefs.ndim == 1:
        model_names = {"CatBoostClassifier":"Catboost",
                       "LGBMClassifier":"LightGBM",
                       "XGBClassifier":"XGBoost"}
        model_name = model_names.get(type(model).__name__, type(model).__name__)
        _plot_importance_bar(coefs, feature_names,
                             f"Коэффициенты важности признаков для предсказания неисправностей модели {model_name}")

    elif coefs.ndim == 2 and coefs.shape[0] in class_captions:
        captions = class_captions[coefs.shape[0]]
        for i, row in enumerate(coefs):
            _plot_importance_bar(row, feature_names,
                                 f"Коэффициенты важности признаков для предсказания {captions[i]}")

    else:
        print(len(coefs))
        
# Coefficients of the linear XGBoost model.
plot_feature_importance(xgb_pipe)

## Catboost

In [None]:
from catboost import CatBoostClassifier

# Standardised features into CatBoost with default settings.
cat_pipe = Pipeline([("StandardScaler",StandardScaler()),("CatBoostClassifier",CatBoostClassifier())])

cat_pipe.fit(X_train,y_train)


In [None]:
# Hold-out metrics for CatBoost.
print(classification_report(y_test,cat_pipe.predict(X_test)))

# Appends a row to stat_results.csv on every run of this cell.
save_results(cat_pipe,X_test,y_test)

In [None]:
# Feature importances of the CatBoost model.
plot_feature_importance(cat_pipe)

## LightGBM

In [None]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import Normalizer

# Standardised features into LightGBM with default settings.
lgbm_pipe = Pipeline([("StandardScaler",StandardScaler()),("LGBMClassifier",LGBMClassifier())])

lgbm_pipe.fit(X_train,y_train)

print(classification_report(y_test,lgbm_pipe.predict(X_test)))

# Appends a row to stat_results.csv on every run of this cell.
save_results(lgbm_pipe,X_test,y_test)

In [None]:
# Feature importances of the LightGBM model.
plot_feature_importance(lgbm_pipe)

## Сводка результатов

In [None]:
def show_stat_results(path:str = "stat_results.csv"):
    """Read the accumulated model metrics without stale index columns.

    Parameters
    ----------
    path : CSV file written by ``save_results`` (defaults to the original
        ``stat_results.csv``).

    Returns
    -------
    DataFrame with every column whose name contains "Unnamed" removed.
    The leftover debug ``print("in")`` is gone, and such columns are now
    removed even when "Unnamed: 0" itself is absent.
    """
    data = pd.read_csv(path)

    keep = [c for c in data.columns if "Unnamed" not in c]

    return data[keep]

# Metrics table accumulated so far by save_results.
show_stat_results()

In [None]:
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve for CatBoost, scored with the predicted probability of class 1.
# NOTE(review): roc_curve expects a binary target, while y_test comes from
# encode_target, which yields 0 / 1 / 2 - confirm which target this cell was run with.
fpr,tpr,thresholds = roc_curve(y_true=y_test,y_score=cat_pipe.predict_proba(X_test)[:,1])

# NOTE(review): the AUC is computed from hard predictions, not from the
# probabilities used for the curve, so it will not match the plotted curve.
auc_roc = roc_auc_score(y_true=y_test,y_score=cat_pipe.predict(X_test))

display = RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=auc_roc, estimator_name="Classifier")

display.plot()
plt.show()

In [None]:
# Same ROC plot for the XGBoost pipeline.
# NOTE(review): same caveats as the CatBoost cell - binary-only roc_curve and
# an AUC computed from hard predictions instead of probabilities.
fpr,tpr,thresholds = roc_curve(y_true=y_test,y_score=xgb_pipe.predict_proba(X_test)[:,1])

auc_roc = roc_auc_score(y_true=y_test,y_score=xgb_pipe.predict(X_test))

display = RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=auc_roc, estimator_name="XGB Classifier")

display.plot()
plt.show()

**PCA TSNE**

In [None]:
from sklearn.manifold import TSNE

# Two-dimensional t-SNE embedding of the unscaled feature matrix X.
# NOTE(review): random initialisation without random_state - the embedding changes between runs.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(X)

X_embedded

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

# Initializing Classifiers
# Classifier fitted on the 2-D embedding purely for the decision-region plot.
clf = XGBClassifier()


# Loading some example data

# print(X)
# print(y)

# Plotting Decision Regions

# NOTE(review): `gs` is only referenced by the commented-out subplot line below.
gs = gridspec.GridSpec(2, 2)
# NOTE(review): set_size_inches returns None, so `fig` is None until reassigned in the loop.
fig = plt.figure().set_size_inches(12,8)

labels = ['Предсказания модели XGboost']
          

# zip stops at the shortest input, so the loop runs once, with grd == (0, 0).
for clf, lab, grd in zip([clf],
                         labels,
                         itertools.product([0, 1],
                         repeat=2)):
    # Fitted on all embedded points; this model is used for visualisation only.
    clf.fit(X_embedded, y)
    
    # ax = plt.subplot(gs[grd[0], grd[1]])
    # NOTE(review): plots the single point (grd[0], grd[1]) without a marker - looks like a leftover.
    plt.plot(grd[0],grd[1])
    fig = plot_decision_regions(X=X_embedded, y=y.to_numpy(),
                                clf=clf, legend=1)
    plt.title(lab)

plt.show()

# Обучение моделей для диагностирования толщины гребней

In [None]:
# One-hot encode the fault label so it can serve as a feature of the profile model.
# The original passed "target" positionally, which is the `prefix` argument of
# get_dummies, not `columns`; it only worked because "target" was the sole
# non-numeric column left. Naming the column makes the intent explicit.
df_ = pd.get_dummies(df_,columns=["target"],drop_first=True)
df_

In [None]:
from sklearn.model_selection import train_test_split

# df_["target"] = df_["target"].map(encode_target)


# Features for the profile model: drop the quartiles, "sum", "count" and the target itself.
X_ = df_.drop(["25%","50%","75%","sum","profile","count"],axis=1)

# Binary target produced earlier by binarize_target.
y_ = df_["profile"]

# NOTE(review): no random_state here; reproducibility relies on the state of
# NumPy's global generator at the time this cell runs.
X_train_,X_test_,y_train_,y_test_ = train_test_split(X_,y_,test_size=0.2,shuffle=True)

## XGboost

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Profile model: standardised features into XGBoost with the linear booster.
profile_xgb_pipe = Pipeline([("StandardScaler",StandardScaler()),("XGBClassifier",XGBClassifier(booster="gblinear"))])
profile_xgb_pipe.fit(X_train_,y_train_)

print(classification_report(y_test_, profile_xgb_pipe.predict(X_test_)))

plot_feature_importance(profile_xgb_pipe)

# Appends a row to stat_results.csv on every run of this cell.
save_results(profile_xgb_pipe,X_test_,y_test_)

## Catboost

In [None]:
# Profile model: standardised features into CatBoost with default settings.
profile_cat_pipe = Pipeline([("StandardScaler",StandardScaler()),("CatBoostClassifier",CatBoostClassifier())])

profile_cat_pipe.fit(X_train_,y_train_)

print(classification_report(y_test_, profile_cat_pipe.predict(X_test_)))

plot_feature_importance(profile_cat_pipe)

# Appends a row to stat_results.csv on every run of this cell.
save_results(profile_cat_pipe,X_test_,y_test_)

In [None]:
# NOTE(review): repeats the report already printed in the previous cell.
print(classification_report(y_test_, profile_cat_pipe.predict(X_test_)))

## LightGBM

In [None]:
# Profile model: standardised features into LightGBM with default settings.
profile_lgbm_pipe = Pipeline([("StandardScaler",StandardScaler()),("LGBMClassifier",LGBMClassifier())])

profile_lgbm_pipe.fit(X_train_,y_train_)

print(classification_report(y_test_,profile_lgbm_pipe.predict(X_test_)))

# Appends a row to stat_results.csv on every run of this cell.
save_results(profile_lgbm_pipe,X_test_,y_test_)

In [None]:
# Feature importances of the LightGBM profile model.
plot_feature_importance(profile_lgbm_pipe)

In [None]:
# Final metrics table, covering both the fault models and the profile models.
show_stat_results()