In [165]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from math import sqrt

## Функции

In [136]:
def columns_without_nan(df):
    """Return the labels of the columns of ``df`` that contain no NaN at all."""
    fully_populated = df.notna().all(axis=0)
    return df.loc[:, fully_populated].columns

def columns_with_nan(df):
    """Return the labels of the columns of ``df`` that contain at least one NaN."""
    has_missing = df.isna().any(axis=0)
    return df.loc[:, has_missing].columns

def numerical_columns(df):
    """Return a list of the quantitative columns of ``df``.

    A column counts as quantitative when its dtype is anything other than
    ``object`` (the check is ``dtype != 'O'``).
    """
    selected = []
    for name in df.columns:
        if df[name].dtypes != 'O':
            selected.append(name)
    return selected

def categorial_columns(df):
    """Return a list of the categorical columns of ``df``.

    A column counts as categorical when its dtype is ``object``
    (the check is ``dtype == 'O'``).
    """
    selected = []
    for name in df.columns:
        if df[name].dtypes == 'O':
            selected.append(name)
    return selected

def nan_percent_table(data, col: str):
    """Build a table with the share of filled values per parameter and cluster.

    Rows are the columns of ``data`` that contain at least one NaN (as returned
    by ``columns_with_nan``); columns are the unique values of ``data[col]``.
    Every cell is a number in the range 0 <= x <= 1:

    * 0 - all values of that parameter are missing in that cluster;
    * 1 - all values are filled;
    * otherwise - the fraction of filled values.
    """
    table = pd.DataFrame(index=columns_with_nan(data), columns=data[col].unique())
    for cluster in table.columns:
        subset = data[data[col] == cluster]
        total = subset.shape[0]
        for param in table.index:
            missing = subset[subset[param].isna()].shape[0]
            table.loc[param, cluster] = 1 - missing / total
    return table

def RMSE0(y_predict, y_absolut):
    """Root-mean-square error over the positions where both values are present.

    The two sequences are compared position by position (not by pandas index
    label). A pair is skipped when either value is missing (NaN/None), and
    skipped pairs do not count towards the divisor.

    Parameters
    ----------
    y_predict : sequence of numbers (list, array or Series)
        Predicted values; its length defines how many positions are compared.
    y_absolut : sequence of numbers (list, array or Series)
        Reference values; must be at least as long as ``y_predict``. Extra
        trailing elements are ignored.

    Returns
    -------
    float
        ``sqrt(sum((actual - predicted) ** 2) / n_valid)``, or NaN when there
        is no position at which both values are present (this includes empty
        input).

    Raises
    ------
    ValueError
        If ``y_absolut`` is shorter than ``y_predict``.
    """
    # Going through a float64 Series turns None and object-dtype input into NaN.
    predicted = pd.Series(list(y_predict), dtype="float64").to_numpy()
    actual = pd.Series(list(y_absolut), dtype="float64").to_numpy()
    if len(actual) < len(predicted):
        raise ValueError(
            f"y_absolut has {len(actual)} elements, fewer than y_predict ({len(predicted)})"
        )
    actual = actual[:len(predicted)]

    valid = ~(np.isnan(predicted) | np.isnan(actual))
    n_valid = int(valid.sum())
    if n_valid == 0:
        # Nothing to compare: report "undefined" instead of dividing by zero.
        return float("nan")
    squared_error = float(np.sum((actual[valid] - predicted[valid]) ** 2))
    return sqrt(squared_error / n_valid)

def cleaned_data(cluster, df):
    """Return the rows of one cluster with every NaN removed.

    Only the columns of ``df`` that contain a NaN somewhere in the whole frame
    are touched. For each of them, within the selected cluster:

    * if every value is missing, the column is dropped;
    * otherwise the missing values are replaced by the column median computed
      on this cluster's rows.

    Parameters
    ----------
    cluster : value compared against ``df['type']`` to select the rows.
    df : pandas.DataFrame with a ``'type'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the cleaned rows of ``cluster``.

    Raises
    ------
    KeyError
        If no row of ``df`` belongs to ``cluster``.
    """
    # Explicit copy: the result is written to below, so it must not be a slice
    # of ``df`` (avoids chained assignment / SettingWithCopyWarning).
    df_i = df[df['type'] == cluster].copy()
    if df_i.empty:
        raise KeyError(cluster)

    nan_columns = df.columns[df.isna().any(axis=0).to_numpy()]
    for column in nan_columns:
        values = df_i[column]
        if values.isna().all():
            df_i = df_i.drop(columns=column)
        else:
            df_i[column] = values.fillna(values.median())
    return df_i

def prediction(input_data, models_matrix, y_cols, cluster):
    """Return the model predictions for the rows of one cluster.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature rows of the cluster. Must contain the first three labels of
        ``y_cols`` and every feature listed in the models' parameter lists.
    models_matrix : pandas.DataFrame
        Indexed by target name with one column per cluster; a non-null cell
        holds a ``[model, params]`` pair where ``params`` is the list of
        feature columns passed to ``model.predict``.
    y_cols : sequence of column labels
        Output columns. The first three are copied from ``input_data``
        unchanged; the remaining ones are prediction targets.
    cluster : column label of ``models_matrix`` to take the models from.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``input_data`` with columns ``y_cols``. Targets without a
        model for this cluster stay NaN.
    """
    y_cols = list(y_cols)
    id_cols, target_cols = y_cols[:3], y_cols[3:]

    # Targets for which this cluster has a model.
    has_model = models_matrix[cluster].notna().to_numpy()
    available = set(models_matrix.index[has_model])

    predict = pd.DataFrame(index=input_data.index, columns=y_cols)
    if id_cols:
        # One column assignment instead of a per-cell Python loop.
        predict[id_cols] = input_data[id_cols]
    if len(input_data) == 0:
        # Nothing to predict; estimators typically reject empty input.
        return predict

    for target in target_cols:
        if target in available:
            model, params = models_matrix.loc[target, cluster]
            predict[target] = model.predict(input_data[params])
    return predict

## Чтение и первичный просмотр данных

In [137]:
def _with_n1_dummies(frame):
    """Cast ``n1_modifier`` to object and append its one-hot columns (prefix ``n1``)."""
    frame = frame.assign(n1_modifier=frame['n1_modifier'].astype('object'))
    dummies = pd.get_dummies(frame['n1_modifier'], prefix='n1')
    return pd.concat([frame, dummies], axis=1)


# Read the feature and target tables; the features additionally get one-hot
# columns for n1_modifier.
X_train = _with_n1_dummies(pd.read_csv('X_train.csv'))
X_test = _with_n1_dummies(pd.read_csv('X_test.csv'))
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')
# Column labels of the raw target file, remembered before y_train is reshaped.
y_base_cols = y_train.columns
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47794 entries, 0 to 47793
Data columns (total 61 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   engine_id        47794 non-null  object 
 1   aircraft_id      47794 non-null  object 
 2   flight_datetime  47794 non-null  object 
 3   flight_phase     47794 non-null  object 
 4   engine_position  47794 non-null  int64  
 5   n1_modifier      47794 non-null  object 
 6   number_blades    47794 non-null  float64
 7   engine_family    47794 non-null  object 
 8   engine_type      47794 non-null  object 
 9   manufacturer     47794 non-null  object 
 10  ZHPTAC           11957 non-null  float64
 11  ZLPTAC           11957 non-null  float64
 12  ZPCN12           47794 non-null  float64
 13  ZPCN25           47794 non-null  float64
 14  ZPHSF            13336 non-null  float64
 15  ZPHSR            13336 non-null  float64
 16  ZPN12R           18382 non-null  float64
 17  ZPOIL       

In [138]:
# X_train.head(10)

In [139]:
# X_train.info()

In [140]:
# y_train.info()

In [141]:
# Join the feature and target tables and remove the unwanted columns.
# NOTE: this cell reassigns X_train / y_train, so it is not safe to re-run
# without re-running the loading cell first.
merge_keys = ["engine_id", "flight_datetime", "flight_phase"]
id_columns = ['engine_id', 'aircraft_id', 'flight_datetime']
rubbish_columns = ['manufacturer', 'ac_manufacturer', 'engine_position', 'engine_family',
                   'aircraft_family', 'aircraft_type', 'aircraft_grp', 'n1_modifier']

# Full feature + target table, built before any column is dropped.
data_train = X_train.merge(y_train, on=merge_keys)

X_train = X_train.drop(columns=rubbish_columns)
X_base_cols = X_train.columns

# Attach the remaining object-dtype feature columns to the targets.
y_train = X_train[categorial_columns(X_train)].merge(y_train, on=merge_keys)

X_train = X_train.drop(columns=id_columns)
y_train = y_train.drop(columns=id_columns)
X_cols, y_cols = X_train.columns, y_train.columns

In [142]:
# Cluster label: flight phase and engine type joined by a single space.
data_train["type"] = data_train["flight_phase"] + " "+ data_train["engine_type"]
data_train["type"].unique()  # not the cell's last expression, so the result is discarded
# Share of filled values for every NaN-containing column, per cluster.
nan_matrix = nan_percent_table(data_train, 'type')
# nan_matrix

## Чистка исходных данных в X_train+y_train

In [143]:
# Work on a copy so that data_train keeps the unfiltered records.
data_new = data_train.copy()
data_new.shape

(47794, 92)

In [144]:
# Remove the records that keep a nan_matrix cell from being exactly 0 or 1.
# * A parameter filled in more than MOSTLY_FILLED (but not all) of a cluster's
#   rows: drop the cluster's rows where it is missing.
# * A parameter filled in less than MOSTLY_EMPTY (but not none) of a cluster's
#   rows: drop the cluster's rows where it is present.
# Cells between the two thresholds are left untouched. The decisions use the
# nan_matrix computed before this cell; it is not refreshed inside the loop.
MOSTLY_FILLED = 0.50
MOSTLY_EMPTY = 0.1

for cluster in nan_matrix.columns:
    for param in nan_matrix.index:
        filled_share = nan_matrix.loc[param, cluster]
        if MOSTLY_FILLED < filled_share < 1:
            offending = data_new[param].isna()
        elif 0 < filled_share < MOSTLY_EMPTY:
            offending = data_new[param].notna()
        else:
            continue
        # Match on the 'type' column directly instead of splitting the label
        # back into phase and engine type, which fails if either has a space.
        data_new = data_new[~(offending & (data_new['type'] == cluster))]

In [145]:
# Recompute the filled-value shares on the filtered records.
nan_matrix = nan_percent_table(data_new, 'type')
# nan_matrix

In [146]:
# Fraction of the original records that the filtering step removed.
print(f'Доля удалённых записей: {(data_train.shape[0] - data_new.shape[0])/data_train.shape[0]}')

Доля удалённых записей: 0.07159894547432732


In [147]:
# for i in nan_matrix.columns[1:6]:
#     i1, i2 = i.split()
#     data_new[(data_new['flight_phase'] == i1) 
#              & (data_new['engine_type'] == i2)].to_csv(i.replace('/', '!')+'.csv', index=False)

In [148]:
# Show the target-side column labels (the first two are flight_phase and engine_type).
y_cols

Index(['flight_phase', 'engine_type', 'BRAT', 'DEGT', 'DELFN', 'DELN1',
       'DELVSV', 'DPOIL', 'EGTC', 'EGTHDM', 'EGTHDM_D', 'GEGTMC', 'GN2MC',
       'GPCN25', 'GWFM', 'PCN12', 'PCN12I', 'PCN1AR', 'PCN1BR', 'PCN1K',
       'PCN2C', 'SLOATL', 'SLOATL_D', 'VSVNOM', 'WBE', 'WBI', 'WFMP',
       'ZPCN25_D', 'ZT49_D', 'ZTLA_D', 'ZTNAC_D', 'ZWF36_D'],
      dtype='object')

In [149]:
# NOTE(review): `features`, `features_for_train` and `models` are not referenced
# by any other cell visible in this notebook; kept in case they are used elsewhere.
features = ['n1_modifier', 'number_blades', 'ZPCN12',
       'ZPCN25', 'ZT49', 'ZVB1F', 'aircraft_type', 'aircraft_grp', 'ZALT',
       'ZT1A', 'ZXM', 'IBE', 'BRAT', 'PCN12', 'PCN1K', 'ZPCN25_D', 'ZT49_D']
features_for_train = ['flight_datetime', 'flight_phase', 'engine_id', 'engine_position',
                      'n1_modifier', 'number_blades', 'ZHPTAC', 'ZLPTAC', 'ZPCN12', 'ZPCN25',
                      'ZPHSF', 'ZPHSR', 'ZPN12R', 'ZPOIL', 'ZPS3', 'ZT1AB', 'ZT3', 'ZT49',
                      'ZTAMB', 'ZTLA', 'ZTNAC', 'ZTOIL', 'ZVB1F', 'ZVB1R', 'ZVB2F', 'ZVB2R',
                      'ZVSV', 'ZWF36', 'IHPSOV', 'AGW', 'CAS', 'IAI', 'IVS12', 'SAT',
                      'ZALT', 'ZT1A', 'ZVIAS', 'ZWBP1', 'ZWBP1_8E', 'ZWBP2', 'ZWBP2_8E',
                      'ZXM', 'IBE', 'IBP', 'IAIE']

# Matrix of models and their feature lists: one [model, params] cell for every
# (target, cluster) pair that can be trained. Rows are the targets (y_cols
# without its first two labels), columns are the clusters.
models = dict()
models_matrix = pd.DataFrame(columns=nan_matrix.columns, index=y_cols[2:])
for cluster in models_matrix.columns:
    for target in models_matrix.index:
        # Trainable when the target never has NaN at all (absent from
        # nan_matrix) or is completely filled within this cluster.
        if target not in nan_matrix.index or nan_matrix.loc[target, cluster] == 1:
            # .at stores the list as a single cell value; .loc would try to
            # broadcast it as an array and emits ragged-array warnings.
            models_matrix.at[target, cluster] = [LinearRegression(), []]
models_matrix.head(3)

  arr_value = np.array(value)


Unnamed: 0,TAKEOFF CFM56-5B4,CRUISE CF34-8E5,CRUISE CFM56-5B4,CRUISE CFM56-7B27/B1,TAKEOFF CFM56-7B26,TAKEOFF CF34-8E5,CRUISE CFM56-5B3,CRUISE CFM56-7B26,TAKEOFF CFM56-7B27/B1,TAKEOFF CFM56-5B3
BRAT,"[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]"
DEGT,,"[LinearRegression(), []]","[LinearRegression(), []]","[LinearRegression(), []]",,,"[LinearRegression(), []]","[LinearRegression(), []]",,
DELFN,"[LinearRegression(), []]",,,,"[LinearRegression(), []]",,,,"[LinearRegression(), []]","[LinearRegression(), []]"


## Тренировка модели

In [150]:
# Build the per-cluster training sets (X and y respectively).
train_feature_cols = list(X_base_cols) + ['type']
target_key_cols = ['flight_datetime', 'flight_phase', 'engine_id']

X_train_dict = {
    cluster: cleaned_data(cluster, data_new[train_feature_cols])
    for cluster in nan_matrix.columns
}
# Targets of the cluster: columns containing any NaN are dropped, then the key columns.
y_train_dict = {
    cluster: (
        data_new.loc[data_new['type'] == cluster, y_base_cols]
        .dropna(axis=1)
        .drop(columns=target_key_cols)
    )
    for cluster in nan_matrix.columns
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [151]:
# Baseline: fit a LinearRegression on all non-object feature columns for every
# trainable (target, cluster) pair.
for cluster in models_matrix.columns:
    cluster_features = numerical_columns(X_train_dict[cluster])
    for target in models_matrix.index:
        # Trainable when the target never has NaN at all (absent from
        # nan_matrix) or is completely filled within this cluster.
        if target not in nan_matrix.index or nan_matrix.loc[target, cluster] == 1:
            model, params = LinearRegression(), list(cluster_features)
            model.fit(X_train_dict[cluster][params], y_train_dict[cluster][target])
            # .at stores the pair as one cell value; .loc would try to
            # broadcast the list and emits ragged-array warnings.
            models_matrix.at[target, cluster] = [model, params]

  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_valu

  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_valu

  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


##  Валидация на тестовой выборке

In [152]:
# Same cluster label as for the training data: "<flight_phase> <engine_type>".
X_test["type"] = X_test["flight_phase"] + " "+ X_test["engine_type"]
# Share of filled values per NaN-containing column and cluster in the test features.
nan_test_matrix = nan_percent_table(X_test, 'type')

In [153]:
# Join the test features with the test targets on the record keys.
test_merge_keys = ["engine_id", "flight_datetime", "flight_phase"]
data_test = X_test.merge(y_test, on=test_merge_keys)
# Attach the object-dtype feature columns of X_test to the targets.
y_test = X_test[categorial_columns(X_test)].merge(y_test, on=test_merge_keys)

In [154]:
# Per-cluster test feature frames: columns that are entirely NaN within the
# cluster are removed, the remaining NaNs are filled (see cleaned_data).
X_test_dict = {
    cluster: cleaned_data(cluster, X_test)
    for cluster in nan_test_matrix.columns
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [166]:
# Pick a model family per target, refit every trainable (target, cluster)
# model and predict the test set.

def _make_random_forest():
    """Fresh, unfitted random forest with the notebook's hyper-parameters."""
    return RandomForestRegressor(max_depth=10, random_state=0)


def _make_xgb():
    """Fresh, unfitted XGBoost regressor with the notebook's hyper-parameters."""
    # NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
    # XGBoost build - confirm before running on a CPU-only machine.
    return XGBRegressor(colsample_bytree=0.4603, gamma=0.0368,
                        learning_rate=0.03, max_depth=2,
                        min_child_weight=1.7817, n_estimators=10000,
                        reg_alpha=0.4640, reg_lambda=0.9571,
                        subsample=0.5213, random_state=7, nthread=-1, tree_method='gpu_hist')


# (factory, targets) pairs; each matching cell gets its own new model instance.
model_plan = [
    (_make_random_forest, ['BRAT', 'PCN12', 'VSVNOM', 'WBI', 'ZPCN25_D']),
    (_make_xgb, ['DEGT', 'DPOIL', 'EGTC', 'EGTHDM', 'EGTHDM_D', 'GPCN25', 'GWFM', 'PCN1K', 'PCN2C',
                 'SLOATL', 'SLOATL_D', 'WFMP', 'ZT49_D', 'ZTLA_D', 'ZTNAC_D', 'ZWF36_D']),
    (LinearRegression, ['DELFN', 'DELN1', 'DELVSV', 'GEGTMC', 'GN2MC', 'PCN12I', 'PCN1AR', 'PCN1BR', 'WBE']),
]

for make_model, targets in model_plan:
    for target in targets:
        # NOTE(review): targets absent from nan_matrix (no NaN anywhere) are
        # skipped here and keep their earlier model - confirm this is intended.
        if target not in nan_matrix.index:
            continue
        for cluster in models_matrix.columns:
            if nan_matrix.loc[target, cluster] == 1:
                params = models_matrix.at[target, cluster][1]
                models_matrix.at[target, cluster] = [make_model(), params]

# Refit every trainable cell. fit() updates the model object stored in the
# cell, so the cell does not need to be written back.
for cluster in models_matrix.columns:
    for target in models_matrix.index:
        if target not in nan_matrix.index or nan_matrix.loc[target, cluster] == 1:
            model, params = models_matrix.at[target, cluster]
            model.fit(X_train_dict[cluster][params], y_train_dict[cluster][target])

# Predict cluster by cluster and stitch the parts back together in the
# original row order. pd.concat replaces the DataFrame.append loop.
test_parts = [
    prediction(X_test_dict[cluster], models_matrix, y_base_cols, cluster)
    for cluster in data_new['type'].unique()
]
y_test_pred = pd.concat(test_parts) if test_parts else pd.DataFrame(columns=y_base_cols)
y_test_pred = y_test_pred.sort_index()

  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)
  arr_value = np.array(value)


In [167]:
# Report the test RMSE of every target (all columns after the first three).
scored_targets = y_test_pred.columns[3:]
for target in scored_targets:
    score = RMSE0(y_test_pred[target], y_test[target])
    print(target, score)

BRAT 0.08963961907314444
DEGT 2.15713021001054
DELFN 0.4540777139442404
DELN1 0.23542145075357118
DELVSV 2.1038632663608075e-12
DPOIL 0.11017457451268131
EGTC 1.2869935194923074
EGTHDM 3.3245149581350626
EGTHDM_D 5.166662085363509
GEGTMC 2.2216250774462374
GN2MC 0.08224991312059922
GPCN25 0.09225945724921401
GWFM 0.5633454806340502
PCN12 0.32780700096757776
PCN12I 1.7476096890644818e-06
PCN1AR 0.1325454093971289
PCN1BR 5.597903666861342
PCN1K 0.655022578550251
PCN2C 0.0767224335051594
SLOATL 1.02462270084443
SLOATL_D 1.522630549694912
VSVNOM 0.0
WBE 2.283629126058528
WBI 0.07523749246429388
WFMP 13.459737823678461
ZPCN25_D 1.588567720354165
ZT49_D 22.018411714150982
ZTLA_D 0.06806016760596073
ZTNAC_D 3.611591092078105
ZWF36_D 159.20617844094815


In [157]:
# for target in y_test_pred.columns[3:]:
#     print(target, RMSE0(y_test_pred[target], y_test[target]))

In [158]:
# Show the [model, params] cell for every (target, cluster) pair.
models_matrix

Unnamed: 0,TAKEOFF CFM56-5B4,CRUISE CF34-8E5,CRUISE CFM56-5B4,CRUISE CFM56-7B27/B1,TAKEOFF CFM56-7B26,TAKEOFF CF34-8E5,CRUISE CFM56-5B3,CRUISE CFM56-7B26,TAKEOFF CFM56-7B27/B1,TAKEOFF CFM56-5B3
BRAT,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z..."
DEGT,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,
DELFN,"[LinearRegression(), [number_blades, ZPCN12, Z...",,,,"[LinearRegression(), [number_blades, ZPCN12, Z...",,,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z..."
DELN1,"[LinearRegression(), [number_blades, ZPCN12, Z...",,,,"[LinearRegression(), [number_blades, ZPCN12, Z...",,,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z..."
DELVSV,,,"[LinearRegression(), [number_blades, ZHPTAC, Z...",,,,"[LinearRegression(), [number_blades, ZHPTAC, Z...",,,
DPOIL,,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,
EGTC,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,
EGTHDM,"[LinearRegression(), [number_blades, ZPCN12, Z...",,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z..."
EGTHDM_D,"[LinearRegression(), [number_blades, ZPCN12, Z...",,,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,,"[LinearRegression(), [number_blades, ZPCN12, Z...","[LinearRegression(), [number_blades, ZPCN12, Z..."
GEGTMC,,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,,"[LinearRegression(), [number_blades, ZHPTAC, Z...","[LinearRegression(), [number_blades, ZPCN12, Z...",,


In [159]:
# lin_model = LinearRegression()
# las_model = Lasso(alpha=60)
# cat_model = CatBoostRegressor(iterations=2000)
# xgb_model = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
#                          learning_rate=0.05, max_depth=3, 
#                          min_child_weight=1.7817, n_estimators=2200,
#                          reg_alpha=0.4640, reg_lambda=0.8571,
#                          subsample=0.5213, random_state=7, nthread=-1)
# lgb_model = LGBMRegressor()

# # ['TAKEOFF CFM56-5B4', 'CRUISE CF34-8E5', 'CRUISE CFM56-5B4',
# #        'CRUISE CFM56-7B27/B1', 'TAKEOFF CFM56-7B26', 'TAKEOFF CF34-8E5',
# #        'CRUISE CFM56-5B3', 'CRUISE CFM56-7B26', 'TAKEOFF CFM56-7B27/B1',
# #        'TAKEOFF CFM56-5B3']
# cluster = 'TAKEOFF CFM56-5B3'
# target = 'BRAT'

# for i in [lin_model, las_model, cat_model, xgb_model, lgb_model]:
#     model = i
#     params = models_matrix.loc[target, cluster][1]
#     model.fit(X_train_dict[cluster][params], y_train_dict[cluster][target])
#     y_test_pred1 = model.predict(X_test_dict[cluster][params])
#     print(model, RMSE0(list(y_test[y_test['type'] == cluster][target]), y_test_pred1))

## Работа с валидационными данными, тренировка и предсказание

In [160]:
# Load the validation features and prepare them like the train/test features.
X_valid = pd.read_csv('X_valid.csv')
X_valid["type"] = X_valid["flight_phase"] + " " + X_valid["engine_type"]
# The NaN table is computed before the one-hot columns are appended.
nan_valid_matrix = nan_percent_table(X_valid, 'type')
X_valid = X_valid.assign(n1_modifier=X_valid['n1_modifier'].astype('object'))
n1_dummies = pd.get_dummies(X_valid['n1_modifier'], prefix='n1')
X_valid = pd.concat([X_valid, n1_dummies], axis=1)

In [161]:
# Per-cluster validation feature frames: columns that are entirely NaN within
# the cluster are removed, the remaining NaNs are filled (see cleaned_data).
X_valid_dict = {
    cluster: cleaned_data(cluster, X_valid)
    for cluster in nan_valid_matrix.columns
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [162]:
# Predict the validation set cluster by cluster and restore the original row
# order. pd.concat replaces the DataFrame.append loop: append is deprecated
# and removed in pandas 2.0, and copies the accumulated frame every iteration.
valid_parts = [
    prediction(X_valid_dict[cluster], models_matrix, y_base_cols, cluster)
    for cluster in data_new['type'].unique()
]
y_valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame(columns=y_base_cols)
y_valid = y_valid.sort_index()

In [163]:
# Sanity check: number of predicted rows and output columns.
y_valid.shape

(28676, 33)

In [None]:
# Replace every remaining NaN (e.g. targets without a model for a cluster) with 0
# and write the predictions to y_valid.csv.
y_valid.fillna(0).to_csv('y_valid.csv')