In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score

In [4]:
# Display settings: render every column and up to 100 rows of a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv('preprocessed_data.csv', index_col=0)

# Names of the seven 'химшлак последний' columns. The notebook later splits
# them into two groups of prediction targets.
target_features = [
    'химшлак последний Al2O3',
    'химшлак последний CaO',
    'химшлак последний FeO',
    'химшлак последний MgO',
    'химшлак последний MnO',
    'химшлак последний R',
    'химшлак последний SiO2',
]

# Preview the first rows.
data.head()

Unnamed: 0,МАРКА_rare,МАРКА_Э76ХФ,МАРКА_Э90ХАФ,ПРОФИЛЬ_rare,ПРОФИЛЬ_Р65,t вып-обр,t обработка,t под током,t продувка,ПСН гр.,чист расход C,чист расход Cr,чист расход Mn,чист расход Si,чист расход V,температура первая,температура последняя,Ar (интенс.),эл. энергия (интенс.),произв жидкая сталь,расход газ Ar,сыпуч известь РП,сыпуч кокс пыль УСТК,сыпуч кокс. мелочь КМ1,сыпуч шпат плав.,ферспл CaC2,ферспл FeV-80,ферспл Mn5Si65Al0.5,ферспл SiMn18,ферспл фх850А,эл. энергия,химсталь первый Al_1,химсталь первый C_1,химсталь первый Cr_1,химсталь первый Cu_1,химсталь первый Mn_1,химсталь первый Mo_1,химсталь первый N_1,химсталь первый Ni_1,химсталь первый P_1,химсталь первый S_1,химсталь первый Si_1,химсталь первый Ti_1,химсталь первый V_1,химсталь последний Al,химсталь последний C,химсталь последний Ca,химсталь последний Cr,химсталь последний Cu,химсталь последний Mn,химсталь последний Mo,химсталь последний N,химсталь последний Ni,химсталь последний P,химсталь последний S,химсталь последний Si,химсталь последний Ti,химсталь последний V,химшлак первый Al2O3_1,химшлак первый CaO_1,химшлак первый FeO_1,химшлак первый MgO_1,химшлак первый MnO_1,химшлак первый R_1,химшлак первый SiO2_1,химшлак последний Al2O3,химшлак последний CaO,химшлак последний FeO,химшлак последний MgO,химшлак последний MnO,химшлак последний R,химшлак последний SiO2
0,0,1,0,0,1,29.0,45.366667,24.4,41.033333,3.004414,0.45646,0.059572,0.117446,0.104762,0.040938,1557.0,1580.0,13.606742,12809.016393,115.5,18.611,0.132179,0.17,0.401,0.123,0.02,0.051939,0.112,0.182,0.106,5209.0,0.002,0.389,0.368,0.127,0.682,0.01,0.011,0.086,0.009,0.023,0.459,0.002,0.002,0.003,0.756,0.0001,0.417,0.126,0.779,0.009,0.012,0.086,0.011,0.018,0.559,0.003,0.037,4.113897,63.7,1.1,3.7,0.12,2.6,22.704914,,53.4,1.0,5.8,0.15,2.6,
1,0,1,0,0,1,26.0,44.066667,13.866667,44.066667,3.004414,0.359285,0.083738,0.160923,0.110327,0.040083,1601.0,1591.0,8.074721,12816.346154,111.6,11.659,0.132179,0.488,0.420162,0.094,0.02,0.050854,0.102,0.251,0.149,2962.0,0.002,0.465,0.345,0.112,0.643,0.009,0.011,0.08,0.007,0.023,0.461,0.002,0.002,0.003,0.774,0.0003,0.416,0.109,0.767,0.008,0.013,0.079,0.008,0.021,0.543,0.003,0.038,4.113897,58.5,0.8,4.9,0.14,2.2,22.704914,,59.3,0.6,4.1,0.11,2.3,
2,0,1,0,0,1,24.0,43.35,17.95,43.35,3.004414,0.331665,0.08149,0.132332,0.13986,0.041622,1593.0,1586.0,13.801968,12511.420613,115.8,19.871,0.132179,0.05,0.346,0.061,0.02,0.052807,0.16,0.204,0.145,3743.0,0.002,0.513,0.351,0.107,0.674,0.007,0.011,0.078,0.008,0.019,0.455,0.002,0.002,0.004,0.77,0.0009,0.417,0.108,0.788,0.006,0.012,0.078,0.008,0.013,0.568,0.003,0.038,4.113897,58.0,0.8,8.6,0.12,2.6,22.704914,,57.2,0.5,8.1,0.13,2.3,
3,0,1,0,0,1,17.0,46.183333,19.816667,46.183333,3.004414,0.377945,0.133194,0.221605,0.165186,0.04205,1589.0,1589.0,12.664958,12998.149706,116.3,19.497,0.077,0.05,0.39,0.059,0.02,0.053349,0.161,0.345,0.237,4293.0,0.002,0.487,0.31,0.105,0.611,0.009,0.011,0.07,0.006,0.02,0.434,0.001,0.001,0.003,0.767,0.0002,0.419,0.103,0.795,0.008,0.013,0.07,0.007,0.014,0.571,0.003,0.038,4.113897,59.8,0.4,6.4,0.11,2.4,22.704914,,60.3,0.5,6.3,0.12,2.6,
4,0,1,0,0,1,20.0,48.5,17.033333,48.5,3.004414,0.389875,0.105094,0.169459,0.143024,0.040967,1597.0,1592.0,10.29835,12987.475538,115.0,16.649,0.082,0.05,0.412,0.035,0.02,0.051975,0.149,0.263,0.187,3687.0,0.002,0.461,0.334,0.105,0.652,0.007,0.011,0.08,0.007,0.02,0.447,0.001,0.002,0.003,0.784,0.0003,0.421,0.105,0.796,0.006,0.013,0.081,0.008,0.017,0.568,0.003,0.037,4.113897,59.6,0.7,4.7,0.12,2.4,22.704914,,60.3,0.6,4.5,0.09,2.5,


## Первый датасет для предсказания Al2O3 и SiO2

In [16]:
# Targets of the first experiment.
target_features_1 = [
    'химшлак последний Al2O3',
    'химшлак последний SiO2',
]

# Drop the other five 'химшлак последний' columns so they are not used as
# features, then keep only the rows without any missing value.
data_1 = (
    data
    .drop(columns=[
        'химшлак последний CaO',
        'химшлак последний FeO',
        'химшлак последний MgO',
        'химшлак последний MnO',
        'химшлак последний R',
    ])
    .dropna()
)
data_1.shape

(4597, 67)

In [19]:
# Separate the target columns from the feature matrix.
y = data_1[target_features_1]
X = data_1.drop(columns=target_features_1)

In [20]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [21]:
# Metric for evaluating model accuracy
def rmse(predicted, actual):
    """Return the root-mean-square error between predictions and true values.

    Both arguments are converted to float NumPy arrays before subtracting, so
    elements are compared strictly by position. This avoids pandas label
    alignment, which would yield NaN (or a wrong score) when two Series with
    different indexes are passed, and it also lets plain lists be used.

    Parameters
    ----------
    predicted : array-like
        Model predictions.
    actual : array-like
        Ground-truth values, in the same order as ``predicted``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D inputs; for 2-D inputs, one RMSE per column.
    """
    pred_values = np.asarray(predicted, dtype=float)
    true_values = np.asarray(actual, dtype=float)
    # axis=0 keeps the per-column result for 2-D inputs, as DataFrame.mean() did.
    return np.sqrt(((pred_values - true_values) ** 2).mean(axis=0))

In [23]:
# Baseline: one ordinary least-squares model per target column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_1:
    lr = LinearRegression().fit(X_train, y_train[y_col])
    y_pred = lr.predict(X_test)
    print(
        f'{y_col}\t',
        f'{rmse(y_pred, y_test[y_col])}\t',
        f'{r2_score(y_test[y_col], y_pred)}',
    )

химшлак последний Al2O3	 0.6947437108135951	 0.4891280036220358
химшлак последний SiO2	 1.985889880196065	 0.18652000268753177


In [25]:
# Gradient boosting: one model per target column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_1:
    # Fixed seed so the reported scores can be reproduced on re-run; it matches
    # the seed already used for the train/test split.
    gb = GradientBoostingRegressor(random_state=42)
    gb.fit(X_train, y_train[y_col])
    y_pred = gb.predict(X_test)
    print('{}\t'.format(y_col),
          '{}\t'.format(rmse(y_pred, y_test[y_col])),
          '{}'.format(r2_score(y_test[y_col], y_pred)))

химшлак последний Al2O3	 0.7222712261539852	 0.447841862673225
химшлак последний SiO2	 1.9846993678885971	 0.18749504936061812


In [26]:
# Random forest: one model per target column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_1:
    # Bootstrap sampling makes an unseeded forest give different scores on every
    # run; the fixed seed keeps the reported numbers reproducible.
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train[y_col])
    y_pred = rf.predict(X_test)
    print('{}\t'.format(y_col),
          '{}\t'.format(rmse(y_pred, y_test[y_col])),
          '{}'.format(r2_score(y_test[y_col], y_pred)))

химшлак последний Al2O3	 0.7254154867787826	 0.443023982716256
химшлак последний SiO2	 1.9837318108134494	 0.18828706179361776


## Второй датасет для предсказания CaO, FeO, MgO, MnO, R

In [27]:
# Targets of the second experiment.
target_features_2 = [
    'химшлак последний CaO',
    'химшлак последний FeO',
    'химшлак последний MgO',
    'химшлак последний MnO',
    'химшлак последний R',
]

# Remove the two columns that are not targets here, then discard every row
# that still contains a missing value.
columns_to_exclude = ['химшлак последний Al2O3', 'химшлак последний SiO2']
data_2 = data.drop(columns=columns_to_exclude).dropna()
data_2.shape

(4597, 70)

In [28]:
# Separate the target columns from the feature matrix.
y = data_2[target_features_2]
X = data_2.drop(columns=target_features_2)

In [29]:
# Same split settings as before: 33% test share and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [30]:
# Metric for evaluating model accuracy
# (re-declares the `rmse` helper that also appears earlier in the notebook).
def rmse(predicted, actual):
    """Return the root-mean-square error between predictions and true values.

    Inputs are coerced to float NumPy arrays, so elements are matched by
    position rather than by pandas index label. Without this, two Series with
    different indexes would be label-aligned and produce NaN or a misleading
    score. Plain Python sequences are accepted as well.

    Parameters
    ----------
    predicted : array-like
        Model predictions.
    actual : array-like
        Ground-truth values, in the same order as ``predicted``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D inputs; for 2-D inputs, one RMSE per column.
    """
    pred_values = np.asarray(predicted, dtype=float)
    true_values = np.asarray(actual, dtype=float)
    squared_error = (pred_values - true_values) ** 2
    # axis=0 keeps a per-column result when 2-D data is passed.
    return np.sqrt(squared_error.mean(axis=0))

In [31]:
# Baseline for the second target group: one least-squares model per column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_2:
    lr = LinearRegression().fit(X_train, y_train[y_col])
    y_pred = lr.predict(X_test)
    print(
        f'{y_col}\t',
        f'{rmse(y_pred, y_test[y_col])}\t',
        f'{r2_score(y_test[y_col], y_pred)}',
    )

химшлак последний CaO	 3.3865630243143823	 0.22802650769356814
химшлак последний FeO	 0.2132380496355053	 0.023558956395394404
химшлак последний MgO	 2.446245973551062	 0.38973152697152824
химшлак последний MnO	 0.053543444845716416	 0.1369698072145371
химшлак последний R	 0.17994780003630134	 0.3677410835985029


In [32]:
# Gradient boosting for the second target group: one model per column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_2:
    # Fixed seed so the reported scores can be reproduced on re-run; it matches
    # the seed already used for the train/test split.
    gb = GradientBoostingRegressor(random_state=42)
    gb.fit(X_train, y_train[y_col])
    y_pred = gb.predict(X_test)
    print('{}\t'.format(y_col),
          '{}\t'.format(rmse(y_pred, y_test[y_col])),
          '{}'.format(r2_score(y_test[y_col], y_pred)))

химшлак последний CaO	 3.431493392051964	 0.207406699219725
химшлак последний FeO	 0.2119291653247295	 0.03550922462129025
химшлак последний MgO	 2.4683069536047584	 0.3786747256626023
химшлак последний MnO	 0.05434422009425911	 0.11096247496939926
химшлак последний R	 0.17437056747881366	 0.4063257100311912


In [33]:
# Random forest for the second target group: one model per column.
# Each printed line is: target name, test RMSE, test R^2.
for y_col in target_features_2:
    # Bootstrap sampling makes an unseeded forest give different scores on every
    # run; the fixed seed keeps the reported numbers reproducible.
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train[y_col])
    y_pred = rf.predict(X_test)
    print('{}\t'.format(y_col),
          '{}\t'.format(rmse(y_pred, y_test[y_col])),
          '{}'.format(r2_score(y_test[y_col], y_pred)))

химшлак последний CaO	 3.3714787318418873	 0.2348881785008018
химшлак последний FeO	 0.2104346496313783	 0.04906435835518974
химшлак последний MgO	 2.441414691859713	 0.39213968006883615
химшлак последний MnO	 0.05429400839711587	 0.11260458005915086
химшлак последний R	 0.1757407717831299	 0.3969588639499607


In [24]:
# NOTE(review): disabled exploratory cell. It would project X onto 10 PCA
# components and scatter each component against one target column.
# `y_cols` is not defined anywhere in the visible notebook, so un-commenting
# this as-is would raise NameError; confirm the intended target list first.
# pca = PCA(n_components=10)
# pca_x = pca.fit_transform(X)
# fig, axs = plt.subplots(2, 5, figsize=(20,10))
# for i in range(10):
#     axs[i%2, i//2].scatter(pca_x[:,i], y[y_cols[0]])
#     axs[i%2, i//2].set_title(f'{i}')