In [363]:
import numpy as np
from sklearn import tree,metrics,svm,ensemble,neighbors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

### 去掉缺失值较多的列

In [210]:
nume = ['转炉终点温度', '转炉终点C', '转炉终点S',
       '转炉终点Si', '钢水净重', '连铸正样C', '连铸正样Mn', '连铸正样S', '连铸正样P', '连铸正样Si',
       '连铸正样Ceq_val', '连铸正样Cr', '连铸正样Ni_val', '连铸正样Cu_val',
       '连铸正样V_val', '连铸正样Alt_val', '连铸正样Als_val', '连铸正样Mo_val', '连铸正样Ti_val', '氮化钒铁FeV55N11-A', '低铝硅铁',
       '钒氮合金(进口)', '钒铁(FeV50-A)', '钒铁(FeV50-B)', '钒铁(FeV50-B).1', '硅铝钙',
       '硅铝合金FeAl30Si25', '硅铝锰合金球', '硅锰面（硅锰渣）', '硅铁(合格块)', '硅铁FeSi75-B',
       '石油焦增碳剂', '锰硅合金FeMn64Si27(合格块)', '锰硅合金FeMn68Si18(合格块)', '碳化硅(55%)',
       '硅钙碳脱氧剂']

### 缺失值处理、归一化

In [265]:
CTrain = pd.read_excel('q1_1_收得率.xls')[nume+['C收得率']]
# CTrain = CTrain.dropna()
CTrain = CTrain.drop(CTrain[CTrain['C收得率'].isnull()].index)
CTrain = CTrain.fillna(CTrain.mean())
X = CTrain[nume] #[['钢水净重','连铸正样C', '连铸正样Ceq_val', '低铝硅铁','石油焦增碳剂']]
Y = CTrain[['C收得率']]
X = X[nume].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)+1e-10))
len(CTrain.values)

809

In [323]:
# #缺失值数量统计
# CTrain.isnull().sum()

In [267]:
CTrain.head(1)

Unnamed: 0,转炉终点温度,转炉终点C,转炉终点S,转炉终点Si,钢水净重,连铸正样C,连铸正样Mn,连铸正样S,连铸正样P,连铸正样Si,...,硅铝锰合金球,硅锰面（硅锰渣）,硅铁(合格块),硅铁FeSi75-B,石油焦增碳剂,锰硅合金FeMn64Si27(合格块),锰硅合金FeMn68Si18(合格块),碳化硅(55%),硅钙碳脱氧剂,C收得率
0,1644.0,0.00065,0.0003,0.004,74400.0,0.0023,0.0133,0.0003,0.00019,0.0033,...,0,0,0,0,85,1547,0,88,0,0.91408


In [335]:
train_data, test_data, train_target, test_target = train_test_split(X.values,Y.values,test_size=0.2)

### 决策树回归

In [339]:
clf = tree.DecisionTreeRegressor() 
clf = clf.fit(train_data,train_target)
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(-0.1515869933289944, 0.06560714310570313, 0.010217992437542758)

### 线性回归

In [340]:
clf = linear_model.LinearRegression()
clf = clf.fit(train_data,train_target)
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(0.9130465056423822, 0.027141434214317157, 0.001267350494100081)

### SVM

In [341]:
clf = svm.SVR(gamma='auto')
clf = clf.fit(train_data,train_target.flatten())
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(-2.980689372403678, 0.05004266858204905, 0.006912589625992675)

### 贝叶斯

In [342]:
clf = linear_model.BayesianRidge()
clf = clf.fit(train_data,train_target.flatten())
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(0.9111409159589139, 0.02711932958385213, 0.001272936453247817)

### 集成

In [345]:
clf = ensemble.GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls')
clf = clf.fit(train_data,train_target.flatten().astype(np.float32))
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(-4.684867199839898, 0.053272618055577356, 0.007577338095989948)

### 多项式回归（效果最好）

In [346]:
model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])
clf = model.fit(train_data,train_target.flatten().astype(np.float32))
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

(0.8664093632372909, 0.016723627794210886, 0.001873739885633892)

# Mn元素预测

### 数据预处理

In [348]:
nume = ['转炉终点温度', '转炉终点C', '转炉终点Mn', '转炉终点S',
       '转炉终点Si', '钢水净重', '连铸正样C', '连铸正样Mn', '连铸正样S', '连铸正样P', '连铸正样Si',
       '连铸正样Ceq_val', '连铸正样Cr', '连铸正样Ni_val', '连铸正样Cu_val',
       '连铸正样V_val', '连铸正样Alt_val', '连铸正样Als_val', '连铸正样Mo_val', '连铸正样Ti_val', '氮化钒铁FeV55N11-A', '低铝硅铁',
       '钒氮合金(进口)', '钒铁(FeV50-A)', '钒铁(FeV50-B)', '钒铁(FeV50-B).1', '硅铝钙',
       '硅铝合金FeAl30Si25', '硅铝锰合金球', '硅锰面（硅锰渣）', '硅铁(合格块)', '硅铁FeSi75-B',
       '石油焦增碳剂', '锰硅合金FeMn64Si27(合格块)', '锰硅合金FeMn68Si18(合格块)', '碳化硅(55%)',
       '硅钙碳脱氧剂']

In [364]:

MnTrain = pd.read_excel('data1.xlsx')[nume]
# Mn初始含量较低，对反应收得率影响小，使用均值替换缺失值
MnTrain[['转炉终点Mn']] = MnTrain[['转炉终点Mn']].fillna(MnTrain[['转炉终点Mn']].mean())
# 去掉为对合金化后合金钢采样的数据
MnTrain = MnTrain.drop(MnTrain[MnTrain['连铸正样Mn'].isnull()].index)
# 使用均值替代少量的缺失值
MnTrain[nume] = MnTrain[nume].fillna(MnTrain[nume].mean())
# 重新计算Mn收得率
mn_t=['硅铝锰合金球','硅锰面（硅锰渣）','锰硅合金FeMn64Si27(合格块)','锰硅合金FeMn68Si18(合格块)']
mn_p = [0.3,0.664,0.664,0.664]
mn_total=(MnTrain[mn_t]*mn_p).sum(axis=1)
mn_comsu=(MnTrain['连铸正样Mn']-MnTrain['转炉终点Mn'])*MnTrain['钢水净重']
MnTrain['Mn收得率']=mn_comsu/mn_total
# 构造训练数据并对自变量值进行归一化
X = MnTrain[nume].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)+1e-10))
Y = MnTrain[['Mn收得率']]
train_data, test_data, train_target, test_target = train_test_split(X.values,Y.values,test_size=0.2)
len(MnTrain.values)

810

In [361]:
MnTrain.isnull().sum()

转炉终点温度                 0
转炉终点C                  0
转炉终点Mn                 0
转炉终点S                  0
转炉终点Si                 0
钢水净重                   0
连铸正样C                  0
连铸正样Mn                 0
连铸正样S                  0
连铸正样P                  0
连铸正样Si                 0
连铸正样Ceq_val            0
连铸正样Cr                 0
连铸正样Ni_val             0
连铸正样Cu_val             0
连铸正样V_val              0
连铸正样Alt_val            0
连铸正样Als_val            0
连铸正样Mo_val             0
连铸正样Ti_val             0
氮化钒铁FeV55N11-A         0
低铝硅铁                   0
钒氮合金(进口)               0
钒铁(FeV50-A)            0
钒铁(FeV50-B)            0
钒铁(FeV50-B).1          0
硅铝钙                    0
硅铝合金FeAl30Si25         0
硅铝锰合金球                 0
硅锰面（硅锰渣）               0
硅铁(合格块)                0
硅铁FeSi75-B             0
石油焦增碳剂                 0
锰硅合金FeMn64Si27(合格块)    0
锰硅合金FeMn68Si18(合格块)    0
碳化硅(55%)               0
硅钙碳脱氧剂                 0
Mn收得率                  0
dtype: int64

In [365]:
model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])
clf = model.fit(train_data,train_target.flatten().astype(np.float32))
pre_y = clf.predict(test_data)
metrics.r2_score(pre_y,test_target),metrics.mean_absolute_error(pre_y,test_target),metrics.mean_squared_error(pre_y,test_target)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').