In [1]:
# Select the required modeling tools in the system
import numpy as np
from numpy import arange
from matplotlib import pyplot
import pandas as pd
from pandas import  set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Import LR model, DT model, RF model, KNN model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# stacking
from mlxtend.regressor import StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.pipeline import Pipeline

# Import evaluation indicators
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [2]:
# import data
dataset = pd.read_csv("D:\\XXXX.csv", index_col=0)
dataset.head()

Unnamed: 0_level_0,Lactobacillus,Sphingobacterium,Staphylococcus,Pediococcus,Oceanobacillus,Cohesion
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,61.3688,2.1642,0.2034,1.4533,0.1382,0.9369
1,35.644,0.9823,0.1818,0.5699,0.102,0.7974
2,86.0404,0.3686,0.2114,0.8978,0.1013,1.2641
3,3.6153,0.7897,0.1974,0.072,0.1378,0.311
4,2.6453,0.8683,0.0613,0.0333,0.0438,0.1892


In [3]:
# Divide data into training and test sets
Y = dataset.Cohesion
X = dataset.drop('Cohesion', axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.3, random_state=7)

In [4]:
# Select evaluation criteria
scoring1 = 'neg_mean_squared_error'
scoring2 = 'r2'
scoring1 = 'mean_absolute_error'

In [5]:
# Basic model and secondary model
lr = LinearRegression()
knn = KNeighborsRegressor(n_neighbors = 3)
dt = DecisionTreeRegressor(max_depth= 19, min_samples_leaf= 11, min_samples_split= 10)
rf = RandomForestRegressor(n_estimators = 700)

In [6]:
# 
models = [dt, knn, rf]
sclf = StackingCVRegressor(regressors=models, meta_regressor=lr)

In [7]:
# 
print("stacking model")
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
sclf.fit(rescaledX, Y_train)
score = cross_val_score(sclf, rescaledX, Y_train, cv=5, scoring=scoring2)
print(score.mean(), "+/-", score.std())

stacking model
0.9547009735717518 +/- 0.013462493916962883


In [8]:
# the Stacking Model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
sclf = StackingCVRegressor(regressors=models, meta_regressor=lr)
sclf.fit(rescaledX, Y_train)

# Evaluate the training set
rescaledX_train = scaler.transform(X_train)
pre_train = sclf.predict(rescaledX_train)
print('Training_data，MSE：%s' % (mean_squared_error(Y_train, pre_train)))
print('Training_data，MAE：%s' % (mean_absolute_error(Y_train, pre_train)))
print('Training_data，R2：%s' %  (r2_score(Y_train, pre_train)))

# Evaluation the testing set
rescaledX_test = scaler.transform(X_test)
pre_test = sclf.predict(rescaledX_test)
print('Testing_data，MSE：%s' % (mean_squared_error(Y_test, pre_test)))
print('Testing_data，MAE：%s' % (mean_absolute_error(Y_test, pre_test)))
print('Testing_data，R2：%s' % (r2_score(Y_test, pre_test)))

Training_data，MSE：0.001135136342130834
Training_data，MAE：0.028674281821777458
Training_data，R2：0.9908473521227374
Testing_data，MSE：0.004729925955890493
Testing_data，MAE：0.0535494277172577
Testing_data，R2：0.9640730463334776


In [None]:
#Data saving
#training set 
trainSt = pd.DataFrame(data = Y_train)
trainSt.to_csv("D:\\XXXX.csv")
#training set 
pretrainSt = pd.DataFrame(data = pre_train)
pretrainSt.to_csv('D:\\XXX.csv')

#testing set 
testSt = pd.DataFrame(data = Y_test)
testSt.to_csv('D:\\XXX.csv')
#testing set 
pretestSt = pd.DataFrame(data = pre_test)
pretestSt.to_csv('D:\\XXX.csv')

In [9]:
# Input new data and predict cohesion
def predict_new_data(new_data):
    # Ensure that the input new data is in the form of a 2D array, i.e., [[data1, data2, data3, data4, data5]]
    rescaled_new_data = scaler.transform(new_data)
    prediction = sclf.predict(rescaled_new_data)
    return prediction

# Predicted cohesion
new_microbial_data = [[86.0404, 0.3686, 0.2114, 0.8978, 0.1013]]  # New microbial genus data (input in order)
predicted_cohesion = predict_new_data(new_microbial_data)
print('Predicted cohesion: %s' % predicted_cohesion)

Predicted cohesion: [1.25088311]


