In [1]:
import pickle
import numpy as np
import pandas as pd
import os, glob
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## The following are the ML models which can be used for training
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler,StandardScaler

import timeit
import warnings
warnings.filterwarnings("ignore")


  from pandas import MultiIndex, Int64Index


In [2]:
# Base folder for every input file read below: the current working directory
# of the kernel at the time this cell runs.
dataFolder = os.getcwd()

In [5]:
## Load the test dataset and split it into three groups by the
## b(CaO) / b(SiO2) ratio.
testFile = os.path.join(dataFolder, '10_PC_02_LHS_500_54854_01_s1_G.csv')
data = pd.read_csv(testFile)

# Column headers may carry surrounding whitespace; normalise them first.
data.columns = list(map(str.strip, data.columns))

# The ratio is stored as an extra (last) column of `data`.
data['ratio'] = data['b(CaO)'] / data['b(SiO2)']

# Explicit comparisons on purpose: rows whose ratio is NaN end up in no group.
isHighRatio = data['ratio'] >= 1.60
isMidRatio = (data['ratio'] < 1.60) & (data['ratio'] > 0.67)
isLowRatio = data['ratio'] <= 0.67

group1 = data[isHighRatio]
group2 = data[isMidRatio]
group3 = data[isLowRatio]
groups = [group1, group2, group3]


In [11]:
## Load the summary of the trained models and keep only the columns at
## positions 1..3 of the CSV (shown as group / var / modelType in the output).
modelSummaryFile = os.path.join(dataFolder, 'ModelTrainSummary.csv')
modelInfo = pd.read_csv(modelSummaryFile).iloc[:, 1:4]
modelInfo

Unnamed: 0,group,var,modelType
0,group1,Vol(aq),rfModel
1,group1,Vol(aq),xgbModel
2,group1,Vol(aq),lgbModel
3,group1,Vol(aq),gbModel
4,group1,Vol(aq),MLPModel
...,...,...,...
127,group3,C/S(CSHQ),lgbModel
128,group3,C/S(CSHQ),gbModel
129,group3,C/S(CSHQ),MLPModel
130,group3,C/S(CSHQ),GPyModel


In [17]:
# Distinct model types present in the training summary.
pd.unique(modelInfo['modelType'])

array(['rfModel', 'xgbModel', 'lgbModel', 'gbModel', 'MLPModel',
       'GPyModel', 'CONST', 'linear'], dtype=object)

In [26]:
## Load the models
## For every (group, var, modelType) row of `modelInfo`, read the saved model
## from <dataFolder>/SavedModel/<group>/<modelType>_<icol>_<var><ext> and
## store it in a new 'model' column of `modelInfo`.
##   - 'CONST' models are stored as a .csv file and loaded as a DataFrame
##   - every other model type is a pickled estimator (.sav)
dataCols = data.columns.to_list()
modelsLst = []
for irow, row in modelInfo.iterrows():
    group = row['group']
    var = row['var']
    modelType = row['modelType']

    # Position of the target variable in the test data; part of the file name.
    icol = dataCols.index(var)

    # The saved file names carry the variable name without '/'
    # (presumably because '/' is a path separator). str.replace is a no-op
    # when the character is absent, so no guard is needed.
    fileCol = var.replace("/", '')

    modelFolder = os.path.join(dataFolder, 'SavedModel', group)
    ext = '.csv' if modelType == 'CONST' else '.sav'
    fileName = os.path.join(modelFolder, modelType + '_' + str(icol) + '_' + fileCol + ext)

    if modelType == 'CONST':
        tempModel = pd.read_csv(fileName)
    else:
        # NOTE: unpickling executes arbitrary code -- only load model files
        # that come from a trusted source.
        # The context manager closes the file handle even if loading fails.
        with open(fileName, 'rb') as modelFile:
            tempModel = pickle.load(modelFile)

    modelsLst.append(tempModel)

modelInfo['model'] = modelsLst


In [27]:
# Display the model summary, which now also carries the loaded 'model' objects.
modelInfo

Unnamed: 0,group,var,modelType,model
0,group1,Vol(aq),rfModel,"(DecisionTreeRegressor(max_depth=15, max_featu..."
1,group1,Vol(aq),xgbModel,"XGBRegressor(base_score=0.5, booster='gbtree',..."
2,group1,Vol(aq),lgbModel,"LGBMRegressor(bagging_fraction=0.8, bagging_fr..."
3,group1,Vol(aq),gbModel,([DecisionTreeRegressor(criterion='friedman_ms...
4,group1,Vol(aq),MLPModel,"MLPRegressor(activation='tanh', alpha=0.053108..."
...,...,...,...,...
127,group3,C/S(CSHQ),lgbModel,"LGBMRegressor(bagging_fraction=0.8, bagging_fr..."
128,group3,C/S(CSHQ),gbModel,([DecisionTreeRegressor(criterion='friedman_ms...
129,group3,C/S(CSHQ),MLPModel,"MLPRegressor(activation='tanh', alpha=0.026604..."
130,group3,C/S(CSHQ),GPyModel,GaussianProcessRegressor(kernel=Matern(length_...


In [43]:
# NOTE(review): leftover inspection cell. Within this notebook `y_pred` is only
# assigned inside the prediction loop in a later cell, so this cell depends on
# out-of-order execution and will not run on a fresh kernel.
y_pred

array([3903.928495])

In [45]:
# NOTE(review): leftover inspection cell. `model` and `dataY` are only assigned
# inside the prediction loop in a later cell, so this cell depends on
# out-of-order execution. It repeats the 'const' column values of a CONST
# model once per test sample.
list(model['const'].values)*len(dataY)

[12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.472615,
 12.

In [51]:
## Predict every target variable of the test dataset with each saved model.
## The result `finalDF` is in long format with the columns
##   testDataY, predDataY, modelType, var, group
## and holds one block of rows per (var, group, model) combination.

# The model inputs depend only on the group, not on the target variable, so
# they are prepared once per group instead of once per model.
groupInputs = []
for i, group in enumerate(groups):
    if len(group) == 0:
        continue  # no test samples in this group -> nothing to predict
    dataX = group.iloc[:, 1:4]
    # NOTE(review): the scaler is fitted on the test data itself. If the models
    # were trained on inputs scaled with a scaler fitted on the training data,
    # that scaler should be saved with the models and reused here -- confirm.
    scaler = StandardScaler().fit(dataX.values)
    X_scaled = scaler.transform(dataX.values)
    groupInputs.append(('group' + str(i + 1), group, dataX, X_scaled))

predFrames = []  # collected per-model results, concatenated once at the end
for col in group1.columns[4:]:

    colModelInfo = modelInfo[modelInfo['var'] == col]

    for groupName, group, dataX, X_scaled in groupInputs:

        dataY = group[col].values
        groupModelInfo = colModelInfo[colModelInfo['group'] == groupName]

        # When no model is registered for this (var, group) the loop body does
        # not run, so nothing stale is appended for that combination.
        for irow, row in groupModelInfo.iterrows():
            model = row['model']
            modelType = row['modelType']

            if modelType == 'CONST':
                # A CONST model is a DataFrame; its first 'const' value is the
                # prediction for every sample.
                y_pred = np.full(len(dataY), model['const'].values[0])
            elif modelType == 'linear':
                # The linear model is fed the raw (unscaled) inputs.
                y_pred = model.predict(dataX.values)
            else:
                # All remaining model types are fed the standardised inputs.
                y_pred = model.predict(X_scaled)

            tempDF = pd.DataFrame({'testDataY': dataY, 'predDataY': y_pred})
            tempDF['modelType'] = modelType
            tempDF['var'] = col
            tempDF['group'] = groupName
            # Append inside the model loop so that every model's predictions
            # are kept, not only those of the last model of the group.
            predFrames.append(tempDF)

# pd.concat raises on an empty list, so fall back to an empty frame.
finalDF = pd.concat(predFrames, axis=0) if predFrames else pd.DataFrame()


In [53]:
# Write the long-format predictions next to the notebook, without the index.
predictionFile = 'predict_testDataset.csv'
finalDF.to_csv(predictionFile, index=False)

# Visualization of the prediction of the test dataset