
#### importing the libraries

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd 
import numpy as np 
import pickle



In [42]:
MCSEDataFrame = pd.read_csv("Material Compressive Strength Experimental Data.csv")

In [43]:
MCSEDataFrame.head(2)

Unnamed: 0,Material Quantity (gm),Additive Catalyst (gm),Ash Component (gm),Water Mix (ml),Plasticizer (gm),Moderate Aggregator,Refined Aggregator,Formulation Duration (hrs),Compression Strength MPa
0,486.42,180.6,21.26,201.66,16.11,1151.17,708.5,344.43,79.89
1,133.32,260.14,185.6,175.99,6.27,1090.57,1010.25,28.86,59.8


In [44]:
MCSEDataFrame.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6139 entries, 0 to 6138
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Material Quantity (gm)      6030 non-null   float64
 1   Additive Catalyst (gm)      6030 non-null   float64
 2   Ash Component (gm)          6030 non-null   float64
 3   Water Mix (ml)              6030 non-null   float64
 4   Plasticizer (gm)            6030 non-null   float64
 5   Moderate Aggregator         6030 non-null   float64
 6   Refined Aggregator          6030 non-null   float64
 7   Formulation Duration (hrs)  6030 non-null   float64
 8   Compression Strength MPa    6139 non-null   float64
dtypes: float64(9)
memory usage: 431.8 KB


##### Raname column name for more readability 

In [45]:
# Rename method used to rename columns. We are renaming columns name for more readability purpose.

MCSEDataFrame.rename(columns={'Material Quantity (gm)': 'Material_Quality',
                              'Additive Catalyst (gm)': 'Additive_catalyst',
                     'Ash Component (gm)': "Ash_Component",'Water Mix (ml)': 'Water_Mix', 
                     'Plasticizer (gm)':'Plasticizer',
                     'Moderate Aggregator': 'Moderate_Aggregator','Refined Aggregator': 'Refined_Aggregator',
                     'Formulation Duration (hrs)': 'Formulation_Duration',
                     'Compression Strength MPa': 'Compression_Strength'}, 
            inplace=True)

##### use of applymap function to clean the entire dataset

In [6]:
# Convert inconsistent data types to consistent types
def convertDataType(x):
    try:
        if isinstance(x, str) and x.isdigit():
            return int(x)  
        elif isinstance(x, str):
            return x.lower() 
        else:
            return x 
    except:
        return np.NAN 

MCSEDataFrame = MCSEDataFrame.applymap(convertDataType)

In [46]:
# Remove unwanted characters
def removeNonAlphacharacters(x):
    if isinstance(x, str):
        return float(re.sub(r'[^\w\s]', '', x))  # Remove non-alphanumeric characters and convert it into float
    else:
        return x  # Leave other data types unchanged

MCSEDataFrame = MCSEDataFrame.applymap(removeNonAlphacharacters)

print(MCSEDataFrame)

      Material_Quality  Additive_catalyst  Ash_Component  Water_Mix  \
0               486.42             180.60          21.26     201.66   
1               133.32             260.14         185.60     175.99   
2               559.97               2.84         111.76     295.23   
3               391.43             351.05          76.39     299.14   
4               394.78             352.61         194.35     235.54   
...                ...                ...            ...        ...   
6134            188.78             162.30         142.65     163.66   
6135            349.87             291.45          77.82     188.26   
6136            358.29              22.70          17.99     208.58   
6137            445.25             275.59         178.86     191.77   
6138            560.23             266.56         167.14     175.49   

      Plasticizer  Moderate_Aggregator  Refined_Aggregator  \
0           16.11              1151.17              708.50   
1            6.27      

####  Filling the null values with mean of that specific column.

In [47]:
# Iterate through each column and replace null values with mean of column
i = 0
while i < len(MCSEDataFrame.columns):
    column = MCSEDataFrame.columns[i]
    meanValue = MCSEDataFrame[column].mean()
    MCSEDataFrame[column].fillna(meanValue, inplace=True)
    i += 1

In [48]:
MCSEDataFrame.head(2)

Unnamed: 0,Material_Quality,Additive_catalyst,Ash_Component,Water_Mix,Plasticizer,Moderate_Aggregator,Refined_Aggregator,Formulation_Duration,Compression_Strength
0,486.42,180.6,21.26,201.66,16.11,1151.17,708.5,344.43,79.89
1,133.32,260.14,185.6,175.99,6.27,1090.57,1010.25,28.86,59.8


##  Scaling the Dataset.

##### StandardScaler is used to standardize numerical features by removing the mean and scaling to unit variance.

In [49]:
# Creating StandardScaler object
StdObject = StandardScaler()

# Scaling all features
#MCSEDataFrame[MCSEDataFrame.columns] = StdObject.fit_transform(MCSEDataFrame[MCSEDataFrame.columns])

features_to_scale = MCSEDataFrame.columns[:-1]
# Fit and transform the scaler on the selected features
MCSEDataFrame[features_to_scale] = StdObject.fit_transform(MCSEDataFrame[features_to_scale])



In [50]:
MCSEDataFrame.head(2)

Unnamed: 0,Material_Quality,Additive_catalyst,Ash_Component,Water_Mix,Plasticizer,Moderate_Aggregator,Refined_Aggregator,Formulation_Duration,Compression_Strength
0,0.691433,-0.121849,-1.231379,-0.549816,-0.13305,1.574556,-0.924276,1.526178,79.89
1,-1.684034,0.480137,1.002321,-1.1733,-0.982587,0.948866,1.774783,-1.306499,59.8


### Select the important feature for modelling to improve model performance

In [51]:

X = MCSEDataFrame.drop('Compression_Strength', axis=1)  # Exclude the target variable
y = MCSEDataFrame['Compression_Strength']  # Target variable


### MIR (Mutual Information Regression) 
It Rank the Independent features based on their mutual information with the target variable. Features with higher mutual information are considered more informative.

In [52]:
# Compute mutual information between independent variable and the target variable
miScoresValue = mutual_info_regression(X, y)

MICDataFrame = pd.DataFrame({'Feature': X.columns, 'MI_Score': miScoresValue})

selColsMIC = MICDataFrame['Feature'].tolist()  # Choose the desired number of features
selColsMIC

['Material_Quality',
 'Additive_catalyst',
 'Ash_Component',
 'Water_Mix',
 'Plasticizer',
 'Moderate_Aggregator',
 'Refined_Aggregator',
 'Formulation_Duration']

In [53]:
dfSelColsMIC= MCSEDataFrame[MICDataFrame["Feature"].tolist()]
dfSelColsMIC 

Unnamed: 0,Material_Quality,Additive_catalyst,Ash_Component,Water_Mix,Plasticizer,Moderate_Aggregator,Refined_Aggregator,Formulation_Duration
0,0.691433,-0.121849,-1.231379,-0.549816,-0.133050,1.574556,-0.924276,1.526178
1,-1.684034,0.480137,1.002321,-1.173300,-0.982587,0.948866,1.774783,-1.306499
2,1.186238,-1.467198,-0.001308,1.722853,-0.492204,0.271139,-0.010218,0.567949
3,0.052391,1.168175,-0.482055,1.817821,0.116459,1.406363,0.621723,0.308801
4,0.074928,1.179982,1.121251,0.273076,-0.054485,1.028058,-0.275697,0.829700
...,...,...,...,...,...,...,...,...
6134,-1.310929,-0.260350,0.418547,-1.472777,-0.144273,0.053180,1.705193,1.647179
6135,-0.227202,0.717102,-0.462619,-0.875281,0.705264,-0.759597,1.730596,-0.630218
6136,-0.170557,-1.316891,-1.275825,-0.381740,1.490050,0.850780,-0.173459,1.152132
6137,0.414464,0.597068,0.910711,-0.790029,0.036167,-1.378575,0.190232,1.797264


### Phase5 : Feature Engineering &Predictive Modelling

In [54]:
def createTrainTestSplit(x,y): # split dataframe into train and test dataset
    return train_test_split(x,y,test_size= 0.2,random_state=42)

In [55]:
def createModel(modelName,estimator,x,y): # Train the model provided in the argument
    x_train,x_test,y_train,y_test = createTrainTestSplit(x,y) 
    estimator.fit(x_train,y_train)
    y_pred = estimator.predict(x_test) 
    
        
    r2ScoreVal = r2_score(y_test,y_pred)
    MSEVal = np.sqrt(mean_squared_error(y_test,y_pred))
    returnList = [modelName,r2ScoreVal,MSEVal] 
    
    
    return returnList 

### Train Regression models with Features selected with MIC

In [56]:
dfSelColsMIC.columns

Index(['Material_Quality', 'Additive_catalyst', 'Ash_Component', 'Water_Mix',
       'Plasticizer', 'Moderate_Aggregator', 'Refined_Aggregator',
       'Formulation_Duration'],
      dtype='object')

In [57]:
# Separate features and target variable
XMIC = dfSelColsMIC[['Material_Quality', 'Additive_catalyst', 'Ash_Component', 'Water_Mix',
       'Plasticizer', 'Moderate_Aggregator', 'Refined_Aggregator',
       'Formulation_Duration']]
YMIC = MCSEDataFrame['Compression_Strength']

In [66]:
X_train, X_test, y_train, y_test =  createTrainTestSplit(XMIC,YMIC) 

# Define hyperparameter values
hyperparameters = {
    'max_depth': 10,
    'n_estimators': 200
}


rf_model = RandomForestRegressor(**hyperparameters)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

r2_model = r2_score(y_test, y_pred)
mse_model = mean_squared_error(y_test, y_pred)

print(f'R2 Score (Model): {r2_model:.4f}')
print(f'Mean Squared Error (Model): {mse_model:.4f}')

R2 Score (Model): 0.4457
Mean Squared Error (Model): 140.5870


In [59]:
X_test.columns

Index(['Material_Quality', 'Additive_catalyst', 'Ash_Component', 'Water_Mix',
       'Plasticizer', 'Moderate_Aggregator', 'Refined_Aggregator',
       'Formulation_Duration'],
      dtype='object')

In [60]:
# Assuming rf_model is your RandomForestRegressor model
input_features = [0.691433, -0.121849, -1.231379, -0.549816, -0.133050, 1.574556, -0.924276, 1.526178]
prediction = rf_model.predict([input_features])



In [61]:
prediction

array([61.18015607])

In [62]:
def scaledUserInputData(input_data):
    # input_data is a list or array-like containing your feature values
    scaled_data = StdObject.transform([input_data])
    return scaled_data

In [63]:
with open('model.pkl','wb') as files:
    pickle.dump(rf_model,files)
    

In [64]:
# Save the scaler
with open('scaler.pkl','wb') as files:
    pickle.dump(StdObject,files)