In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw material dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Material.csv")

In [3]:
# Number of rows loaded from the CSV, before any de-duplication or imputation.
data.shape[0]

6139

In [4]:
# Canonical column names, listed in the positional order the CSV columns are expected
# to have. The last entry is the column later used as the regression target.
req = ["MaterialQuantity", "AdditiveCatalyst", "AshComponent", "Water", "Plasticizer",
       "ModerateAggregator", "RefinedAggregator", "FormulationDuration", "CompressionStrength"]
curr = list(data.columns)

# The rename is purely positional. Without this check, a CSV with extra columns fails
# with an opaque IndexError and a CSV with fewer columns is renamed only partially,
# which would silently shift which column ends up last.
if len(curr) != len(req):
    raise ValueError(
        "Expected {} columns but the loaded frame has {}: {}".format(len(req), len(curr), curr)
    )

# Map each existing column name to the canonical name at the same position.
mapper = dict(zip(curr, req))

data = data.rename(columns=mapper)

In [5]:
# Preview the first five rows to check that the renamed columns line up with their values.
data.head(n=5)

Unnamed: 0,MaterialQuantity,AdditiveCatalyst,AshComponent,Water,Plasticizer,ModerateAggregator,RefinedAggregator,FormulationDuration,CompressionStrength
0,486.42,180.6,21.26,201.66,16.11,1151.17,708.5,344.43,79.89
1,133.32,260.14,185.6,175.99,6.27,1090.57,1010.25,28.86,59.8
2,559.97,2.84,111.76,295.23,11.95,1024.93,810.69,237.68,77.86
3,391.43,351.05,76.39,299.14,19.0,1134.88,881.34,208.81,71.74
4,394.78,352.61,194.35,235.54,17.02,1098.24,781.01,266.84,76.07


In [6]:
# Emit the dtype of every column as plain text.
print(str(data.dtypes))

MaterialQuantity       float64
AdditiveCatalyst       float64
AshComponent           float64
Water                  float64
Plasticizer            float64
ModerateAggregator     float64
RefinedAggregator      float64
FormulationDuration    float64
CompressionStrength    float64
dtype: object


In [7]:
# Rows that repeat an earlier row in every column; the first occurrence is not flagged.
duplicate_rows = data.loc[data.duplicated(keep="first")]

In [8]:
# Display the repeated rows so they can be inspected before being dropped.
duplicate_rows

Unnamed: 0,MaterialQuantity,AdditiveCatalyst,AshComponent,Water,Plasticizer,ModerateAggregator,RefinedAggregator,FormulationDuration,CompressionStrength
1041,442.37,230.58,0.0,201.18,14.15,968.92,775.18,22.25,62.61
1498,442.37,230.58,0.0,201.18,14.15,968.92,775.18,71.25,86.58
2062,442.37,230.58,0.0,201.18,14.15,968.92,775.18,18.25,39.54
2088,442.37,230.58,0.0,201.18,14.15,968.92,775.18,43.25,79.86
2115,518.5,129.69,0.0,187.27,20.13,873.95,909.85,22.25,55.1
2183,442.37,230.58,0.0,201.18,14.15,968.92,775.18,71.25,86.58
2376,442.37,230.58,0.0,201.18,14.15,968.92,775.18,18.25,39.54
2399,442.37,230.58,0.0,201.18,14.15,968.92,775.18,18.25,39.54
2449,518.5,129.69,0.0,187.27,20.13,873.95,909.85,22.25,55.1
2632,442.37,230.58,0.0,201.18,14.15,968.92,775.18,22.25,62.61


In [9]:
# Per-column tally of non-null cells among the duplicated rows. Columns can report
# different totals because missing values are not counted.
duplicate_rows.notna().sum()

MaterialQuantity       25
AdditiveCatalyst       25
AshComponent           25
Water                  25
Plasticizer            25
ModerateAggregator     25
RefinedAggregator      25
FormulationDuration    25
CompressionStrength    28
dtype: int64

In [10]:
# Keep only the first occurrence of each fully identical row. The original index
# labels are retained, so the index is no longer contiguous afterwards.
data = data.drop_duplicates(keep="first")

In [11]:
# Display the de-duplicated frame (pandas truncates the middle rows of long frames).
data

Unnamed: 0,MaterialQuantity,AdditiveCatalyst,AshComponent,Water,Plasticizer,ModerateAggregator,RefinedAggregator,FormulationDuration,CompressionStrength
0,486.42,180.60,21.26,201.66,16.11,1151.17,708.50,344.43,79.89
1,133.32,260.14,185.60,175.99,6.27,1090.57,1010.25,28.86,59.80
2,559.97,2.84,111.76,295.23,11.95,1024.93,810.69,237.68,77.86
3,391.43,351.05,76.39,299.14,19.00,1134.88,881.34,208.81,71.74
4,394.78,352.61,194.35,235.54,17.02,1098.24,781.01,266.84,76.07
...,...,...,...,...,...,...,...,...,...
6134,188.78,162.30,142.65,163.66,15.98,1003.82,1002.47,357.91,50.61
6135,349.87,291.45,77.82,188.26,25.82,925.10,1005.31,104.20,54.24
6136,358.29,22.70,17.99,208.58,34.91,1081.07,792.44,302.76,56.57
6137,445.25,275.59,178.86,191.77,18.07,865.15,833.10,374.63,58.21


In [12]:
# Count the missing values in each column before imputation.
data.isna().sum()

MaterialQuantity       106
AdditiveCatalyst       106
AshComponent           106
Water                  106
Plasticizer            106
ModerateAggregator     106
RefinedAggregator      106
FormulationDuration    106
CompressionStrength      0
dtype: int64

In [13]:
# Replace every missing value with the mean of its column. The result is reassigned
# rather than written with inplace=True, so the cell is an explicit input -> output
# step and does not mutate a frame that earlier cells have already displayed.
# NOTE(review): the means are computed over the whole frame, before the train/test
# split performed later in the notebook, so held-out rows influence the imputed
# values. Consider fitting the imputation on the training portion only.
data = data.fillna(data.mean())

In [14]:
data.isnull().sum()

MaterialQuantity       0
AdditiveCatalyst       0
AshComponent           0
Water                  0
Plasticizer            0
ModerateAggregator     0
RefinedAggregator      0
FormulationDuration    0
CompressionStrength    0
dtype: int64

In [15]:
# Winsorise every column to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): the bounds are computed for all columns, so the last column (used as
# the target further down) is clipped as well, and they are derived before the
# train/test split. Confirm both are intended.
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3.sub(Q1)
lower_bound = Q1.sub(IQR.mul(1.5))
upper_bound = Q3.add(IQR.mul(1.5))

# Bounds are Series indexed by column name, so they are aligned along the columns axis.
data = data.clip(lower_bound, upper_bound, axis="columns")

In [16]:
# Features are every column except the last; the last column is the regression target.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=50,
)

In [38]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows only,
# then apply that same transformation to both partitions.
sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [47]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 150-tree forest on the scaled training features (seeded for repeatability),
# then predict the held-out rows.
rfr = RandomForestRegressor(n_estimators=150, random_state=80).fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

In [48]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate the held-out predictions once and reuse the MSE for the RMSE.
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
metrics_rfr = (
    np.sqrt(mse_rfr),
    mse_rfr,
    mean_absolute_error(y_test, y_pred_rfr),
    r2_score(y_test, y_pred_rfr),
)

# Tab-separated header row followed by the values in the same order.
print("Model\t\t\t RMSE \t\t MSE \t\t\t MAE \t\t R2")
print("""RandomforestReg \t {:.4f} \t\t {:.4f} \t\t{:.4f} \t\t{:.4f}""".format(*metrics_rfr))

Model			 RMSE 		 MSE 			 MAE 		 R2
RandomforestReg 	 12.5832 		 158.3369 		9.8732 		0.3900


In [49]:
import pickle

# Persist the fitted regressor. Pickle files can execute arbitrary code when loaded,
# so they should only be loaded from a trusted location.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rfr, model_file)

# The regressor was fitted on features standardised by `sc`. Without that fitted
# scaler a consumer of model.pkl cannot transform new inputs the same way, so it is
# saved alongside the model. model.pkl itself is unchanged.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)

print("Random Forest model has been trained and saved to 'model.pkl'.")
print("Fitted StandardScaler has been saved to 'scaler.pkl'.")

Random Forest model has been trained and saved to 'model.pkl'.
