In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('cleaned_Concrete.csv')
df

Unnamed: 0,cement,slag,flyash,superplasticizer,coarseaggregate,fineaggregate,csMPa,boxcox_age
0,540.0,0.0,0.0,2.5,1040.0,676.0,79.99,3.619520
1,540.0,0.0,0.0,2.5,1055.0,676.0,61.89,3.619520
2,332.5,142.5,0.0,0.0,932.0,594.0,40.27,6.441153
3,332.5,142.5,0.0,0.0,932.0,594.0,41.05,6.840674
4,198.6,132.4,0.0,0.0,978.4,825.5,44.30,6.822265
...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,8.9,870.1,768.3,44.28,3.619520
1026,322.2,0.0,115.6,10.4,817.9,813.4,31.18,3.619520
1027,148.5,139.4,108.6,6.1,892.4,780.0,23.70,3.619520
1028,159.1,186.7,0.0,11.3,989.6,788.9,32.77,3.619520


In [3]:
X = df.drop(columns = {'csMPa'})
y = df['csMPa']

**Train Test split**

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 53)

In [5]:
#Modelling
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

#Prediction
ypred_train = model.predict(X_train)

#Evaluation
from sklearn.metrics import r2_score

r2_train = r2_score(y_train, ypred_train)

from sklearn.model_selection import cross_val_score

cv = cross_val_score(model, X_train, y_train, cv = 5).mean()

print('R2(train): ', r2_train)
print('CV-Score: ', cv)

R2(train):  0.8078876628316126
CV-Score:  0.8011773457056133


**Variable Significant**

In [6]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size = 0.2, random_state = 53)

import statsmodels.formula.api as smf

model1 = smf.ols('y_train~X_train', data = df_train).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,490.2
Date:,"Tue, 11 Feb 2025",Prob (F-statistic):,2.9499999999999996e-287
Time:,13:10:32,Log-Likelihood:,-2804.1
No. Observations:,824,AIC:,5624.0
Df Residuals:,816,BIC:,5662.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-166.1109,8.464,-19.625,0.000,-182.725,-149.497
X_train[0],0.1610,0.005,33.020,0.000,0.151,0.171
X_train[1],0.1465,0.006,25.070,0.000,0.135,0.158
X_train[2],0.1317,0.008,16.918,0.000,0.116,0.147
X_train[3],0.2447,0.062,3.956,0.000,0.123,0.366
X_train[4],0.0593,0.004,14.253,0.000,0.051,0.067
X_train[5],0.0691,0.005,13.906,0.000,0.059,0.079
X_train[6],7.4958,0.191,39.295,0.000,7.121,7.870

0,1,2,3
Omnibus:,21.203,Durbin-Watson:,1.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.381
Skew:,0.296,Prob(JB):,1.87e-06
Kurtosis:,3.647,Cond. No.,42500.0


---------------------------------------------------------------------------------------------------------

as every column has p < 0.05 ---> No need for any Wrapper Method

Therefore, this is our **Final Model (without any unimportant columns)**.

In [7]:
#Modelling
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

#Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

#Evaluation
from sklearn.metrics import r2_score

r2_train = r2_score(y_train, ypred_train)
r2_test = r2_score(y_test, ypred_test)

from sklearn.model_selection import cross_val_score

cv = cross_val_score(model, X_train, y_train, cv = 5).mean()

print('R2(train): ', r2_train)
print('CV-Score: ', cv)
print('R2(test): ', r2_test)


if (abs(cv - r2_train) <= 0.05) and (abs(r2_train - r2_test) <= 0.05):
  print('Good Model')
else:
  print('Bad Model')

R2(train):  0.8078876628316126
CV-Score:  0.8011773457056133
R2(test):  0.8175539059750273
Good Model
