In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned concrete dataset from the working directory and display it.
df = pd.read_csv(filepath_or_buffer='cleaned_Concrete.csv')
df

Unnamed: 0,cement,slag,flyash,superplasticizer,coarseaggregate,fineaggregate,csMPa,boxcox_age
0,540.0,0.0,0.0,2.5,1040.0,676.0,79.99,3.619520
1,540.0,0.0,0.0,2.5,1055.0,676.0,61.89,3.619520
2,332.5,142.5,0.0,0.0,932.0,594.0,40.27,6.441153
3,332.5,142.5,0.0,0.0,932.0,594.0,41.05,6.840674
4,198.6,132.4,0.0,0.0,978.4,825.5,44.30,6.822265
...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,8.9,870.1,768.3,44.28,3.619520
1026,322.2,0.0,115.6,10.4,817.9,813.4,31.18,3.619520
1027,148.5,139.4,108.6,6.1,892.4,780.0,23.70,3.619520
1028,159.1,186.7,0.0,11.3,989.6,788.9,32.77,3.619520


In [3]:
# Separate the target column `csMPa` from the remaining predictor columns.
y = df['csMPa']
X = df.drop(columns=['csMPa'])

**Train Test Split**

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=53,
)

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# Expand the training predictors into all degree-2 terms (originals, squares and
# pairwise products). include_bias=False omits the constant column, leaving the
# intercept to the regression itself.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_train = pd.DataFrame(
    poly.fit_transform(X_train),
    columns=poly.get_feature_names_out(),
)

# Ordinary least squares on the expanded features, then in-sample predictions.
model = LinearRegression().fit(X_poly_train, y_train)
ypred_train = model.predict(X_poly_train)

# In-sample R2 versus the mean R2 over 5 cross-validation folds of the training data.
r2_train = r2_score(y_train, ypred_train)
cv = cross_val_score(model, X_poly_train, y_train, scoring='r2', cv=5).mean()

print('Degree: 2')
print('R2(train): ', r2_train)
print('CV-Score: ', cv)

Degree: 2
R2(train):  0.8766404596361661
CV-Score:  0.8541771933671489


**Variable Significance**

In [6]:
# Display the expanded (degree-2) training feature matrix.
X_poly_train

Unnamed: 0,cement,slag,flyash,superplasticizer,coarseaggregate,fineaggregate,boxcox_age,cement^2,cement slag,cement flyash,...,superplasticizer^2,superplasticizer coarseaggregate,superplasticizer fineaggregate,superplasticizer boxcox_age,coarseaggregate^2,coarseaggregate fineaggregate,coarseaggregate boxcox_age,fineaggregate^2,fineaggregate boxcox_age,boxcox_age^2
0,275.0,0.0,0.0,0.0,1088.0,808.0,3.619520,75625.00,0.0,0.00,...,0.00,0.00,0.00,0.000000,1183744.00,879104.00,3938.038222,652864.00,2924.572503,13.100928
1,150.7,0.0,185.3,15.6,1074.5,678.0,3.619520,22710.49,0.0,27924.71,...,243.36,16762.20,10576.80,56.464519,1154550.25,728511.00,3889.174696,459684.00,2454.034848,13.100928
2,288.0,192.0,0.0,0.0,932.0,717.8,3.619520,82944.00,55296.0,0.00,...,0.00,0.00,0.00,0.000000,868624.00,668989.60,3373.393036,515236.84,2598.091761,13.100928
3,190.3,0.0,125.2,9.9,1088.1,802.6,5.165842,36214.09,0.0,23825.56,...,98.01,10772.19,7945.74,51.141841,1183961.61,873309.06,5620.953198,644166.76,4146.105171,26.685928
4,255.0,0.0,0.0,0.0,889.8,945.0,2.041652,65025.00,0.0,0.00,...,0.00,0.00,0.00,0.000000,791744.04,840861.00,1816.661530,893025.00,1929.360694,4.168341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,491.0,26.0,123.0,3.9,822.0,699.0,3.619520,241081.00,12766.0,60393.00,...,15.21,3205.80,2726.10,14.116130,675684.00,574578.00,2975.245789,488601.00,2530.044777,13.100928
820,157.0,236.0,0.0,0.0,935.4,781.2,3.619520,24649.00,37052.0,0.00,...,0.00,0.00,0.00,0.000000,874973.16,730734.48,3385.699405,610273.44,2827.569356,13.100928
821,178.0,129.8,118.6,3.6,1007.3,746.8,2.817196,31684.00,23104.4,21110.80,...,12.96,3626.28,2688.48,10.141906,1014653.29,752251.64,2837.761760,557710.24,2103.882143,7.936595
822,153.0,145.0,113.0,8.0,867.0,824.0,3.619520,23409.00,22185.0,17289.00,...,64.00,6936.00,6592.00,28.956163,751689.00,714408.00,3138.124208,678976.00,2982.484830,13.100928


In [7]:
# Replace the row labels carried over from the split with a fresh 0..n-1 index so
# the target lines up row-for-row with X_poly_train (which was rebuilt with a
# default index). reset_index() moves the old labels into an 'index' column, which
# is then discarded; the result is a one-column DataFrame.
y_train = y_train.reset_index().drop(columns=['index'])
y_train

Unnamed: 0,csMPa
0,24.50
1,13.46
2,38.80
3,40.39
4,10.22
...,...
819,57.92
820,33.66
821,34.24
822,26.23


In [8]:
# Show every column when a frame is displayed; the expanded frame is wide.
pd.set_option('display.max_columns', None)

# Put the polynomial features and the target side by side in a single frame.
df_poly_train = pd.concat(objs=[X_poly_train, y_train], axis='columns')
df_poly_train

Unnamed: 0,cement,slag,flyash,superplasticizer,coarseaggregate,fineaggregate,boxcox_age,cement^2,cement slag,cement flyash,cement superplasticizer,cement coarseaggregate,cement fineaggregate,cement boxcox_age,slag^2,slag flyash,slag superplasticizer,slag coarseaggregate,slag fineaggregate,slag boxcox_age,flyash^2,flyash superplasticizer,flyash coarseaggregate,flyash fineaggregate,flyash boxcox_age,superplasticizer^2,superplasticizer coarseaggregate,superplasticizer fineaggregate,superplasticizer boxcox_age,coarseaggregate^2,coarseaggregate fineaggregate,coarseaggregate boxcox_age,fineaggregate^2,fineaggregate boxcox_age,boxcox_age^2,csMPa
0,275.0,0.0,0.0,0.0,1088.0,808.0,3.619520,75625.00,0.0,0.00,0.00,299200.00,222200.00,995.368117,0.00,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000,1183744.00,879104.00,3938.038222,652864.00,2924.572503,13.100928,24.50
1,150.7,0.0,185.3,15.6,1074.5,678.0,3.619520,22710.49,0.0,27924.71,2350.92,161927.15,102174.60,545.461728,0.00,0.00,0.00,0.00,0.00,0.000000,34336.09,2890.68,199104.85,125633.40,670.697135,243.36,16762.20,10576.80,56.464519,1154550.25,728511.00,3889.174696,459684.00,2454.034848,13.100928,13.46
2,288.0,192.0,0.0,0.0,932.0,717.8,3.619520,82944.00,55296.0,0.00,0.00,268416.00,206726.40,1042.421882,36864.00,0.00,0.00,178944.00,137817.60,694.947922,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000,868624.00,668989.60,3373.393036,515236.84,2598.091761,13.100928,38.80
3,190.3,0.0,125.2,9.9,1088.1,802.6,5.165842,36214.09,0.0,23825.56,1883.97,207065.43,152734.78,983.059823,0.00,0.00,0.00,0.00,0.00,0.000000,15675.04,1239.48,136230.12,100485.52,646.763478,98.01,10772.19,7945.74,51.141841,1183961.61,873309.06,5620.953198,644166.76,4146.105171,26.685928,40.39
4,255.0,0.0,0.0,0.0,889.8,945.0,2.041652,65025.00,0.0,0.00,0.00,226899.00,240975.00,520.621140,0.00,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000,791744.04,840861.00,1816.661530,893025.00,1929.360694,4.168341,10.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,491.0,26.0,123.0,3.9,822.0,699.0,3.619520,241081.00,12766.0,60393.00,1914.90,403602.00,343209.00,1777.184528,676.00,3198.00,101.40,21372.00,18174.00,94.107531,15129.00,479.70,101106.00,85977.00,445.201012,15.21,3205.80,2726.10,14.116130,675684.00,574578.00,2975.245789,488601.00,2530.044777,13.100928,57.92
820,157.0,236.0,0.0,0.0,935.4,781.2,3.619520,24649.00,37052.0,0.00,0.00,146857.80,122648.40,568.264707,55696.00,0.00,0.00,220754.40,184363.20,854.206820,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000,874973.16,730734.48,3385.699405,610273.44,2827.569356,13.100928,33.66
821,178.0,129.8,118.6,3.6,1007.3,746.8,2.817196,31684.00,23104.4,21110.80,640.80,179299.40,132930.40,501.460929,16848.04,15394.28,467.28,130747.54,96934.64,365.672070,14065.96,426.96,119465.78,88570.48,334.119473,12.96,3626.28,2688.48,10.141906,1014653.29,752251.64,2837.761760,557710.24,2103.882143,7.936595,34.24
822,153.0,145.0,113.0,8.0,867.0,824.0,3.619520,23409.00,22185.0,17289.00,1224.00,132651.00,126072.00,553.786625,21025.00,16385.00,1160.00,125715.00,119480.00,524.830462,12769.00,904.00,97971.00,93112.00,409.005808,64.00,6936.00,6592.00,28.956163,751689.00,714408.00,3138.124208,678976.00,2982.484830,13.100928,26.23


In [9]:
import statsmodels.formula.api as smf

# Fit OLS through the formula interface to obtain per-coefficient p-values.
# Neither name in the formula is a column of df_poly_train, so both resolve to the
# notebook variables; X_poly_train enters as a single matrix term, which is why
# the summary labels its coefficients by position (X_poly_train[0], [1], ...).
model = smf.ols(
    formula='y_train~X_poly_train',
    data=df_poly_train,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.877
Model:,OLS,Adj. R-squared:,0.871
Method:,Least Squares,F-statistic:,160.0
Date:,"Tue, 11 Feb 2025",Prob (F-statistic):,0.0
Time:,13:10:50,Log-Likelihood:,-2621.6
No. Observations:,824,AIC:,5315.0
Df Residuals:,788,BIC:,5485.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1116.1948,198.154,5.633,0.000,727.223,1505.167
X_poly_train[0],-0.9899,0.205,-4.829,0.000,-1.392,-0.588
X_poly_train[1],-1.0673,0.245,-4.361,0.000,-1.548,-0.587
X_poly_train[2],-1.9848,0.334,-5.950,0.000,-2.640,-1.330
X_poly_train[3],-3.1030,2.371,-1.309,0.191,-7.757,1.551
X_poly_train[4],-1.2369,0.197,-6.294,0.000,-1.623,-0.851
X_poly_train[5],-0.7825,0.194,-4.025,0.000,-1.164,-0.401
X_poly_train[6],-17.4080,6.408,-2.717,0.007,-29.986,-4.830
X_poly_train[7],0.0002,6.12e-05,3.200,0.001,7.58e-05,0.000

0,1,2,3
Omnibus:,32.894,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.515
Skew:,0.142,Prob(JB):,1.99e-18
Kurtosis:,4.514,Cond. No.,1350000000.0


-------------------------------------------------------------------------------

**So, index {3, 10, 14, 16, 21, 27, 28, 32} --> p > 0.05**

The column at index 21 (flyash superplasticizer) has the greatest p-value (p = 0.698), so let's drop it first.

In [10]:
# Remove the least significant term from the preceding OLS summary.
# The column is dropped by name rather than by position: a positional drop
# (columns[21]) silently removes a *different* column every time the cell is
# re-run, because the remaining columns shift left after each drop.
# errors='ignore' turns a re-run into a no-op instead of a KeyError.
df_poly_train = df_poly_train.drop(columns=['flyash superplasticizer'], errors='ignore')

In [11]:
# Re-derive the feature matrix from the reduced training frame.
X_poly_train = df_poly_train.drop(columns=['csMPa'])
# Store the target under the name the modelling cells actually read. Assigning it
# to `y` would overwrite the full-dataset target with the training subset while
# leaving `y_train` untouched.
y_train = df_poly_train['csMPa']

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Refit ordinary least squares on the reduced feature set and predict in-sample.
model = LinearRegression().fit(X_poly_train, y_train)
ypred_train = model.predict(X_poly_train)

# In-sample R2 versus the mean R2 over 5 cross-validation folds of the training data.
r2_train = r2_score(y_train, ypred_train)
cv = cross_val_score(model, X_poly_train, y_train, scoring='r2', cv=5).mean()

print('Degree: 2')
print('R2(train): ', r2_train)
print('CV-Score: ', cv)

Degree: 2
R2(train):  0.8766169435171227
CV-Score:  0.8554855912736127


In [13]:
import statsmodels.formula.api as smf

# Re-run the OLS significance check on the reduced feature matrix. The formula
# names resolve to the notebook variables, so coefficients are labelled by their
# position in X_poly_train (X_poly_train[0], [1], ...), not by column name.
model = smf.ols(
    formula='y_train~X_poly_train',
    data=df_poly_train,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.877
Model:,OLS,Adj. R-squared:,0.871
Method:,Least Squares,F-statistic:,164.9
Date:,"Tue, 11 Feb 2025",Prob (F-statistic):,0.0
Time:,13:10:50,Log-Likelihood:,-2621.7
No. Observations:,824,AIC:,5313.0
Df Residuals:,789,BIC:,5478.0
Df Model:,34,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1092.6529,188.513,5.796,0.000,722.606,1462.700
X_poly_train[0],-0.9588,0.189,-5.086,0.000,-1.329,-0.589
X_poly_train[1],-1.0289,0.224,-4.599,0.000,-1.468,-0.590
X_poly_train[2],-1.9128,0.277,-6.909,0.000,-2.456,-1.369
X_poly_train[3],-3.8243,1.468,-2.605,0.009,-6.707,-0.942
X_poly_train[4],-1.2230,0.193,-6.332,0.000,-1.602,-0.844
X_poly_train[5],-0.7553,0.181,-4.169,0.000,-1.111,-0.400
X_poly_train[6],-17.1552,6.371,-2.693,0.007,-29.661,-4.649
X_poly_train[7],0.0002,5.82e-05,3.242,0.001,7.44e-05,0.000

0,1,2,3
Omnibus:,33.48,Durbin-Watson:,1.832
Prob(Omnibus):,0.0,Jarque-Bera (JB):,83.361
Skew:,0.147,Prob(JB):,7.91e-19
Kurtosis:,4.53,Cond. No.,1280000000.0


**So, index {14, 26, 27, 31} --> p > 0.05**

The column at index 31 (fineaggregate^2) has the greatest p-value (p = 0.546), so let's drop it first.

In [14]:
# Remove the least significant remaining term by name. A positional drop
# (columns[31]) depends on exactly which columns were removed before it and
# silently removes a different column if the cell is re-run.
# errors='ignore' turns a re-run into a no-op instead of a KeyError.
df_poly_train = df_poly_train.drop(columns=['fineaggregate^2'], errors='ignore')

In [15]:
# Re-derive the feature matrix from the reduced training frame.
X_poly_train = df_poly_train.drop(columns=['csMPa'])
# Store the target under the name the modelling cells actually read. Assigning it
# to `y` would overwrite the full-dataset target with the training subset while
# leaving `y_train` untouched.
y_train = df_poly_train['csMPa']

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Refit ordinary least squares on the reduced feature set and predict in-sample.
model = LinearRegression().fit(X_poly_train, y_train)
ypred_train = model.predict(X_poly_train)

# In-sample R2 versus the mean R2 over 5 cross-validation folds of the training data.
r2_train = r2_score(y_train, ypred_train)
cv = cross_val_score(model, X_poly_train, y_train, scoring='r2', cv=5).mean()

print('Degree: 2')
print('R2(train): ', r2_train)
print('CV-Score: ', cv)

Degree: 2
R2(train):  0.8765599593236217
CV-Score:  0.8557391068346927


In [17]:
import statsmodels.formula.api as smf

# Re-run the OLS significance check on the reduced feature matrix. The formula
# names resolve to the notebook variables, so coefficients are labelled by their
# position in X_poly_train (X_poly_train[0], [1], ...), not by column name.
model = smf.ols(
    formula='y_train~X_poly_train',
    data=df_poly_train,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.877
Model:,OLS,Adj. R-squared:,0.871
Method:,Least Squares,F-statistic:,170.0
Date:,"Tue, 11 Feb 2025",Prob (F-statistic):,0.0
Time:,13:10:51,Log-Likelihood:,-2621.9
No. Observations:,824,AIC:,5312.0
Df Residuals:,790,BIC:,5472.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1020.3250,145.479,7.014,0.000,734.754,1305.896
X_poly_train[0],-0.8963,0.157,-5.693,0.000,-1.205,-0.587
X_poly_train[1],-0.9476,0.179,-5.306,0.000,-1.298,-0.597
X_poly_train[2],-1.8269,0.237,-7.696,0.000,-2.293,-1.361
X_poly_train[3],-4.2297,1.305,-3.240,0.001,-6.792,-1.667
X_poly_train[4],-1.1802,0.180,-6.572,0.000,-1.533,-0.828
X_poly_train[5],-0.6590,0.086,-7.657,0.000,-0.828,-0.490
X_poly_train[6],-15.9979,6.073,-2.634,0.009,-27.919,-4.077
X_poly_train[7],0.0002,5.38e-05,3.260,0.001,6.97e-05,0.000

0,1,2,3
Omnibus:,33.9,Durbin-Watson:,1.832
Prob(Omnibus):,0.0,Jarque-Bera (JB):,83.662
Skew:,0.158,Prob(JB):,6.81e-19
Kurtosis:,4.529,Cond. No.,896000000.0


**So, index {6, 14} --> p > 0.05**

The column at index 6 has the greatest p-value (p = 0.229), so let's drop it first.

In [18]:
# Remove the column that sits at position 6 of the training frame, by name.
# A positional drop (columns[6]) silently removes a different column if the cell
# is re-run; errors='ignore' turns a re-run into a no-op instead of a KeyError.
# NOTE(review): the preceding OLS summary lists X_poly_train[6] with p = 0.009
# (below 0.05), not the p = 0.229 quoted in the note above this cell. Confirm
# that 'boxcox_age' is really the term meant to be removed here.
df_poly_train = df_poly_train.drop(columns=['boxcox_age'], errors='ignore')

In [19]:
# Re-derive target and features from the reduced training frame.
y_train = df_poly_train['csMPa']
X_poly_train = df_poly_train.drop(columns=['csMPa'])

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Refit ordinary least squares on the reduced feature set and predict in-sample.
model = LinearRegression().fit(X_poly_train, y_train)
ypred_train = model.predict(X_poly_train)

# In-sample R2 versus the mean R2 over 5 cross-validation folds of the training data.
r2_train = r2_score(y_train, ypred_train)
cv = cross_val_score(model, X_poly_train, y_train, scoring='r2', cv=5).mean()

print('Degree: 2')
print('R2(train): ', r2_train)
print('CV-Score: ', cv)

Degree: 2
R2(train):  0.8754757111365741
CV-Score:  0.8567343587673207


In [21]:
import statsmodels.formula.api as smf

# Re-run the OLS significance check on the reduced feature matrix. The formula
# names resolve to the notebook variables, so coefficients are labelled by their
# position in X_poly_train (X_poly_train[0], [1], ...), not by column name.
model = smf.ols(
    formula='y_train~X_poly_train',
    data=df_poly_train,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.875
Model:,OLS,Adj. R-squared:,0.87
Method:,Least Squares,F-statistic:,173.8
Date:,"Tue, 11 Feb 2025",Prob (F-statistic):,0.0
Time:,13:10:51,Log-Likelihood:,-2625.5
No. Observations:,824,AIC:,5317.0
Df Residuals:,791,BIC:,5473.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,920.4385,140.977,6.529,0.000,643.706,1197.171
X_poly_train[0],-0.8292,0.156,-5.317,0.000,-1.135,-0.523
X_poly_train[1],-0.8667,0.177,-4.908,0.000,-1.213,-0.520
X_poly_train[2],-1.7207,0.235,-7.329,0.000,-2.182,-1.260
X_poly_train[3],-4.2896,1.310,-3.275,0.001,-6.861,-1.718
X_poly_train[4],-1.1164,0.179,-6.251,0.000,-1.467,-0.766
X_poly_train[5],-0.5990,0.083,-7.190,0.000,-0.763,-0.435
X_poly_train[6],0.0002,5.38e-05,3.075,0.002,5.99e-05,0.000
X_poly_train[7],0.0004,0.000,3.187,0.001,0.000,0.001

0,1,2,3
Omnibus:,30.102,Durbin-Watson:,1.822
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75.188
Skew:,0.089,Prob(JB):,4.71e-17
Kurtosis:,4.469,Cond. No.,865000000.0


**So, index {13} --> p > 0.05** --> Let's drop it

Dropping more columns leads to an increase in p for other columns, so let's stop here; this is our final model for degree = 2.

# **Final Model**

In [22]:
# Expand the test predictors with the transformer that was fitted on the training
# data. Test data must only be transformed, never used to re-fit a preprocessing
# step, so this calls transform() rather than fit_transform().
X_poly_test = pd.DataFrame(poly.transform(X_test), columns=poly.get_feature_names_out())

In [23]:
# Give the test target a fresh 0..n-1 index so it lines up row-for-row with
# X_poly_test; the old row labels land in an 'index' column that is discarded.
y_test = y_test.reset_index().drop(columns=['index'])

In [24]:
# Put the test polynomial features and the test target side by side in one frame.
df_poly_test = pd.concat(objs=[X_poly_test, y_test], axis='columns')
df_poly_test

Unnamed: 0,cement,slag,flyash,superplasticizer,coarseaggregate,fineaggregate,boxcox_age,cement^2,cement slag,cement flyash,cement superplasticizer,cement coarseaggregate,cement fineaggregate,cement boxcox_age,slag^2,slag flyash,slag superplasticizer,slag coarseaggregate,slag fineaggregate,slag boxcox_age,flyash^2,flyash superplasticizer,flyash coarseaggregate,flyash fineaggregate,flyash boxcox_age,superplasticizer^2,superplasticizer coarseaggregate,superplasticizer fineaggregate,superplasticizer boxcox_age,coarseaggregate^2,coarseaggregate fineaggregate,coarseaggregate boxcox_age,fineaggregate^2,fineaggregate boxcox_age,boxcox_age^2,csMPa
0,136.0,162.0,126.0,10.0,923.0,764.0,3.619520,18496.00,22032.00,17136.0,1360.0,125528.0,103904.00,492.254778,26244.00,20412.0,1620.00,149526.00,123768.00,586.362309,15876.0,1260.0,116298.0,96264.0,456.059573,100.00,9230.00,7640.00,36.195204,851929.00,705172.00,3340.817352,583696.00,2765.313604,13.100928,29.07
1,304.0,140.0,0.0,6.0,895.0,722.0,3.619520,92416.00,42560.00,0.0,1824.0,272080.0,219488.00,1100.334209,19600.00,0.0,840.00,125300.00,101080.00,506.732859,0.0,0.0,0.0,0.0,0.000000,36.00,5370.00,4332.00,21.717123,801025.00,646190.00,3239.470780,521284.00,2613.293747,13.100928,33.42
2,155.0,183.0,0.0,9.0,1047.0,697.0,3.619520,24025.00,28365.00,0.0,1395.0,162285.0,108035.00,561.025666,33489.00,0.0,1647.00,191601.00,127551.00,662.372238,0.0,0.0,0.0,0.0,0.000000,81.00,9423.00,6273.00,32.575684,1096209.00,729759.00,3789.637885,485809.00,2522.805736,13.100928,18.28
3,203.5,135.7,0.0,0.0,1076.2,759.3,3.619520,41412.25,27614.95,0.0,0.0,219006.7,154517.55,736.572406,18414.49,0.0,0.00,146040.34,103037.01,491.168922,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.00,0.000000,1158206.44,817158.66,3895.327881,576536.49,2748.301858,13.100928,22.63
4,393.0,0.0,0.0,0.0,940.6,785.6,1.128705,154449.00,0.00,0.0,0.0,369655.8,308740.80,443.581049,0.00,0.0,0.00,0.00,0.00,0.000000,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.00,0.000000,884728.36,738935.36,1061.659885,617167.36,886.710616,1.273975,19.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,425.0,106.3,0.0,16.5,852.1,887.1,3.619520,180625.00,45177.50,0.0,7012.5,362142.5,377017.50,1538.296180,11299.69,0.0,1753.95,90578.23,94298.73,384.755021,0.0,0.0,0.0,0.0,0.000000,272.25,14059.65,14637.15,59.722087,726074.41,755897.91,3084.193354,786946.41,3210.876569,13.100928,60.29
202,331.0,0.0,0.0,0.0,978.0,825.0,2.041652,109561.00,0.00,0.0,0.0,323718.0,273075.00,675.786656,0.00,0.0,0.00,0.00,0.00,0.000000,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.00,0.000000,956484.00,806850.00,1996.735195,680625.00,1684.362511,4.168341,16.26
203,255.5,170.3,0.0,0.0,1026.6,724.3,3.619520,65280.25,43511.65,0.0,0.0,262296.3,185058.65,924.787468,29002.09,0.0,0.00,174829.98,123348.29,616.404328,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.00,0.000000,1053907.56,743566.38,3715.799668,524610.49,2621.618644,13.100928,32.05
204,145.0,0.0,134.0,11.0,979.0,812.0,3.619520,21025.00,0.00,19430.0,1595.0,141955.0,117740.00,524.830462,0.00,0.0,0.00,0.00,0.00,0.000000,17956.0,1474.0,131186.0,108808.0,485.015737,121.00,10769.00,8932.00,39.814725,958441.00,794948.00,3543.510496,659344.00,2939.050585,13.100928,13.20


In [25]:
# Keep exactly the columns that survived feature elimination on the training frame
# (same names, same order). Selecting by the training columns replaces three
# positional drops whose indices were only valid in one specific order, and it
# guarantees the test features match what the model was fitted on. Re-running
# this cell is a no-op.
df_poly_test = df_poly_test.loc[:, df_poly_train.columns]

In [26]:
# Split the aligned test frame back into target and features.
y_test = df_poly_test['csMPa']
X_poly_test = df_poly_test.drop(columns=['csMPa'])

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Final fit on the reduced training features.
model = LinearRegression().fit(X_poly_train, y_train)

# Predictions for both splits.
ypred_train = model.predict(X_poly_train)
ypred_test = model.predict(X_poly_test)

# R2 on the training and held-out test data, plus the mean R2 over 5
# cross-validation folds of the training data.
r2_train = r2_score(y_train, ypred_train)
r2_test = r2_score(y_test, ypred_test)
cv = cross_val_score(model, X_poly_train, y_train, scoring='r2', cv=5).mean()

print('Degree: 2')
print('R2(train): ', r2_train)
print('CV-Score: ', cv)
print('R2(test): ', r2_test)

# The model is accepted only when both the CV score and the test R2 lie within
# 0.05 of the training R2.
cv_close = abs(cv - r2_train) <= 0.05
test_close = abs(r2_train - r2_test) <= 0.05
print('Good Model' if (cv_close and test_close) else 'Bad Model')

Degree: 2
R2(train):  0.8754757111365741
CV-Score:  0.8567343587673207
R2(test):  0.8658981823112474
Good Model
