In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Load the concrete compressive strength dataset from the Kaggle input directory.
csv_path = '/kaggle/input/concrete-compressive-strength-data-set/concrete_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (five is also the pandas default).
df.head(n=5)

In [None]:
# Display pandas' default descriptive statistics for the loaded frame.
df.describe()

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Our data does not have any missing values. Let's see the distribution of the data 

In [None]:
# Draw one horizontal box plot per column to inspect spread and outliers.
for column_name in df.columns:
    fig, ax = plt.subplots(figsize=(14, 4))
    sns.boxplot(data=df, x=column_name, ax=ax)
    plt.show()

The dataset doesn't have many outliers.

In [None]:
# Pairwise scatter plots for every pair of columns, with KDE curves on the diagonal.
sns.pairplot(df, diag_kind='kde')

The features of the dataset are not normally distributed, so we need to transform them to be closer to a normal distribution. We will use Q-Q plots to check the result.

Before making the data normally distributed, let's divide the whole dataset into X and y.

In [None]:
# Display the column labels; the target column name is picked from this list in the next cell.
df.columns

In [None]:
# Separate the feature matrix from the regression target.
target_col = 'concrete_compressive_strength'
y = df[target_col]
X = df.drop(columns=target_col)

Transforming the data towards a normal distribution using scikit-learn's `PowerTransformer` (its default method is Yeo-Johnson, not Box-Cox)

In [None]:
from sklearn.preprocessing import PowerTransformer

# The arguments below are PowerTransformer's defaults, written out so the
# method in use (Yeo-Johnson, with zero-mean/unit-variance scaling) is explicit.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

# NOTE(review): the transformer is fitted on every row of X, so any train/test
# split made from this array has already seen test-set statistics.
X_normal_transformed_arr = pt.fit_transform(X)

In [None]:
from scipy import stats

# Wrap the transformed array in a DataFrame that reuses the original feature names.
X_normal_transformed = pd.DataFrame(
    data=X_normal_transformed_arr,
    columns=X.columns,
)

In [None]:
# For every feature draw a 2x2 grid: Q-Q plots against a normal distribution on
# the top row and KDE plots on the bottom row; raw data on the left, transformed
# data on the right.
for col in X.columns:
    fig, axes = plt.subplots(2, 2, figsize=(16, 5))
    (qq_raw_ax, qq_trans_ax), (kde_raw_ax, kde_trans_ax) = axes

    stats.probplot(x=X[col], dist='norm', plot=qq_raw_ax)
    qq_raw_ax.set_title(col)  # replaces probplot's own default title

    stats.probplot(x=X_normal_transformed[col], dist='norm', plot=qq_trans_ax)
    qq_trans_ax.set_title(col + ' After Transformation')

    sns.kdeplot(x=X[col], ax=kde_raw_ax)
    sns.kdeplot(x=X_normal_transformed[col], ax=kde_trans_ax)
    plt.show()
    
    

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 

In [None]:
# Split the RAW features first, with a fixed seed so every "Run All" produces
# the same train/test partition (one third of the rows are held out).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42
)

# Fit the power transform on the training rows only and reuse the learned
# parameters on the test rows. Fitting on the full dataset before splitting
# would leak test-set statistics into the preprocessing step.
pt_train = PowerTransformer()
X_train = pt_train.fit_transform(X_train_raw)
X_test = pt_train.transform(X_test_raw)

In [None]:
# Baseline model: ordinary least squares. fit() returns the estimator itself,
# so construction and fitting can be chained. The cell displays the score on
# the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Put actual and predicted strengths side by side for inspection.
y_pred = lr.predict(X_test)
comparison_columns = [
    'Actual compressive strength',
    'Predicted compressive strength by LR',
]
lr_pred_df = pd.DataFrame(
    np.column_stack((y_test, y_pred)),
    columns=comparison_columns,
)
lr_pred_df

In [None]:
# Random forest with default hyper-parameters. The seed is fixed because the
# forest bootstraps rows and randomises splits; without it the score shown
# here changes on every run.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

In [None]:
# Candidate values 15..999 inclusive.
# NOTE(review): no other visible cell reads n_range - confirm it is still needed.
n_range = list(range(15, 1000))

Compared to linear regression, the Random Forest Regressor performs better. Let's try to improve it further using grid-search cross-validation and oob_score.

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is intentionally absent from max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, and for regressors it meant "use all
# features", which the None entry already covers.
param_dist = {
    'max_depth': [2, 3, 4, 5],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None],
}

# Exhaustive 10-fold cross-validated search over the grid, using all CPU cores.
cv_rfr = GridSearchCV(rfr, cv=10, param_grid=param_dist, n_jobs=-1)
cv_rfr.fit(X_train, y_train)
print(cv_rfr.best_params_)

In [None]:
# Apply the parameters the grid search actually selected instead of a
# hard-coded copy of one run's output, then refit. set_params() only changes
# the configuration; without a new fit() the score cells that follow would
# still evaluate the previously trained forest.
rfr.set_params(**cv_rfr.best_params_)
rfr.fit(X_train, y_train)

In [None]:
# Score of the random forest on the training split.
rfr.score(X=X_train, y=y_train)

In [None]:
# Score of the random forest on the held-out test split.
rfr.score(X=X_test, y=y_test)

Clearly, the random forest regressor is better than linear regression. Let's try the AdaBoost regressor on this dataset and see which performs better.

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 1500 weak learners, a small learning rate and squared loss.
# The seed is fixed because AdaBoost resamples the training set at each
# boosting round; without it the two scores below differ between runs.
adbr = AdaBoostRegressor(
    n_estimators=1500,
    learning_rate=0.1,
    loss='square',
    random_state=42,
)
adbr.fit(X_train, y_train)
print(adbr.score(X_train, y_train))  # training split
print(adbr.score(X_test, y_test))    # held-out test split

Clearly, the accuracy of the AdaBoost regressor is lower than that of the Random Forest Regressor. Let's try the gradient boosting regressor.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Manual early stopping for gradient boosting.
# A validation subset is carved out of the TRAINING data to choose the number
# of trees. Choosing it on the test set would make the test score reported
# afterwards optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# warm_start=True lets each fit() add trees to the existing ensemble instead
# of retraining from scratch.
grbst = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)
min_val_error = float('inf')
best_n_estimators = 1
error_going_up = 0
for n_estimators in range(1, 1500):
    grbst.n_estimators = n_estimators
    grbst.fit(X_fit, y_fit)
    error = mean_squared_error(y_val, grbst.predict(X_val))
    if error < min_val_error:
        min_val_error = error
        best_n_estimators = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            # Five consecutive rounds without improvement: stop adding trees.
            break

# When the loop breaks, the ensemble holds the extra trees added after the
# best round. A warm-started model cannot shrink, so turn warm_start off and
# refit on the full training split with the best tree count.
grbst.set_params(n_estimators=best_n_estimators, warm_start=False)
grbst.fit(X_train, y_train);

In [None]:
# Display the n_estimators value the early-stopping cell left on the model.
grbst.n_estimators

In [None]:
# Print the gradient boosting score on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(grbst.score(features, target))

As you can see, the performance of the gradient boosting regressor is similar to that of the RandomForestRegressor.

Let's try the Extreme Gradient Boosting (XGBoost) regressor

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with library defaults; prints the training score on the
# first line and the test score on the second.
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)
print(
    xgbr.score(X_train, y_train),
    xgbr.score(X_test, y_test),
    sep='\n',
)

Hence, the maximum accuracy is given by the XGBoost regressor, so I would deploy a model based on this algorithm.