In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNetCV
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Load the concrete dataset into a DataFrame.
# NOTE(review): the absolute /kaggle/input/... path presumably only resolves inside a Kaggle kernel — confirm before running elsewhere.
df=pd.read_csv('/kaggle/input/regression-with-neural-networking/concrete_data.csv')

In [None]:
# Summarise the columns of the freshly loaded frame (dtypes and non-null counts).
df.info()

In [None]:
# Report the dataset dimensions in a single sentence.
n_rows, n_cols = df.shape
print(f'The number of rows are {n_rows} and columns are {n_cols}')

In [None]:
# List the column labels so later cells can reference them by exact name.
df.columns

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Descriptive statistics for every column (include='all' keeps non-numeric columns too, if any).
df.describe(include='all')

In [None]:
# Box plot of Blast Furnace Slag to eyeball outliers before the IQR filtering.
# The Series is passed as `x=` explicitly: a bare positional argument is read
# as `x` (with a FutureWarning) by seaborn 0.11 but as wide-form `data` by
# seaborn >= 0.12, so orientation and axis label used to depend on the version.
sns.boxplot(x=df['Blast Furnace Slag'])

In [None]:
# Box plot of Age to eyeball outliers.
# `x=` is explicit because the meaning of a bare positional Series changed
# between seaborn 0.11 (`x`) and seaborn >= 0.12 (wide-form `data`).
sns.boxplot(x=df['Age'])

In [None]:
# Box plot of Superplasticizer to eyeball outliers.
# `x=` is explicit because the meaning of a bare positional Series changed
# between seaborn 0.11 (`x`) and seaborn >= 0.12 (wide-form `data`).
sns.boxplot(x=df['Superplasticizer'])

In [None]:
# Box plot of Fly Ash to eyeball outliers.
# `x=` is explicit because the meaning of a bare positional Series changed
# between seaborn 0.11 (`x`) and seaborn >= 0.12 (wide-form `data`).
sns.boxplot(x=df['Fly Ash'])

In [None]:
# Box plot of Cement to eyeball outliers.
# `x=` is explicit because the meaning of a bare positional Series changed
# between seaborn 0.11 (`x`) and seaborn >= 0.12 (wide-form `data`).
sns.boxplot(x=df['Cement'])

In [None]:
# IQR (Tukey fence) outlier filter on Blast Furnace Slag: keep only rows whose
# value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
# NOTE: rebinding `df` makes this cell non-idempotent — re-running it
# recomputes the quartiles on the already-filtered frame.
slag = df['Blast Furnace Slag']
q1, q3 = slag.quantile(0.25), slag.quantile(0.75)

# Interquartile range and the fences derived from it.
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Series.between is inclusive on both ends, matching `>=` / `<=`.
df = df[slag.between(lower_bound, upper_bound)]

In [None]:
# IQR (Tukey fence) outlier filter on Age: keep only rows whose value lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
# NOTE: rebinding `df` makes this cell non-idempotent — re-running it
# recomputes the quartiles on the already-filtered frame.
age = df['Age']
q1, q3 = age.quantile(0.25), age.quantile(0.75)

# Interquartile range and the fences derived from it.
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Series.between is inclusive on both ends, matching `>=` / `<=`.
df = df[age.between(lower_bound, upper_bound)]

In [None]:
# IQR (Tukey fence) outlier filter on Superplasticizer: keep only rows whose
# value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
# NOTE: rebinding `df` makes this cell non-idempotent — re-running it
# recomputes the quartiles on the already-filtered frame.
superplasticizer = df['Superplasticizer']
q1, q3 = superplasticizer.quantile(0.25), superplasticizer.quantile(0.75)

# Interquartile range and the fences derived from it.
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Series.between is inclusive on both ends, matching `>=` / `<=`.
df = df[superplasticizer.between(lower_bound, upper_bound)]

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Bucket Age into (up to) 10 quantile-based bins, encoded as ordinal codes,
# and store the result in a new `age_bin` column.
age_discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

# `df` is the product of boolean filtering in the cells above; writing a column
# into it with `df[...] = ...` raises SettingWithCopyWarning. `assign` builds a
# new frame instead, and also keeps this cell safe to re-run.
# fit_transform returns an (n_rows, 1) array, hence the ravel().
df = df.assign(age_bin=age_discretizer.fit_transform(df[['Age']]).ravel())

# NOTE(review): the bin edges are learned from all rows, i.e. before the
# train/test split below — consider fitting the discretizer on the training
# portion only to avoid leakage.
df.head()

In [None]:
# Target is the Strength column; features are everything else except the raw
# Age column (its binned counterpart `age_bin` stays in X).
y = df['Strength']
X = df.drop(columns=['Strength', 'Age'])

In [None]:
# Sanity-check the first five target values.
y.head(5)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Baseline: ordinary least squares on the untransformed features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Hold-out R^2 of the baseline (shown as the cell output).
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Cross-check the baseline: mean R^2 of a fresh linear model across the
# cross-validation folds, computed on the full (unsplit) X and y.
lr = LinearRegression()
cross_val_score(lr, X, y, scoring='r2').mean()

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
# Applying Box-Cox Transform
# The transformer is fitted on the training features only and then applied to
# the test features, so the test rows do not influence the fitted lambdas.
pt = PowerTransformer(method='box-cox')

# A tiny constant is added to every value before transforming.
# NOTE(review): presumably this is to make zero entries strictly positive, as
# Box-Cox requires — confirm; method='yeo-johnson' would avoid the offset.
X_train_transformed = pt.fit_transform(X_train+0.000001)
X_test_transformed = pt.transform(X_test+0.000001)

# Tabulate the lambda fitted for each feature.
pd.DataFrame({'cols':X_train.columns,'box_cox_lambdas':pt.lambdas_})



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, probplot
import pandas as pd
import numpy as np

# Per-feature normality check on the *untransformed* training features:
# histogram + normal Q-Q plot side by side, plus a Shapiro-Wilk test verdict.
for col in X_train.columns:
    # Clean a local copy instead of writing back into X_train: X_train is a
    # slice produced by train_test_split (in-place assignment triggers
    # SettingWithCopyWarning), and the same cleaned values must feed the
    # histogram, the Q-Q plot and the test (probplot cannot handle NaN).
    values = X_train[col].replace([np.inf, -np.inf], np.nan).dropna()

    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

    # Let seaborn choose the bin width: a fixed width of 20 collapses
    # small-range features (e.g. age_bin, Superplasticizer) into one bar.
    sns.histplot(x=values, kde=False, ax=ax_hist)
    ax_hist.set_title(col)
    ax_hist.set_xlabel("Value")
    ax_hist.set_ylabel("Frequency")

    probplot(values, dist="norm", plot=ax_qq)
    ax_qq.set_title(col)
    ax_qq.set_xlabel("Theoretical Quantiles")
    ax_qq.set_ylabel("Quantiles")

    # Shapiro-Wilk: p > 0.05 means normality is not rejected at the 5% level.
    stat, p = shapiro(values)
    if p > 0.05:
        print(f"{col} is normally distributed (Shapiro-Wilk test, p-value={p:.4f})")
    else:
        print(f"{col} is not normally distributed (Shapiro-Wilk test, p-value={p:.4f})")

    plt.show()


In [None]:
# Same linear model as the baseline, now trained on the Box-Cox-transformed features.
lr = LinearRegression().fit(X_train_transformed, y_train)
y_pred2 = lr.predict(X_test_transformed)

# Hold-out R^2 after the transform (shown as the cell output).
r2_score(y_true=y_test, y_pred=y_pred2)

In [None]:


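# Disabled cross-validation of the Box-Cox model. As written it cannot run:
# X_train_transformed holds only the training rows while y holds every row,
# so the two lengths differ. Pass y_train (or transform the full X) to re-enable.
#lr = LinearRegression()
#np.mean(cross_val_score(lr,X_train_transformed,y,scoring='r2'))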

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Wrap BOTH transformed matrices as DataFrames carrying the original column
# names and row index. Wrapping only the training matrix would make every
# later estimator fit on named columns but predict on a bare ndarray
# (sklearn "X does not have valid feature names" warnings), and the default
# RangeIndex would no longer line up with y_train / y_test.
# np.asarray keeps the cell re-runnable whether the inputs are still arrays
# or were already wrapped.
X_train_transformed = pd.DataFrame(
    np.asarray(X_train_transformed), columns=X_train.columns, index=X_train.index
)
X_test_transformed = pd.DataFrame(
    np.asarray(X_test_transformed), columns=X_test.columns, index=X_test.index
)

# For every feature, show its distribution before and after the Box-Cox transform.
for col in X_train_transformed.columns:
    plt.figure(figsize=(14,4))

    # Before transformation histogram
    plt.subplot(121)
    sns.histplot(X_train[col], kde=True)
    plt.title(f'Before Transformation - {col}')
    plt.xlabel("Value")
    plt.ylabel("Frequency")

    # After transformation histogram
    plt.subplot(122)
    sns.histplot(X_train_transformed[col], kde=True)
    plt.title(f'After Transformation - {col}')
    plt.xlabel("Value")
    plt.ylabel("Frequency")

    plt.show()


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Degree-2 polynomial expansion of the Box-Cox features. The expansion is
# fitted on the training matrix and then applied to both train and test.
poly = PolynomialFeatures(degree=2).fit(X_train_transformed)
X_polytrain = poly.transform(X_train_transformed)
X_polytest = poly.transform(X_test_transformed)

# Ordinary least squares on the expanded training features.
lrpoly = LinearRegression()
lrpoly.fit(X_polytrain, y_train)




In [None]:
# Training-set R^2 of the polynomial model.
# lrpoly was fitted on the degree-2 expansion X_polytrain, so it must be scored
# on that same matrix; scoring on X_train_transformed (the un-expanded
# features) fails with a feature-count mismatch.
polynomial_regression_score_train=lrpoly.score(X_polytrain,y_train)

In [None]:
# Report the polynomial model's score on the training data.
print("Polynomial Regression  Score (Training Data):", polynomial_regression_score_train)

In [None]:
# Step 4: Predictions
# Predict on the polynomial expansion of the test features (X_polytest), i.e.
# the same feature space lrpoly was fitted in.
polynomial_regression_pred = lrpoly.predict(X_polytest)

In [None]:
# Test-set mean squared error of the polynomial model.
polynomial_regression_mse = mean_squared_error(y_true=y_test, y_pred=polynomial_regression_pred)
print(
    "Mean Squared Error on Test Data (Predictions):",
    "-----------------------------",
    sep="\n",
)
print("Polynomial Regression MSE:", polynomial_regression_mse)

In [None]:
# Test-set R^2 of the polynomial model.
polynomial_regression_r2 = r2_score(y_true=y_test, y_pred=polynomial_regression_pred)
print(
    "R^2 Prediction on Test Data (Predictions):",
    "-----------------------------",
    sep="\n",
)
print("R^2 Polynomial Regression Score:", polynomial_regression_r2)

In [None]:
from sklearn.svm import SVR

# Grid-search an SVR over kernel, polynomial degree and gamma with 5-fold
# cross-validation on the Box-Cox-transformed training features.
svr=SVR()
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    # NOTE(review): `degree` is only used by the 'poly' kernel, so for the
    # other kernels the three degree values repeat the same fit.
    'degree': [1, 3, 5],
    'gamma': ['scale', 'auto'],
}

svr_grid=GridSearchCV(svr,param_grid,cv=5)
svr_grid.fit(X_train_transformed,y_train)

# These are SVR results; the label previously (and wrongly) said "Lasso".
print("Tuned SVR Parameters: {}".format(svr_grid.best_params_))
print("Best score is {}".format(svr_grid.best_score_))

In [None]:

# Tree ensembles with default hyper-parameters. Both are stochastic
# (bootstrap / feature sub-sampling), so the seed is fixed — matching the
# random_state used for train_test_split — to make scores reproducible
# across Restart & Run All.
random_forest_model = RandomForestRegressor(random_state=42)
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Grid-search a Lasso model with 10-fold cross-validation on the
# Box-Cox-transformed training features.
lasso_regression_model = Lasso()
# NOTE(review): this is an exhaustive grid — roughly 90 alpha values x 3 tol x
# 2 warm_start x 3 max_iter x 2 selection, each fitted on 10 folds — so the
# cell is slow. warm_start presumably has no effect inside a grid search, and
# selection='random' without a random_state may make results vary between
# runs — confirm and prune if so.
param_grid = {'alpha': np.arange(1, 10, 0.1),
              'tol': [0.0001, 0.001, 0.01],
              'warm_start': [True, False],
              'max_iter': [100,300,500],
              'selection': ['cyclic', 'random']}

# Use grid search to find the best combination of the parameters above
lasso_cv = GridSearchCV(lasso_regression_model, param_grid, cv=10)
# Fit the model
lasso_cv.fit(X_train_transformed, y_train)

# Print the tuned parameters and score
print("Tuned Lasso Regression Parameters: {}".format(lasso_cv.best_params_))
print("Best score is {}".format(lasso_cv.best_score_))

In [None]:
# Candidate settings for Ridge: regularisation strength and solver.
param_grid = {
    'alpha': np.arange(1, 10, 0.1),
    'solver': ['cholesky', 'svd', 'sparse_cg'],
}

# Exhaustive 10-fold grid search over the candidates above, fitted on the
# Box-Cox-transformed training features.
ridge_regression_model = Ridge()
ridge_cv = GridSearchCV(estimator=ridge_regression_model, param_grid=param_grid, cv=10)
ridge_cv.fit(X_train_transformed, y_train)

# Report the winning combination and its mean cross-validated score.
print("Tuned Ridge Regression Parameters: {}".format(ridge_cv.best_params_))
print("Best score is {}".format(ridge_cv.best_score_))


In [None]:

# Elastic Net with built-in 5-fold cross-validation; the candidate l1_ratio
# values are 0.5 and 1.0.
elasticnet_cv = ElasticNetCV(l1_ratio=(0.5, 1.0), eps=0.001, cv=5)

# Fit the model
elasticnet_cv.fit(X_train_transformed, y_train)

In [None]:

# Fit both tree ensembles on the Box-Cox-transformed training features.
random_forest_model.fit(X_train_transformed, y_train)
gradient_boosting_model.fit(X_train_transformed, y_train)

In [None]:
# Score of the grid-searched SVR on the training data it was fitted on.
svr_regression_score_train = svr_grid.score(X_train_transformed, y_train)

In [None]:
# Training-set scores of the remaining fitted models. These are measured on
# the data the models were fitted on, so they say nothing about generalisation.
elastic_regression_score_train=elasticnet_cv.score(X_train_transformed,y_train)
ridge_regression_score_train = ridge_cv.score(X_train_transformed, y_train)
lasso_regression_score_train = lasso_cv.score(X_train_transformed, y_train)
random_forest_score_train = random_forest_model.score(X_train_transformed, y_train)
gradient_boosting_score_train = gradient_boosting_model.score(X_train_transformed, y_train)

In [None]:
# Print the training-set score of every model, one per line.
# NOTE(review): the first two labels omit "R^2" while the rest include it —
# presumably all six values are the same metric; confirm and align the labels.
print("Elastic Regression  Score (Training Data):", elastic_regression_score_train)
print("SVR Regression  Score (Training Data):",svr_regression_score_train)
print("Ridge Regression R^2 Score (Training Data):", ridge_regression_score_train)
print("Lasso Regression R^2 Score (Training Data):", lasso_regression_score_train)
print("Random Forest R^2 Score (Training Data):", random_forest_score_train)
print("Gradient Boosting R^2 Score (Training Data):", gradient_boosting_score_train)

In [None]:
# Predict the held-out test rows with every fitted model; these arrays feed
# the metric and plotting cells below.
elastic_regression_predictions = elasticnet_cv.predict(X_test_transformed)
ridge_regression_predictions = ridge_cv.predict(X_test_transformed)
svr_regression_predictions = svr_grid.predict(X_test_transformed)
lasso_regression_predictions = lasso_cv.predict(X_test_transformed)
random_forest_predictions = random_forest_model.predict(X_test_transformed)
gradient_boosting_predictions = gradient_boosting_model.predict(X_test_transformed)

In [None]:
# Mean squared error of each model's predictions against the held-out targets
# (lower is better).
elastic_regression_mse=mean_squared_error(y_test,elastic_regression_predictions)
svr_regression_mse=mean_squared_error(y_test,svr_regression_predictions)
ridge_regression_mse = mean_squared_error(y_test, ridge_regression_predictions)
lasso_regression_mse = mean_squared_error(y_test, lasso_regression_predictions)
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_predictions)

# Side-by-side report of the six test-set MSE values.
print("Mean Squared Error on Test Data (Predictions):")
print("-----------------------------")
print("SVR Regression:",svr_regression_mse)
print("Elastic Regression:",elastic_regression_mse)
print("Ridge Regression:", ridge_regression_mse)
print("Lasso Regression:", lasso_regression_mse)
print("Random Forest:", random_forest_mse)
print("Gradient Boosting:", gradient_boosting_mse)

In [None]:


# Compute R^2 score for each model's predictions on the test data
svr_regression_r2=r2_score(y_test,svr_regression_predictions)
ridge_regression_r2 = r2_score(y_test, ridge_regression_predictions)
lasso_regression_r2 = r2_score(y_test, lasso_regression_predictions)
random_forest_r2 = r2_score(y_test, random_forest_predictions)
gradient_boosting_r2 = r2_score(y_test, gradient_boosting_predictions)
elastic_regression_r2 = r2_score(y_test, elastic_regression_predictions)

# Side-by-side report (higher is better). The labels match the ones used in
# the MSE report; the first two were previously misspelled "Regreesion".
print("R^2 Score on Test Data (Predictions):")
print("SVR Regression:",svr_regression_r2)
print("Elastic Regression:",elastic_regression_r2)
print("Ridge Regression:", ridge_regression_r2)
print("Lasso Regression:", lasso_regression_r2)
print("Random Forest:", random_forest_r2)
print("Gradient Boosting:", gradient_boosting_r2)


In [None]:
# Confirm that the test features and test targets have the same number of rows.
X_test.shape,y_test.shape

In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted strength for the Random Forest on the test set.
# Points on the dashed diagonal are perfect predictions.
plt.scatter(y_test,random_forest_predictions, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', color='red')  # Plot the diagonal line
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
# Name the model in the title so the figure stands alone among its siblings.
plt.title('Actual vs. Predicted Values (Random Forest)')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted strength for Gradient Boosting on the test set.
# Points on the dashed diagonal are perfect predictions.
plt.scatter(y_test, gradient_boosting_predictions, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', color='red')  # Plot the diagonal line
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
# Name the model in the title so the figure stands alone among its siblings.
plt.title('Actual vs. Predicted Values (Gradient Boosting)')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted strength for the tuned SVR on the test set.
# The points are blue, as in the other per-model scatter plots, so the red
# dashed diagonal (perfect predictions) stays distinguishable from the data.
plt.scatter(y_test, svr_regression_predictions, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', color='red')  # Plot the diagonal line
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
# Name the model in the title so the figure stands alone among its siblings.
plt.title('Actual vs. Predicted Values (SVR)')
plt.show()


In [None]:
import plotly.graph_objs as go

# Interactive actual-vs-predicted plot for the SVR model.
# Create scatter plot trace for actual vs. predicted values using Plotly
scatter_trace = go.Scatter(
    x=y_test,
    y=svr_regression_predictions,
    mode='markers',
    marker=dict(color='red'),  # Adjust color as needed
    name='SVR Regression Predictions'
)

# Create diagonal line trace
# The line runs from (min, min) to (max, max) of the actual values, i.e. where
# predicted == actual.
# NOTE(review): the line is red like the markers above — consider a contrasting colour.
diagonal_trace = go.Scatter(
    x=[y_test.min(), y_test.max()],
    y=[y_test.min(), y_test.max()],
    mode='lines',
    line=dict(color='red', dash='dash'),  # Adjust color and dash style as needed
    name='Diagonal Line'
)

# Define layout
layout = go.Layout(
    title='Actual vs SVR Regression Predictions',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable')
)

# Create the figure
fig = go.Figure(data=[scatter_trace, diagonal_trace], layout=layout)

# Show the plot
fig.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Stack the actual targets (column 0) next to each model's test predictions
# (columns 1..5). Despite its name, this array holds targets and predictions,
# not transformed features; the name is kept because a later cell reads it.
X_test_transformed_3d = np.column_stack((y_test, ridge_regression_predictions,
                                         lasso_regression_predictions, random_forest_predictions,
                                         gradient_boosting_predictions, elastic_regression_predictions))

# Legend labels, in the same order as prediction columns 1..5 above.
model_labels = ['Ridge Regression', 'Lasso Regression', 'Random Forest',
                'Gradient Boosting', 'Elastic Regression']

# Initialize the figure and axes
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# One point cloud per model. Each model sits on its own z level (zs=i) so the
# "Model" axis actually separates them; without an explicit z value every
# point is drawn on the z=0 plane and the clouds overlap.
for i in range(1, X_test_transformed_3d.shape[1]):
    ax.scatter(X_test_transformed_3d[:, 0], X_test_transformed_3d[:, i], zs=i,
               color='C'+str(i-1), label=model_labels[i-1])

# Adding labels and title
ax.set_xlabel('Target Variable')
ax.set_ylabel('Predicted Value')
ax.set_zlabel('Model')
ax.set_title('Comparison of Predicted Values by Different Regression Models')

# Adding legend
ax.legend()

# Showing the plot
plt.show()


In [None]:
import plotly.graph_objs as go

# Interactive 3D version of the model comparison. It reads
# X_test_transformed_3d, built in an earlier cell: column 0 is the target and
# every further column holds one model's predictions.

# Define colors for each model
# NOTE(review): six colours are listed, but the loop only uses as many as
# there are prediction columns.
colors = ['rgb(255,0,0)', 'rgb(0,255,0)', 'rgb(0,0,255)', 'rgb(255,255,0)', 'rgb(255,0,255)', 'rgb(0,255,255)']

# Define traces for each regression model
traces = []
for i in range(1, X_test_transformed_3d.shape[1]):
    trace = go.Scatter3d(
        x=X_test_transformed_3d[:, 0],  # Target variable
        y=X_test_transformed_3d[:, i],  # Predicted value for the current model
        z=[i] * len(y_test),             # Model index (for z-axis)
        mode='markers',
        marker=dict(
            color=colors[i-1],            # Assigning color based on the index of the model
            size=5,
            opacity=0.8
        ),
        # NOTE(review): the legend shows "Model <i>" rather than the model's name.
        name='Model ' + str(i)
    )
    traces.append(trace)

# Create the layout
layout = go.Layout(
    scene=dict(
        xaxis=dict(title='Target Variable'),
        yaxis=dict(title='Predicted Value'),
        zaxis=dict(title='Model'),
    ),
    title='Comparison of Predicted Values by Different Regression Models'
)

# Combine the traces and layout into a figure
fig = go.Figure(data=traces, layout=layout)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go
# Define predictions for each regression model
# Maps a display name to that model's test-set predictions; the dict is also
# read by the marker-symbol variant of this plot in the following cell.
model_predictions = {
    "Elastic Regression": elastic_regression_predictions,
    "Ridge Regression": ridge_regression_predictions,
    "SVR Regression": svr_regression_predictions,
    "Lasso Regression": lasso_regression_predictions,
    "Random Forest": random_forest_predictions,
    "Gradient Boosting": gradient_boosting_predictions
}

# Initialize a list to hold scatter plot traces
scatter_traces = []

# Add trace for actual vs. actual as reference
# (drawn as black markers lying on the y = x diagonal, not as a line)
scatter_traces.append(go.Scatter(x=y_test, y=y_test, mode='markers', name='Actual', marker=dict(color='black')))

# Add traces for each model's predictions
for model_name, predictions in model_predictions.items():
    scatter_traces.append(go.Scatter(x=y_test, y=predictions, mode='markers', name=model_name, opacity=0.7))

# Define layout
layout = go.Layout(
    title='Actual vs Predicted Values for Multiple Regression Models',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable'),
    legend=dict(orientation='h', x=0, y=-0.2)
)

# Create the figure
fig = go.Figure(data=scatter_traces, layout=layout)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go

# Same comparison as the previous cell, but each model gets its own marker
# symbol. Relies on `model_predictions` defined in the previous cell.

# Define marker symbols for each model
# One symbol per entry of model_predictions, matched by position.
marker_symbols = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up']

# Initialize a list to hold scatter plot traces
scatter_traces = []

# Add trace for actual vs. actual as reference
# (drawn as black markers lying on the y = x diagonal, not as a line)
scatter_traces.append(go.Scatter(x=y_test, y=y_test, mode='markers', name='Actual', marker=dict(color='black')))

# Add traces for each model's predictions
for i, (model_name, predictions) in enumerate(model_predictions.items()):
    scatter_traces.append(go.Scatter(
        x=y_test, 
        y=predictions, 
        mode='markers', 
        name=model_name, 
        marker=dict(symbol=marker_symbols[i], opacity=0.7)
    ))

# Define layout
layout = go.Layout(
    title='Actual vs Predicted Values for Multiple Regression Models',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable'),
    legend=dict(orientation='h', x=0, y=-0.2)
)

# Create the figure
fig = go.Figure(data=scatter_traces, layout=layout)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go

# Interactive actual-vs-predicted plot for the Gradient Boosting model.
# Create scatter plot trace for actual vs. predicted values using Plotly
scatter_trace = go.Scatter(
    x=y_test,
    y=gradient_boosting_predictions,
    mode='markers',
    marker=dict(color='blue'),  # Adjust color as needed
    name='Gradient Boosting Predictions'
)

# Create scatter plot trace for actual values
# Plotting y_test against itself puts the black markers on the y = x diagonal,
# which serves as the perfect-prediction reference.
actual_trace = go.Scatter(
    x=y_test,
    y=y_test,
    mode='markers',
    marker=dict(color='black'),  # Adjust color as needed
    name='Actual'
)

# Define layout
layout = go.Layout(
    title='Actual vs Gradient Boosting Predictions',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable')
)

# Create the figure
fig = go.Figure(data=[scatter_trace, actual_trace], layout=layout)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go

# Interactive actual-vs-predicted plot for the Ridge model.
# The trace name and figure title say "Ridge Regression" because the plotted
# values are ridge_regression_predictions (they were labelled "Gradient
# Boosting" by a copy-paste slip).
scatter_trace = go.Scatter(
    x=y_test,
    y=ridge_regression_predictions,
    mode='markers',
    marker=dict(color='blue'),  # Adjust color as needed
    name='Ridge Regression Predictions'
)

# Create scatter plot trace for actual values
# Plotting y_test against itself puts the black markers on the y = x diagonal,
# which serves as the perfect-prediction reference.
actual_trace = go.Scatter(
    x=y_test,
    y=y_test,
    mode='markers',
    marker=dict(color='black'),  # Adjust color as needed
    name='Actual'
)

# Define layout
layout = go.Layout(
    title='Actual vs Ridge Regression Predictions',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable')
)

# Create the figure
fig = go.Figure(data=[scatter_trace, actual_trace], layout=layout)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go

# Interactive actual-vs-predicted plot for the Random Forest model.
# Create scatter plot trace for actual vs. predicted values using Plotly
scatter_trace = go.Scatter(
    x=y_test,
    y=random_forest_predictions,
    mode='markers',
    marker=dict(color='blue'),  # Adjust color as needed
    name='Random Forest Predictions'
)

# Create scatter plot trace for actual values
# Plotting y_test against itself puts the black markers on the y = x diagonal,
# which serves as the perfect-prediction reference.
actual_trace = go.Scatter(
    x=y_test,
    y=y_test,
    mode='markers',
    marker=dict(color='black'),  # Adjust color as needed
    name='Actual'
)

# Define layout
# The title names the model actually plotted (it used to read "Random Boosting").
layout = go.Layout(
    title='Actual vs Random Forest Predictions',
    xaxis=dict(title='Actual Target Variable'),
    yaxis=dict(title='Predicted Target Variable')
)

# Create the figure
fig = go.Figure(data=[scatter_trace, actual_trace], layout=layout)

# Show the plot
fig.show()


The Gradient Boosting algorithm performed well.