In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler

In [13]:
# Location of the raw concrete dataset, relative to this notebook.
DATA_PATH = '../dataset/concrete_data.csv'

# Load the CSV into the frame that every modelling cell below starts from.
df = pd.read_csv(DATA_PATH)

In [14]:
# Single seed reused as `random_state` by every train/validation split and
# random forest in this notebook, so the cells are repeatable run to run.
seed = 1234
# Also seed NumPy's legacy global random number generator.
np.random.seed(seed)

## Dataset without feature reduction

### With outliers

In [16]:
# Baseline: random forest on every feature, no outlier removal.
#
# NOTE(review): the scaler is fit on the full frame (target column included)
# before the train/validation split, so validation rows influence the scaling
# and the MSE printed below is in scaled (-1, 1) target units rather than the
# original units of concrete_compressive_strength.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

X = df_rescaled.drop(columns=["concrete_compressive_strength"]).values
y = df_rescaled["concrete_compressive_strength"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=seed)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_valid)

# Mean squared error (not its root). The former `squared=True` keyword was
# deprecated in scikit-learn 1.4 and dropped in later releases; the default
# call returns the same value on every version.
mse = mean_squared_error(y_valid, y_pred)
print(mse)

# Adjusted R^2 penalises plain R^2 for the number of predictors:
# 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n validation rows and p features.
r2 = r2_score(y_valid, y_pred)
n_samples, n_features = X_valid.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.015192617326421085
0.9014066403505829


### Without outliers

In [17]:
# Random forest on every feature after IQR-based outlier removal.
#
# NOTE(review): the scaler is fit and the outlier rows are dropped before the
# train/validation split, so the validation set here differs from the one used
# in the "With outliers" cell and the MSE is in scaled (-1, 1) target units.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

# (column, IQR multiplier) pairs. The filters are applied in this order and
# each one computes its quartiles on the rows that survived the previous ones.
# NOTE(review): superplasticizer uses a tighter 0.75 multiplier than the usual
# 1.5 - confirm this is intentional.
iqr_rules = [
    ("age", 1.5),
    ("blast_furnace_slag", 1.5),
    ("water", 1.5),
    ("superplasticizer", 0.75),
]

clean_data = df_rescaled
for column, factor in iqr_rules:
    # Quartiles and interquartile range of the rows still in play.
    q1, q3 = np.percentile(clean_data[column], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)
    # Keep only rows inside [lower_bound, upper_bound] (both ends inclusive).
    clean_data = clean_data[clean_data[column].between(lower_bound, upper_bound)]

X = clean_data.drop(columns=["concrete_compressive_strength"]).values
y = clean_data["concrete_compressive_strength"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=seed)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_valid)

# Mean squared error (not its root). The former `squared=True` keyword was
# deprecated in scikit-learn 1.4 and dropped in later releases; the default
# call returns the same value on every version.
mse = mean_squared_error(y_valid, y_pred)
print(mse)

# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n validation rows
# and p features.
r2 = r2_score(y_valid, y_pred)
n_samples, n_features = X_valid.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.015715934877428547
0.8948506492706396


## Dataset with feature reduction

### With outliers

In [19]:
# Random forest with the fly_ash feature removed, no outlier removal.
#
# NOTE(review): the scaler is fit on the full frame (target column included)
# before the train/validation split, so validation rows influence the scaling
# and the MSE printed below is in scaled (-1, 1) target units.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

# Feature reduction: drop fly_ash before building the design matrix.
df_rescaled = df_rescaled.drop(columns=["fly_ash"])

X = df_rescaled.drop(columns=["concrete_compressive_strength"]).values
y = df_rescaled["concrete_compressive_strength"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=seed)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_valid)

# Mean squared error (not its root). The former `squared=True` keyword was
# deprecated in scikit-learn 1.4 and dropped in later releases; the default
# call returns the same value on every version.
mse = mean_squared_error(y_valid, y_pred)
print(mse)

# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n validation rows
# and p features.
r2 = r2_score(y_valid, y_pred)
n_samples, n_features = X_valid.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.015166064297178225
0.9019726419225333


### Without outliers

In [20]:
# Random forest with the fly_ash feature removed, after IQR-based outlier
# removal.
#
# NOTE(review): the scaler is fit and the outlier rows are dropped before the
# train/validation split, so the validation set here differs from the one used
# in the "With outliers" cell and the MSE is in scaled (-1, 1) target units.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

# Feature reduction: drop fly_ash before filtering and modelling.
df_rescaled = df_rescaled.drop(columns=["fly_ash"])

# (column, IQR multiplier) pairs. The filters are applied in this order and
# each one computes its quartiles on the rows that survived the previous ones.
# NOTE(review): superplasticizer uses a tighter 0.75 multiplier than the usual
# 1.5 - confirm this is intentional.
iqr_rules = [
    ("age", 1.5),
    ("blast_furnace_slag", 1.5),
    ("water", 1.5),
    ("superplasticizer", 0.75),
]

clean_data = df_rescaled
for column, factor in iqr_rules:
    # Quartiles and interquartile range of the rows still in play.
    q1, q3 = np.percentile(clean_data[column], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)
    # Keep only rows inside [lower_bound, upper_bound] (both ends inclusive).
    clean_data = clean_data[clean_data[column].between(lower_bound, upper_bound)]

X = clean_data.drop(columns=["concrete_compressive_strength"]).values
y = clean_data["concrete_compressive_strength"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=seed)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_valid)

# Mean squared error (not its root). The former `squared=True` keyword was
# deprecated in scikit-learn 1.4 and dropped in later releases; the default
# call returns the same value on every version.
mse = mean_squared_error(y_valid, y_pred)
print(mse)

# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n validation rows
# and p features.
r2 = r2_score(y_valid, y_pred)
n_samples, n_features = X_valid.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.016736906602992447
0.8885174054148526
