In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler

In [25]:
# Location of the raw concrete dataset, relative to this notebook.
DATASET_PATH = '../dataset/concrete_data.csv'

# Raw, unscaled frame; every experiment cell below starts from it.
df = pd.read_csv(DATASET_PATH)

In [26]:
# Single seed reused for NumPy's global RNG, every train/test split and every
# random forest in this notebook, so that a re-run reproduces the same numbers.
seed = 1234
np.random.seed(seed)

## Dataset without feature reduction

### With outliers

In [27]:
# Baseline experiment: all features, no outlier removal.

# Rescale every column (features AND target) into [-1, 1].
# NOTE(review): the scaler is fitted on the full frame before the split, so the
# min/max of held-out rows leak into the scaling. Left unchanged so the recorded
# metrics stay comparable; consider fitting on the training split only.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

X = df_rescaled.drop(columns=["concrete_compressive_strength"]).values
y = df_rescaled["concrete_compressive_strength"].values

# 70 % train / 30 % hold-out.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, shuffle=True, random_state=seed, test_size=0.3
)
# train_test_split returns (first part, second part): the first 2/3 of the
# hold-out becomes the test set and the remaining 1/3 the validation set.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_valid, y_valid, shuffle=True, random_state=seed, test_size=1/3
)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_test)

# Mean squared error on the *scaled* target. The `squared=` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6; MSE is the default, so
# omitting it yields the same value on every version.
mse = mean_squared_error(y_test, y_pred)
rms = mse  # legacy name kept for compatibility; this is the MSE, not its root
print(mse)

# Adjusted R^2 penalises R^2 for the number of predictors.
r2 = r2_score(y_test, y_pred)
n_samples, n_features = X_test.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.014592172783657214
0.898240619251174


## Dataset with feature reduction

### With outliers

In [29]:
# Feature-reduction experiment: same pipeline as the baseline, but with the
# "fly_ash" column removed. Outliers are kept.

# Rescale every column (features AND target) into [-1, 1].
# NOTE(review): the scaler is fitted on the full frame before the split, so the
# min/max of held-out rows leak into the scaling. Left unchanged so the recorded
# metrics stay comparable; consider fitting on the training split only.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

# Feature reduction: discard "fly_ash" before building the design matrix.
df_rescaled = df_rescaled.drop(columns=["fly_ash"])

X = df_rescaled.drop(columns=["concrete_compressive_strength"]).values
y = df_rescaled["concrete_compressive_strength"].values

# 70 % train / 30 % hold-out.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, shuffle=True, random_state=seed, test_size=0.3
)
# train_test_split returns (first part, second part): the first 2/3 of the
# hold-out becomes the test set and the remaining 1/3 the validation set.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_valid, y_valid, shuffle=True, random_state=seed, test_size=1/3
)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_test)

# Mean squared error on the *scaled* target. The `squared=` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6; MSE is the default, so
# omitting it yields the same value on every version.
mse = mean_squared_error(y_test, y_pred)
rms = mse  # legacy name kept for compatibility; this is the MSE, not its root
print(mse)

# Adjusted R^2 penalises R^2 for the number of predictors.
r2 = r2_score(y_test, y_pred)
n_samples, n_features = X_test.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.014217871731188965
0.9013515831836628


### Without outliers

In [30]:
# Feature-reduction experiment with IQR-based outlier removal: "fly_ash" is
# dropped, then rows outside the per-column IQR fences are filtered out before
# the data is split and the forest is trained.

# Rescale every column (features AND target) into [-1, 1].
# NOTE(review): the scaler is fitted, and the outliers are removed, on the full
# frame before the split, so held-out rows influence both steps. Left unchanged
# so the recorded metrics stay comparable.
scaler = MinMaxScaler(feature_range=(-1, 1))
rescaledX = scaler.fit_transform(df)
df_rescaled = pd.DataFrame(rescaledX, columns=df.columns)

# Feature reduction: discard "fly_ash".
df_rescaled = df_rescaled.drop(columns=["fly_ash"])

# (column, whisker factor) pairs, applied in this order. Order matters: each
# column's quartiles are computed on the rows that survived the previous
# filters.
# NOTE(review): "superplasticizer" uses a tighter 0.75 * IQR fence than the
# conventional 1.5 * IQR used for the other columns - confirm this is intended.
outlier_rules = [
    ("age", 1.5),
    ("blast_furnace_slag", 1.5),
    ("water", 1.5),
    ("superplasticizer", 0.75),
]

clean_data = df_rescaled
for column, whisker in outlier_rules:
    # Quartiles and interquartile range of the current column.
    q1, q3 = np.percentile(clean_data[column], [25, 75])
    iqr = q3 - q1
    # Fences beyond which a row is considered an outlier.
    lower_bound = q1 - (whisker * iqr)
    upper_bound = q3 + (whisker * iqr)
    # Keep rows inside the fences (both ends inclusive).
    clean_data = clean_data[clean_data[column].between(lower_bound, upper_bound)]

X = clean_data.drop(columns=["concrete_compressive_strength"]).values
y = clean_data["concrete_compressive_strength"].values

# 70 % train / 30 % hold-out.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, shuffle=True, random_state=seed, test_size=0.3
)
# train_test_split returns (first part, second part): the first 2/3 of the
# hold-out becomes the test set and the remaining 1/3 the validation set.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_valid, y_valid, shuffle=True, random_state=seed, test_size=1/3
)

reg = RandomForestRegressor(n_estimators=100, random_state=seed)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_test)

# Mean squared error on the *scaled* target. The `squared=` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6; MSE is the default, so
# omitting it yields the same value on every version.
mse = mean_squared_error(y_test, y_pred)
rms = mse  # legacy name kept for compatibility; this is the MSE, not its root
print(mse)

# Adjusted R^2 penalises R^2 for the number of predictors.
r2 = r2_score(y_test, y_pred)
n_samples, n_features = X_test.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print(adj_r2)

0.014926592236208975
0.8985489219626223
