In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found. Please make sure the file is in the current directory.")
    exit()

# Select features and target
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
X = df[features]
y = df[target]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print("Data preparation complete. Training and validation sets are ready.")
print(f"Training data shape: {X_train_scaled.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val_scaled.shape}, {y_val.shape}")

Data preparation complete. Training and validation sets are ready.
Training data shape: (1168, 2), (1168,)
Validation data shape: (292, 2), (292,)


##### 3. Blending

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found. Please make sure the file is in the current directory.")
    exit()

# Select features and target
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
X = df[features]
y = df[target]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print("Data preparation complete. Training and validation sets are ready.")
print(f"Training data shape: {X_train_scaled.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val_scaled.shape}, {y_val.shape}")

# Initialize individual models
linear_reg = LinearRegression()
svr = SVR()
tree_reg = DecisionTreeRegressor(random_state=42)

# Train individual models
linear_reg.fit(X_train_scaled, y_train)
svr.fit(X_train_scaled, y_train)
tree_reg.fit(X_train_scaled, y_train)

# Make predictions on the validation set
linear_pred = linear_reg.predict(X_val_scaled)
svr_pred = svr.predict(X_val_scaled)
tree_pred = tree_reg.predict(X_val_scaled)

# Evaluate individual models
linear_rmse = mean_squared_error(y_val, linear_pred, squared=False)
svr_rmse = mean_squared_error(y_val, svr_pred, squared=False)
tree_rmse = mean_squared_error(y_val, tree_pred, squared=False)

print(f"\nIndividual Model Performance:")
print(f"Linear Regression RMSE: {linear_rmse:.2f}")
print(f"Support Vector Regression RMSE: {svr_rmse:.2f}")
print(f"Decision Tree Regression RMSE: {tree_rmse:.2f}")

# Blending with equal weights
blended_equal_predictions = (linear_pred + svr_pred + tree_pred) / 3.0
blended_equal_rmse = mean_squared_error(y_val, blended_equal_predictions, squared=False)
print(f"\nBlending:")
print(f"Blended (Equal Weights) RMSE: {blended_equal_rmse:.2f}")

# Blending with custom weights
weighted_predictions = (0.4 * linear_pred + 0.4 * svr_pred + 0.2 * tree_pred)
weighted_rmse = mean_squared_error(y_val, weighted_predictions, squared=False)
print(f"Blended (Weighted) RMSE: {weighted_rmse:.2f}")

# Bagging
bagging_reg = BaggingRegressor(DecisionTreeRegressor(random_state=42),
                                n_estimators=100,
                                random_state=42)
bagging_reg.fit(X_train_scaled, y_train)
bagging_predictions = bagging_reg.predict(X_val_scaled)
bagging_rmse = mean_squared_error(y_val, bagging_predictions, squared=False)
print(f"\nBagging:")
print(f"Bagging Model RMSE: {bagging_rmse:.2f}")

# Stacking
estimators = [
    ('lr', LinearRegression()),
    ('svr', SVR()),
    ('dt', DecisionTreeRegressor(random_state=42))
]
stacking_reg = StackingRegressor(estimators=estimators,
                                final_estimator=LinearRegression())
stacking_reg.fit(X_train_scaled, y_train)
stacking_predictions = stacking_reg.predict(X_val_scaled)
stacking_rmse = mean_squared_error(y_val, stacking_predictions, squared=False)
print(f"\nStacking:")
print(f"Stacking Model RMSE: {stacking_rmse:.2f}")

Data preparation complete. Training and validation sets are ready.
Training data shape: (1168, 2), (1168,)
Validation data shape: (292, 2), (292,)





Individual Model Performance:
Linear Regression RMSE: 49955.53
Support Vector Regression RMSE: 88555.22
Decision Tree Regression RMSE: 46609.91

Blending:
Blended (Equal Weights) RMSE: 51961.28
Blended (Weighted) RMSE: 56303.45





Bagging:
Bagging Model RMSE: 39467.54

Stacking:
Stacking Model RMSE: 44661.20




##### 4. Bagging

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found. Please make sure the file is in the current directory.")
    exit()

# Select features and target
features = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
X = df[features]
y = df[target]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features and convert to NumPy arrays
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train).astype(np.float32)
X_val_scaled = scaler.transform(X_val).astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_val = y_val.values.astype(np.float32)


print("Data preparation complete. Training and validation sets are ready.")
print(f"Training data shape: {X_train_scaled.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val_scaled.shape}, {y_val.shape}")

def create_bootstrap_sample(X, y):
    """Creates a bootstrap sample from the training data."""
    n_samples = X.shape[0]
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    return X[indices], y[indices]

def bagging_scratch(X_train, y_train, X_val, n_estimators=10, random_state=None):
    """Implements bagging from scratch using DecisionTreeRegressor as the base model."""
    if random_state is not None:
        np.random.seed(random_state)

    estimators = []
    for _ in range(n_estimators):
        X_bootstrap, y_bootstrap = create_bootstrap_sample(X_train, y_train)
        tree = DecisionTreeRegressor(random_state=random_state)
        tree.fit(X_bootstrap, y_bootstrap)
        estimators.append(tree)

    predictions = np.zeros(X_val.shape[0])
    for tree in estimators:
        predictions += tree.predict(X_val)
    final_predictions = predictions / n_estimators
    return final_predictions

# Implement bagging from scratch
n_estimators = 100  # Number of bootstrap samples and trees
bagging_scratch_predictions = bagging_scratch(X_train_scaled, y_train, X_val_scaled, n_estimators=n_estimators, random_state=42)
bagging_scratch_rmse = mean_squared_error(y_val, bagging_scratch_predictions, squared=False)
print(f"\nBagging (Scratch) Model RMSE: {bagging_scratch_rmse:.2f}")

Data preparation complete. Training and validation sets are ready.
Training data shape: (1168, 2), (1168,)
Validation data shape: (292, 2), (292,)

Bagging (Scratch) Model RMSE: 40669.90




##### 5. Stacking

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def stacking_scratch(X_train, y_train, X_val, random_state=None):
    """Implements stacking from scratch."""
    if random_state is not None:
        np.random.seed(random_state)

    # Level-0 models
    level0_models = [
        ('lr', LinearRegression()),
        ('svr', SVR()),
        ('dt', DecisionTreeRegressor(random_state=random_state))
    ]

    # Train level-0 models
    level0_predictions_train = np.zeros((X_train.shape[0], len(level0_models)))
    level0_predictions_val = np.zeros((X_val.shape[0], len(level0_models)))

    for i, (name, model) in enumerate(level0_models):
        model.fit(X_train, y_train)
        level0_predictions_train[:, i] = model.predict(X_train)
        level0_predictions_val[:, i] = model.predict(X_val)

    # Level-1 model (Meta-learner)
    level1_model = LinearRegression()
    level1_model.fit(level0_predictions_train, y_train)
    final_predictions = level1_model.predict(level0_predictions_val)

    return final_predictions

# Implement stacking from scratch
stacking_scratch_predictions = stacking_scratch(X_train_scaled, y_train, X_val_scaled, random_state=42)
stacking_scratch_rmse = mean_squared_error(y_val, stacking_scratch_predictions, squared=False)
print(f"\nStacking (Scratch) Model RMSE: {stacking_scratch_rmse:.2f}")


Stacking (Scratch) Model RMSE: 46609.91


