In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# load the dataset
data = pd.read_csv('AmesHousing.csv')
data.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
data.shape

(2930, 82)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [5]:
data.duplicated().sum()

0

In [6]:
missing = data.isnull().sum()
missing

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 82, dtype: int64

In [7]:
# drop columns with more than 50% of the missing values
threshold = len(data) * 0.5
data = data.loc[:, data.isnull().sum() < threshold]
data.shape

(2930, 77)

In [9]:
X = data.drop('SalePrice',axis=1)

In [10]:
# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [11]:
# Define the ColumnTransformer with SelectKBest
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('selectkbest', SelectKBest(score_func=f_regression, k=10))  # Select top 10 features
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)
preprocessor

In [12]:
#separate features and labels
X = data.drop('SalePrice',axis=1)
y = data['SalePrice']

In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

In [15]:
# Define the parameter grid
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_features': ['sqrt', 'log2'],  # Corrected to use valid options
    'model__max_depth': [None, 10, 20],
    'preprocessor__num__selectkbest__k': [5, 10]  # Adjust the number of features to select
}

# Setup GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

In [16]:
# Fit the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [17]:
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
# Best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

# Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Calculate and print the RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error on test set: {rmse:.2f}")

Best parameters: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__n_estimators': 100, 'preprocessor__num__selectkbest__k': 10}
Best cross-validation score: 0.87
Root Mean Squared Error on test set: 29377.55


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
# Define models and their respective parameter grids
models = {
    'RandomForest': RandomForestRegressor(),
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'GradientBoosting': GradientBoostingRegressor()
}

param_grids = {
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_features': ['sqrt', 'log2'],
        'model__max_depth': [None, 10, 20],
        'preprocessor__num__selectkbest__k': [5, 10]
    },
    'LinearRegression': {
        'model__fit_intercept': [True, False],
        'preprocessor__num__selectkbest__k': [5, 10]
    },
    'SVR': {
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf'],
        'preprocessor__num__selectkbest__k': [5, 10]
    },
    'GradientBoosting': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 5, 7],
        'preprocessor__num__selectkbest__k': [5, 10]
    }
}


In [20]:
results = {}

for model_name, model in models.items():
    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Setup GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, n_jobs=-1, verbose=1)
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    # Store results
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'rmse': mean_squared_error(y_test, grid_search.predict(X_test), squared=False)
    }

# Display results
for model_name, metrics in results.items():
    print(f"{model_name} - Best Parameters: {metrics['best_params']}, "
          f"Best Cross-Validation Score: {metrics['best_score']:.2f}, "
          f"RMSE on Test Set: {metrics['rmse']:.2f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
RandomForest - Best Parameters: {'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__n_estimators': 100, 'preprocessor__num__selectkbest__k': 10}, Best Cross-Validation Score: 0.87, RMSE on Test Set: 29935.99
LinearRegression - Best Parameters: {'model__fit_intercept': False, 'preprocessor__num__selectkbest__k': 10}, Best Cross-Validation Score: 0.86, RMSE on Test Set: 30123.95
SVR - Best Parameters: {'model__C': 10, 'model__kernel': 'linear', 'preprocessor__num__selectkbest__k': 10}, Best Cross-Validation Score: 0.65, RMSE on Test Set: 52427.11
GradientBoosting - Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200, 'preprocessor__num__selectkbest__k': 10}, Best Cross-Validation Score: 0.90

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import mean_squared_error

# Load and preprocess 
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('selectkbest', SelectKBest(k=10))
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer,categorical_cols )
    ]
)

# Define the voting regressor
voting_regressor = VotingRegressor(estimators=[
    ('rf', RandomForestRegressor(n_estimators=100)),
    ('gb', GradientBoostingRegressor(n_estimators=100)),
    ('lr', LinearRegression())
])

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', voting_regressor)
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
predictions = pipeline.predict(X_test)

# Evaluate performance
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"Voting Regressor RMSE: {rmse:.2f}")


Voting Regressor RMSE: 26947.54


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the Voting Regressor
param_grid = {
    'model__rf__n_estimators': [100, 200],
    'model__gb__n_estimators': [100, 200],
    'model__lr__fit_intercept': [True, False],
    'preprocessor__num__selectkbest__k': [5, 10, 15]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation RMSE: {best_score ** 0.5:.2f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
import matplotlib.pyplot as plt

# Use the best model to predict on the test set
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(X_test)

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(y_test, test_predictions, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs. Predicted Prices')
plt.show()
