In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for high missing categorical features
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='constant', fill_value=None):
        self.strategy = strategy
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.strategy == 'constant':
            return X.fillna(self.fill_value)
        return X

# Load dataset
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/ames_housing.csv"
df = pd.read_csv(url)

# Split the dataset into features and target
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Identify columns based on missing value percentage
high_missing_cols = ['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Mas Vnr Type', 'Fireplace Qu']
moderate_missing_cols = ['Lot Frontage', 'Garage Cond', 'Garage Qual', 'Garage Finish', 'Garage Yr Blt', 'Garage Type', 'Bsmt Exposure', 'BsmtFin Type 2', 'Bsmt Cond', 'Bsmt Qual', 'BsmtFin Type 1', 'Mas Vnr Area']
low_missing_cols = ['Bsmt Half Bath', 'Bsmt Full Bath', 'BsmtFin SF 1', 'Garage Cars', 'Garage Area', 'Total Bsmt SF', 'Bsmt Unf SF', 'BsmtFin SF 2', 'Electrical']

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for high missing categorical data
high_missing_transformer = Pipeline(steps=[
    ('imputer', CustomImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for moderate missing categorical data
moderate_missing_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for categorical data with low missing values
low_missing_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('high_cat', high_missing_transformer, high_missing_cols),
        ('moderate_cat', moderate_missing_transformer, moderate_missing_cols),
        ('low_cat', low_missing_transformer, low_missing_cols)
    ], remainder='passthrough')

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures()),
    ('pca', PCA(n_components=15)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())  # Placeholder; will be set by GridSearchCV
])

# Define parameter grid for GridSearchCV
param_grid = [
    {
        'poly__degree': [1, 2],
        'model': [LinearRegression()],
    },
    {
        'poly__degree': [1, 2],
        'model': [Ridge()],
        'model__alpha': [0.1, 1.0, 10.0]
    },
    {
        'poly__degree': [1, 2],
        'model': [Lasso()],
        'model__alpha': [0.1, 1.0, 10.0]
    }
]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (MSE): ", -grid_search.best_score_)

# Transform the data using the best model
X_train_transformed = grid_search.best_estimator_.named_steps['preprocessor'].transform(X_train)
X_test_transformed = grid_search.best_estimator_.named_steps['preprocessor'].transform(X_test)

print("Training data shape after preprocessing:", X_train_transformed.shape)
print("Testing data shape after preprocessing:", X_test_transformed.shape)

# Evaluate on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set score: ", test_score)


HTTPError: HTTP Error 404: Not Found

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

# Define the customImputer class
class customImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer_dict_ = {}

    def fit(self, X, y=None):
        for column in X:
            if X[column].dtype == np.dtype('O'):  # Object data type
                self.imputer_dict_[column] = X[column].mode()[0]
            else:
                self.imputer_dict_[column] = X[column].median()
        return self

    def transform(self, X):
        X_copy = X.copy()
        for column in X_copy:
            X_copy[column].fillna(self.imputer_dict_[column], inplace=True)
        return X_copy

# Load data
data = pd.read_csv('AmesHousing.csv')

# Assuming 'SalePrice' is the target variable and all others are features
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', customImputer()),
            ('poly', PolynomialFeatures())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Scaling the features after transformation
    ('ridge', Ridge())
])

# Define parameter grid for GridSearchCV
param_grid = {
    'preprocessor__num__poly__degree': [2, 3],
    'ridge__alpha': [0.1, 1.0, 10.0]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', error_score='raise')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

# Evaluate on test data
test_score = grid_search.score(X_test, y_test)
print("Test score: ", test_score)


Best parameters found:  {'preprocessor__num__poly__degree': 2, 'ridge__alpha': 10.0}
Best score found:  -1677823325.0158963
Test score:  -2937806916.960132


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer

# Define the customImputer class
class customImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer_dict_ = {}

    def fit(self, X, y=None):
        for column in X:
            if X[column].dtype == np.dtype('O'):  # Object data type
                self.imputer_dict_[column] = X[column].mode()[0]
            else:
                self.imputer_dict_[column] = X[column].median()
        return self

    def transform(self, X):
        X_copy = X.copy()
        for column in X_copy:
            X_copy[column].fillna(self.imputer_dict_[column], inplace=True)
        return X_copy

# Load data
data = pd.read_csv('AmesHousing.csv')

# Assuming 'SalePrice' is the target variable and all others are features
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', customImputer()),
            ('poly', PolynomialFeatures())
        ]), numeric_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Scaling the features after transformation
    ('ridge', Ridge())
])

# Define parameter grid for GridSearchCV
param_grid = {
    'preprocessor__num__poly__degree': [2, 3],
    'ridge__alpha': [0.1, 1.0, 10.0]
}

# Create GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, 
    scoring='neg_mean_squared_error', 
    error_score='raise', 
    verbose=3
)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

# Evaluate on test data
y_pred = grid_search.predict(X_test)
test_score = mean_squared_error(y_test, y_pred)
print("Test mean squared error: ", test_score)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END preprocessor__num__poly__degree=2, ridge__alpha=0.1;, score=-2958909442.935 total time=   0.7s
[CV 2/5] END preprocessor__num__poly__degree=2, ridge__alpha=0.1;, score=-15844879143.989 total time=   0.5s
[CV 3/5] END preprocessor__num__poly__degree=2, ridge__alpha=0.1;, score=-1270983144.294 total time=   0.4s
[CV 4/5] END preprocessor__num__poly__degree=2, ridge__alpha=0.1;, score=-7782826595.761 total time=   0.6s
[CV 5/5] END preprocessor__num__poly__degree=2, ridge__alpha=0.1;, score=-860571295.012 total time=   0.5s
[CV 1/5] END preprocessor__num__poly__degree=2, ridge__alpha=1.0;, score=-3065918867.810 total time=   0.5s
[CV 2/5] END preprocessor__num__poly__degree=2, ridge__alpha=1.0;, score=-8264022523.219 total time=   0.5s
[CV 3/5] END preprocessor__num__poly__degree=2, ridge__alpha=1.0;, score=-826738523.019 total time=   0.5s
[CV 4/5] END preprocessor__num__poly__degree=2, ridge__alpha=1.0;, score=-233