In [1]:
import pandas as pd

# Load the training and test tables, using the 'Id' column as the row index
X = pd.read_csv(r"C:\Users\fogat\ML challenges\housing challenge\train.csv", index_col='Id')
X_test = pd.read_csv(r"C:\Users\fogat\ML challenges\housing challenge\test.csv", index_col='Id')

# Discard rows that have no target value, then split the target off from the predictors
X = X.dropna(subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Keep things simple: remove every column that has a missing value in the training
# predictors, and remove the same columns from the test table
cols_with_missing = [name for name, has_gap in X.isnull().any().items() if has_gap]
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)



In [2]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set (seeded so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# The modelling pipeline itself is assembled in a later cell.



In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Helper used to compare different preprocessing approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train : training predictors and target passed to ``fit``.
    X_valid, y_valid : validation predictors and target used for scoring.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the forest's predictions
    for ``X_valid``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [4]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer


class PreprocessTransformer(BaseEstimator, TransformerMixin):
    """
    Sklearn-compatible preprocessing transformer.

    Performs:
      - One-hot encoding for low-cardinality categorical (object-dtype) features,
        i.e. those with fewer than ``low_card_threshold`` unique values
      - Ordinal encoding for the remaining (high-cardinality) categorical features
      - Imputation of missing values using ``impute_strategy`` (median by default)
      - Handles unseen categories in validation/test sets: they are encoded as -1
        by the ordinal encoder and ignored by the one-hot encoder

    Only ``object`` columns (categorical) and ``int64``/``float64`` columns
    (numeric) are used; columns of any other dtype are left out of the output.

    Parameters
    ----------
    low_card_threshold : int, default=10
        Object columns with strictly fewer unique values than this are one-hot
        encoded; the others are ordinal encoded.
    impute_strategy : str, default="median"
        Strategy forwarded to ``SimpleImputer``.

    Attributes set by ``fit``
    -------------------------
    object_cols_, low_card_cols_, high_card_cols_, num_cols_ : list
        Column names per group, each in the order they appear in the fitted frame.
    ordinal_encoder, onehot_encoder, imputer
        The fitted helper estimators (``None`` until ``fit`` is called).
    """

    def __init__(self, low_card_threshold=10, impute_strategy="median"):
        self.low_card_threshold = low_card_threshold
        self.impute_strategy = impute_strategy
        self.ordinal_encoder = None
        self.onehot_encoder = None
        self.imputer = None

    def fit(self, X, y=None):
        """Learn the column groups, the category encodings and the imputation values.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        X = pd.DataFrame(X).copy()

        # Identify categorical and numeric columns
        self.object_cols_ = [col for col in X.columns if X[col].dtype == "object"]
        self.low_card_cols_ = [
            col for col in self.object_cols_
            if X[col].nunique() < self.low_card_threshold
        ]
        # Keep the original column order. A set difference would make the output
        # column order depend on string hashing, which can change between
        # interpreter sessions and makes fitted models non-reproducible.
        low_card = set(self.low_card_cols_)
        self.high_card_cols_ = [col for col in self.object_cols_ if col not in low_card]
        self.num_cols_ = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]

        # Initialize encoders
        self.ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        self.onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        # keep_empty_features=True stops the imputer from dropping a column that is
        # entirely missing at fit time; otherwise transform() would fail when it
        # relabels the imputed array with the full list of encoded column names.
        self.imputer = SimpleImputer(strategy=self.impute_strategy, keep_empty_features=True)

        # Fit encoders
        if self.high_card_cols_:
            self.ordinal_encoder.fit(X[self.high_card_cols_])
        if self.low_card_cols_:
            self.onehot_encoder.fit(X[self.low_card_cols_])

        # The imputer is fitted on the encoded frame so it sees the final layout
        X_enc = self._encode(X)
        self.imputer.fit(X_enc)

        return self

    def transform(self, X):
        """Encode and impute ``X``; returns a DataFrame that keeps the index of ``X``."""
        X = pd.DataFrame(X).copy()
        X_enc = self._encode(X)

        # Apply imputation
        X_imp = pd.DataFrame(
            self.imputer.transform(X_enc),
            columns=X_enc.columns,
            index=X.index
        )
        return X_imp

    def _encode(self, X):
        """Encodes the input data (numeric + categorical encodings).

        Columns are laid out as: numeric, then ordinal-encoded, then one-hot
        encoded. Raises ``ValueError`` if none of these groups has any column.
        """
        parts = []

        # Numeric columns
        if self.num_cols_:
            parts.append(X[self.num_cols_])

        # Ordinal-encoded columns
        if self.high_card_cols_:
            X_high = pd.DataFrame(
                self.ordinal_encoder.transform(X[self.high_card_cols_]),
                columns=self.high_card_cols_,
                index=X.index
            )
            parts.append(X_high)

        # One-hot-encoded columns
        if self.low_card_cols_:
            X_low = pd.DataFrame(
                self.onehot_encoder.transform(X[self.low_card_cols_]),
                columns=self.onehot_encoder.get_feature_names_out(self.low_card_cols_),
                index=X.index
            )
            parts.append(X_low)

        if not parts:
            # pd.concat([]) would raise a ValueError with an unhelpful message
            raise ValueError(
                "PreprocessTransformer found no object, int64 or float64 columns to encode."
            )

        return pd.concat(parts, axis=1)

    def get_feature_names_out(self, input_features=None):
        """Return final feature names after preprocessing.

        The order matches the column order produced by ``transform``.
        ``input_features`` is accepted for API compatibility and is not used.
        """
        names = list(self.num_cols_)
        names += list(self.high_card_cols_)
        if self.low_card_cols_:
            names += list(self.onehot_encoder.get_feature_names_out(self.low_card_cols_))
        return np.array(names)



In [37]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

# Chain the preprocessing step and the gradient-boosted regressor into one estimator
pipe = Pipeline(
    steps=[
        ("preprocess", PreprocessTransformer(low_card_threshold=15)),
        (
            "model",
            GradientBoostingRegressor(
                n_estimators=1000,
                learning_rate=0.05,
                n_iter_no_change=5,
                random_state=0,
            ),
        ),
    ]
)

# Train on the training split only
pipe.fit(X_train, y_train)

In [38]:
# Score the pipeline with 8-fold cross-validation (R^2) on all labelled rows
from sklearn.model_selection import cross_val_score

scores = cross_val_score(pipe, X, y, cv=8, scoring='r2')

print("r2 scores:\n", scores)
print("Average r2 score (across experiments):", scores.mean(), sep="\n")


r2 scores:
 [0.91343366 0.90360297 0.85802972 0.89618297 0.89272876 0.90272191
 0.88833756 0.87675461]
Average r2 score (across experiments):
0.8914740209118641


In [39]:
# Predict with the fitted pipeline on the test predictors
preds_test = pipe.predict(X_test)

# Write the predictions to 'submission.csv': one row per test Id, no index column
output = pd.DataFrame({'Id': X_test.index}).assign(SalePrice=preds_test)
output.to_csv('submission.csv', index=False)
