In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# Locations of the competition CSV files (raw strings keep the Windows
# backslashes literal).
train_data_path = r"D:\machine_learning\house pred\house-prices-advanced-regression-techniques\train.csv"
test_data_path = r"D:\machine_learning\house pred\house-prices-advanced-regression-techniques\test.csv"

df = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)

# 'Id' is dropped from the training frame before any feature selection.
df.drop('Id', axis=1, inplace=True)

# Report every training column that contains at least one missing value,
# together with its missing count and dtype.
missing_counts = df.isna().sum()
for col in df.columns:
    n_missing = missing_counts[col]
    if n_missing > 0:
        print(f'Column: {col}, Missing Values: {n_missing}, Type: {df[col].dtype}')


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns matching a single dtype.

    Parameters
    ----------
    dtype
        Dtype specification handed to ``select_dtypes`` (wrapped in a
        one-element list), e.g. ``np.number`` or ``object``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself unchanged."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        ``X`` must provide ``select_dtypes`` (i.e. behave like a pandas
        DataFrame).
        """
        wanted = [self.dtype]
        return X.select_dtypes(include=wanted)


# Partition the training columns by dtype. 'SalePrice' is the target, so it
# is removed from the numeric feature list.
string_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=['number']).columns.drop('SalePrice')

# Numeric features: keep numeric columns, fill gaps with the column mean.
numeric_steps = [
    ('selector', ColumnSelector(dtype=np.number)),
    ('imputer', SimpleImputer(strategy='mean')),
]
numeric_pipeline = Pipeline(numeric_steps)

# Object features: keep object columns, fill gaps with the most frequent
# value, then one-hot encode (handle_unknown='ignore' so categories not seen
# during fitting do not raise at transform time).
string_steps = [
    ('selector', ColumnSelector(dtype=object)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
string_pipeline = Pipeline(string_steps)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_columns),
        ('cat', string_pipeline, string_columns),
    ]
)

# Fixed random_state keeps the forest reproducible between runs.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Full workflow: preprocessing followed by the regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])


# Separate the features from the 'SalePrice' target.
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Hold out 20% of the rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Fit preprocessing and model together, then score on the held-out rows.
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)

score = mean_absolute_error(y_valid, preds)
print(f'Mean Absolute Error: {score}')


# Keep the test Ids for the submission file, then drop the column from the
# test frame (the training frame had 'Id' dropped as well).
df_test_ids = df_test['Id']
df_test.drop(columns=['Id'], inplace=True)

# `preprocessor` and `model` are the very step objects that were passed to
# `pipeline`, which was fitted above, so they are applied here in the same
# order the pipeline runs them.
test_data_processed = preprocessor.transform(df_test)
test_preds = model.predict(test_data_processed)

# One row per test house: its Id and the predicted sale price.
submission = pd.DataFrame({
    'Id': df_test_ids,
    'SalePrice': test_preds
})

# Raw string: the original non-raw literal relied on '\m', '\h' and '\s'
# being passed through unchanged. Those are invalid escape sequences and emit
# a SyntaxWarning on recent Python versions. The resulting path is identical.
submission_path = r'D:\machine_learning\house pred\submission.csv'
submission.to_csv(submission_path, index=False)
print(f'Submission file saved to: {submission_path}')


Column: LotFrontage, Missing Values: 259, Type: float64
Column: Alley, Missing Values: 1369, Type: object
Column: MasVnrType, Missing Values: 872, Type: object
Column: MasVnrArea, Missing Values: 8, Type: float64
Column: BsmtQual, Missing Values: 37, Type: object
Column: BsmtCond, Missing Values: 37, Type: object
Column: BsmtExposure, Missing Values: 38, Type: object
Column: BsmtFinType1, Missing Values: 37, Type: object
Column: BsmtFinType2, Missing Values: 38, Type: object
Column: Electrical, Missing Values: 1, Type: object
Column: FireplaceQu, Missing Values: 690, Type: object
Column: GarageType, Missing Values: 81, Type: object
Column: GarageYrBlt, Missing Values: 81, Type: float64
Column: GarageFinish, Missing Values: 81, Type: object
Column: GarageQual, Missing Values: 81, Type: object
Column: GarageCond, Missing Values: 81, Type: object
Column: PoolQC, Missing Values: 1453, Type: object
Column: Fence, Missing Values: 1179, Type: object
Column: MiscFeature, Missing Values: 1406, 