In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

def preprocess_all(df, is_train=True):
    df = df.copy()
    if is_train:
        df = df.drop(['SalePrice'], axis=1)
    df = df.drop(['Id'], axis=1)

    qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
    bsmtfin_map = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
    bsmtexp_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}
    garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
    binary_map = {'Y': 1, 'N': 0}

    for col in ['BsmtQual', 'ExterQual', 'KitchenQual', 'BsmtCond', 'GarageQual', 'GarageCond']:
        df[col] = df[col].map(qual_map)

    df['BsmtExposure'] = df['BsmtExposure'].map(bsmtexp_map)
    df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmtfin_map)
    df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmtfin_map)
    df['GarageFinish'] = df['GarageFinish'].map(garagefinish_map)
    df['CentralAir'] = df['CentralAir'].map(binary_map)

    df = pd.get_dummies(df, drop_first=True)

    return df.fillna(0)

X = preprocess_all(train)
y = train['SalePrice']
X_test = preprocess_all(test, is_train=False)

X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse:.2f}")

test_preds = model.predict(X_test)
submission['SalePrice'] = test_preds
submission.to_csv("Submission_AllVariables_Tree.csv", index=False)

Validation RMSE: 27278.26




In [2]:
16516.02754

16516.02754