In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df = pd.read_csv("train.csv")

tier1 = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars',
    'GarageArea', '1stFlrSF', 'YearBuilt', 'FullBath',
    'TotRmsAbvGrd', 'Fireplaces', 'MasVnrArea',
    'BsmtQual', 'ExterQual', 'KitchenQual'
]

qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
df['BsmtQual'] = df['BsmtQual'].map(qual_map)
df['ExterQual'] = df['ExterQual'].map(qual_map)
df['KitchenQual'] = df['KitchenQual'].map(qual_map)

df_t1 = df[tier1 + ['SalePrice']].dropna()

X = df_t1[tier1]
y = df_t1['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared=False)
    print(f"{name} RMSE: {rmse:.2f}")



LinearRegression RMSE: 32292.57
RandomForest RMSE: 26826.87




In [4]:
tier2 = [
    'Neighborhood', 'YearRemodAdd', 'Foundation',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtCond',
    'GarageFinish', 'GarageYrBlt', 'MSSubClass',
    'HouseStyle', 'LotArea', 'WoodDeckSF',
    'OpenPorchSF', 'CentralAir', 'MSZoning'
]

selected_cols = tier1 + tier2

qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
bsmtcond_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
centralair_map = {'Y': 1, 'N': 0}

df_copy = df.copy()
df_copy['BsmtQual'] = df_copy['BsmtQual'].map(qual_map)
df_copy['ExterQual'] = df_copy['ExterQual'].map(qual_map)
df_copy['KitchenQual'] = df_copy['KitchenQual'].map(qual_map)
df_copy['BsmtCond'] = df_copy['BsmtCond'].map(bsmtcond_map)
df_copy['GarageFinish'] = df_copy['GarageFinish'].map(garagefinish_map)
df_copy['CentralAir'] = df_copy['CentralAir'].map(centralair_map)

cat_cols = ['Neighborhood', 'Foundation', 'HouseStyle', 'MSZoning']
df_encoded = pd.get_dummies(df_copy[selected_cols + ['SalePrice']], columns=cat_cols)

df_encoded = df_encoded.fillna(0)

X = df_encoded.drop('SalePrice', axis=1)
y = df_encoded['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared=False)
    print(f"{name} RMSE (Tier1+2): {rmse:.2f}")



LinearRegression RMSE (Tier1+2): 32089.42
RandomForest RMSE (Tier1+2): 27002.59




In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

tier1 = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars',
    'GarageArea', '1stFlrSF', 'YearBuilt', 'FullBath',
    'TotRmsAbvGrd', 'Fireplaces', 'MasVnrArea',
    'BsmtQual', 'ExterQual', 'KitchenQual'
]

tier2 = [
    'Neighborhood', 'YearRemodAdd', 'Foundation',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtCond',
    'GarageFinish', 'GarageYrBlt', 'MSSubClass',
    'HouseStyle', 'LotArea', 'WoodDeckSF',
    'OpenPorchSF', 'CentralAir', 'MSZoning'
]

tier3 = [
    'Street', 'Alley', 'Utilities', 'PoolQC', 'Fence',
    'MiscFeature', 'MoSold', 'YrSold', 'Condition2',
    'Heating', 'Electrical', 'Functional', 'PavedDrive'
]

all_features = tier1 + tier2 + tier3

qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
centralair_map = {'Y': 1, 'N': 0}

def preprocess(df):
    df = df.copy()

    df['BsmtQual'] = df['BsmtQual'].map(qual_map)
    df['ExterQual'] = df['ExterQual'].map(qual_map)
    df['KitchenQual'] = df['KitchenQual'].map(qual_map)
    df['BsmtCond'] = df['BsmtCond'].map(qual_map)
    df['GarageFinish'] = df['GarageFinish'].map(garagefinish_map)
    df['CentralAir'] = df['CentralAir'].map(centralair_map)

    selected = all_features + ['Id']
    df = df[selected]

    cat_cols = df.select_dtypes(include='object').columns
    df = pd.get_dummies(df, columns=cat_cols)

    return df.fillna(0)

train_proc = preprocess(train)
test_proc = preprocess(test)

X = train_proc.drop("Id", axis=1)
y = train["SalePrice"]
X, test_proc = X.align(test_proc.drop("Id", axis=1), join='left', axis=1, fill_value=0)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

preds = model.predict(test_proc)
submission['SalePrice'] = preds
submission.to_csv("Submission_TierSystem_LR.csv", index=False)

In [11]:
16468.94362

16468.94362

In [13]:
from sklearn.linear_model import Ridge

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

tier1 = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars',
    'GarageArea', '1stFlrSF', 'YearBuilt', 'FullBath',
    'TotRmsAbvGrd', 'Fireplaces', 'MasVnrArea',
    'BsmtQual', 'ExterQual', 'KitchenQual'
]
tier2 = [
    'Neighborhood', 'YearRemodAdd', 'Foundation',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtCond',
    'GarageFinish', 'GarageYrBlt', 'MSSubClass',
    'HouseStyle', 'LotArea', 'WoodDeckSF',
    'OpenPorchSF', 'CentralAir', 'MSZoning'
]
selected = tier1 + tier2

qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
binary_map = {'Y': 1, 'N': 0}

def encode(df):
    df = df.copy()
    df['BsmtQual'] = df['BsmtQual'].map(qual_map)
    df['ExterQual'] = df['ExterQual'].map(qual_map)
    df['KitchenQual'] = df['KitchenQual'].map(qual_map)
    df['BsmtCond'] = df['BsmtCond'].map(qual_map)
    df['GarageFinish'] = df['GarageFinish'].map(garagefinish_map)
    df['CentralAir'] = df['CentralAir'].map(binary_map)
    return df

X = encode(train[selected])
X_test = encode(test[selected])

X = pd.get_dummies(X, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

X['GrLivArea_squared'] = X['GrLivArea'] ** 2
X['TotalBsmtSF_squared'] = X['TotalBsmtSF'] ** 2
X['OverallQual_x_YearBuilt'] = X['OverallQual'] * X['YearBuilt']
X['GrLivArea_x_OverallQual'] = X['GrLivArea'] * X['OverallQual']

X_test['GrLivArea_squared'] = X_test['GrLivArea'] ** 2
X_test['TotalBsmtSF_squared'] = X_test['TotalBsmtSF'] ** 2
X_test['OverallQual_x_YearBuilt'] = X_test['OverallQual'] * X_test['YearBuilt']
X_test['GrLivArea_x_OverallQual'] = X_test['GrLivArea'] * X_test['OverallQual']

X = X.fillna(0)
X_test = X_test.fillna(0)

y = train['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
model = Ridge(alpha=10)
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds, squared=False)
print(f"Ridge Validation RMSE: {rmse:.2f}")

test_preds = model.predict(X_test)
submission['SalePrice'] = test_preds
submission.to_csv("Submission_TierSystem_Ridge.csv", index=False)

Ridge Validation RMSE: 28276.40




In [14]:
17625.58894

17625.58894

In [17]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

tier1 = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars',
    'GarageArea', '1stFlrSF', 'YearBuilt', 'FullBath',
    'TotRmsAbvGrd', 'Fireplaces', 'MasVnrArea',
    'BsmtQual', 'ExterQual', 'KitchenQual'
]
tier2 = [
    'Neighborhood', 'YearRemodAdd', 'Foundation',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtCond',
    'GarageFinish', 'GarageYrBlt', 'MSSubClass',
    'HouseStyle', 'LotArea', 'WoodDeckSF',
    'OpenPorchSF', 'CentralAir', 'MSZoning'
]
selected = tier1 + tier2

qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
garagefinish_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
binary_map = {'Y': 1, 'N': 0}

def encode(df):
    df = df.copy()
    df['BsmtQual'] = df['BsmtQual'].map(qual_map)
    df['ExterQual'] = df['ExterQual'].map(qual_map)
    df['KitchenQual'] = df['KitchenQual'].map(qual_map)
    df['BsmtCond'] = df['BsmtCond'].map(qual_map)
    df['GarageFinish'] = df['GarageFinish'].map(garagefinish_map)
    df['CentralAir'] = df['CentralAir'].map(binary_map)
    return df

X = encode(train[selected])
X_test = encode(test[selected])
X = pd.get_dummies(X, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

X = X.fillna(0)
X_test = X_test.fillna(0)
y = train['SalePrice']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbosity': -1,
    'seed': 42
}

callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_val],
    callbacks=callbacks
)

val_preds = model.predict(X_val, num_iteration=model.best_iteration)
rmse = mean_squared_error(y_val, val_preds, squared=False)
print(f"LightGBM RMSE: {rmse:.2f}")

test_preds = model.predict(X_test, num_iteration=model.best_iteration)
submission['SalePrice'] = test_preds
submission.to_csv("Submission_TierSystem_LGBM.csv", index=False)

LightGBM RMSE: 30213.39




In [18]:
16183.21670

16183.2167