# House Price CV & Baseline Model

In [None]:
import seaborn as sns

# NOTE(review): this star import presumably supplies every name used below
# that has no explicit import in this notebook (pd, Dict, Set, load_data,
# mutual_info_regression, OrdinalEncoder, OneHotEncoder, create_submission)
# -- confirm against house_price_utils.
from house_price_utils import *

# Project helper from house_price_utils; presumably configures the notebook
# environment -- TODO confirm what it sets up.
setup_notebook()

In [None]:
# Load the training and test frames through the project helper.
data, data_test = load_data()

# Reorder the training columns alphabetically.
data = data.loc[:, sorted(data.columns)]

# Separate the regression target from the raw predictors.
y = data["SalePrice"]
X_raw = data.drop(columns="SalePrice")

In [None]:
def make_mi_scores(X, y, discrete_features, *, random_state=None):
    """Return mutual-information scores between each column of ``X`` and ``y``.

    Args:
        X: Feature frame; its column labels become the index of the result.
        y: Regression target passed to ``mutual_info_regression``.
        discrete_features: Forwarded unchanged to ``mutual_info_regression``.
        random_state: Optional seed forwarded to ``mutual_info_regression``.
            The default ``None`` leaves the estimator unseeded, exactly as
            before; pass an int to get the same scores on every run.

    Returns:
        A ``pd.Series`` named ``"MI Scores"`` sorted in ascending order, so
        the highest-scoring features sit at the end.
    """
    raw_scores = mutual_info_regression(
        X,
        y,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return scores.sort_values()


def group_features(df: pd.DataFrame, ordinal_cutoff: int = 10) -> Dict[str, Set]:
    """Partition the columns of ``df`` into nominal, ordinal and continuous sets.

    Object-dtype columns are reported as ``"nominal"``.  Every other column is
    ``"ordinal"`` when it has fewer than ``ordinal_cutoff`` distinct non-null
    values and ``"continuous"`` otherwise.

    Args:
        df: Frame whose columns are to be grouped.
        ordinal_cutoff: Exclusive upper bound on the number of distinct values
            for a non-object column to count as ordinal.

    Returns:
        A dict with the keys ``"nominal"``, ``"ordinal"`` and ``"continuous"``,
        each mapping to a set of column labels.
    """
    nominal = set(df.select_dtypes(include=object).columns)
    ordinal: Set = set()
    continuous: Set = set()

    for name in set(df.columns) - nominal:
        # Low-cardinality non-object columns are treated as ordinal.
        bucket = ordinal if df[name].nunique() < ordinal_cutoff else continuous
        bucket.add(name)

    return {"nominal": nominal, "ordinal": ordinal, "continuous": continuous}


def clean(df: pd.DataFrame, features: Dict[str, Set]) -> pd.DataFrame:
    """Impute and encode ``df`` into one combined feature frame.

    Continuous columns are mean-imputed, ordinal columns go through
    ``OrdinalEncoder`` and nominal columns through ``OneHotEncoder``.  Any
    column of the combined frame that still contains a missing value is
    dropped.

    Args:
        df: Raw feature frame.
        features: Mapping with the keys ``"continuous"``, ``"ordinal"`` and
            ``"nominal"``, each holding a set of column labels of ``df``.

    Returns:
        A new frame whose columns are laid out as continuous, then ordinal,
        then nominal, with each group processed in sorted label order.

    Raises:
        AssertionError: If a continuous column still has missing values after
            mean imputation (for example, a column with no observed values).
    """
    # Set iteration order for string labels can differ between interpreter
    # sessions, which would shuffle the output columns and make seeded models
    # irreproducible; sort each group to pin the column order.
    cont_cols = sorted(features["continuous"], key=str)
    ord_cols = sorted(features["ordinal"], key=str)
    nom_cols = sorted(features["nominal"], key=str)

    df_cont = df[cont_cols].copy()
    df_cont = df_cont.fillna(df_cont.mean())
    assert not df_cont.isnull().any().any(), (
        "continuous columns still contain missing values after mean imputation"
    )

    parts = [df_cont]

    # Empty groups are skipped so no encoder is fitted on a zero-column frame.
    # TODO: make this configurable since ordinal encoding doesn't work well with linear models
    if ord_cols:
        encoder = OrdinalEncoder().set_output(transform="pandas")
        parts.append(encoder.fit_transform(df[ord_cols]))

    if nom_cols:
        encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
        parts.append(encoder.fit_transform(df[nom_cols]))

    df_clean = pd.concat(parts, axis=1)
    # Drop every column that still holds at least one missing value.
    return df_clean.dropna(axis="columns", how="any")

In [None]:
# Clean the train and test predictors together so that both halves end up
# with the same encoded columns.
full_data = pd.concat([X_raw, data_test])
features = group_features(full_data)
full_clean_data = clean(full_data, features)

# Split by position rather than by index membership: `clean` only removes
# columns, so the first len(X_raw) rows are exactly the training rows.  A
# label-based lookup would put rows in both halves whenever the two frames
# share index values.
X = full_clean_data.iloc[: len(X_raw)]
X_test = full_clean_data.iloc[len(X_raw) :]

In [None]:
# Display the shapes of the train and test feature matrices.
X.shape, X_test.shape

In [None]:
# Keep the ten features with the highest mutual-information scores; the scores
# come back sorted ascending, so the last ten index labels are the top ten.
# NOTE(review): nothing in the cells visible here uses selected_features --
# the model below is trained on all of X; confirm whether that is intended.
selected_features = make_mi_scores(X, y, "auto").index[-10:].tolist()
selected_features

# Baseline Model
Given the large number of categorical variables, let's start with a random forest so that we don't have to do any further encoding.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

In [None]:
# Random forest with default hyperparameters; random_state is fixed at 0.
model = RandomForestRegressor(random_state=0)

In [None]:
# 10-fold cross-validation scored with "neg_mean_absolute_error" (MAE with its
# sign flipped).  Training-fold scores are kept as well so that train and test
# error can be compared.
scores = cross_validate(
    model,
    X,
    y,
    cv=10,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
)

In [None]:
# Mean and standard deviation of the training-fold MAE; the mean is negated to
# undo the sign of the "neg_" scorer (the standard deviation is unaffected).
-scores["train_score"].mean(), scores["train_score"].std()

In [None]:
# Mean and standard deviation of the held-out-fold MAE; the mean is negated to
# undo the sign of the "neg_" scorer (the standard deviation is unaffected).
-scores["test_score"].mean(), scores["test_score"].std()

## Observations
* The model is overfitting: the training error above is noticeably lower than the cross-validated test error.


In [None]:
from sklearn.model_selection import ValidationCurveDisplay

In [None]:
# Plot train and test score curves while varying the forest's n_estimators
# over the listed values.  Scores use "neg_mean_absolute_error", so values
# closer to zero are better.  No cv argument is given, so the display's
# default splitting applies.
ValidationCurveDisplay.from_estimator(
    model,
    X,
    y,
    param_name="n_estimators",
    param_range=[50, 100, 200, 400, 800],
    scoring="neg_mean_absolute_error",
)

# Submit

In [None]:
# Fit the baseline model on the full training set.
model.fit(X, y)

In [None]:
# Project helper from house_price_utils; presumably builds the submission from
# the model's predictions on X_test -- TODO confirm its output location/format.
create_submission(model, X_test)