# House Price EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Plot theme and notebook display limits.
plt.style.use("ggplot")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)
# Silence only this one deprecation message; other warnings still show.
warnings.filterwarnings(
    action="ignore",
    message="use_inf_as_na option is deprecated",
)

In [None]:
# Read the train and test CSVs, using the "Id" column as the row index.
data = pd.read_csv("../input/train.csv", index_col="Id")
data_test = pd.read_csv("../input/test.csv", index_col="Id")
# Put the training columns in alphabetical order.
data = data.loc[:, sorted(data.columns)]
# Separate the target from the feature columns.
target = data["SalePrice"]
X = data.drop(columns="SalePrice")

In [None]:
# (rows, columns) of the training frame, SalePrice column included.
data.shape

# Feature Data Analysis

In [None]:
# Display the training features (the training frame without SalePrice).
X

In [None]:
# Summary statistics of the training features.
X.describe()

In [None]:
# Same summary for the test set, for side-by-side comparison with the cell above.
data_test.describe()

In [None]:
# Partition the feature columns by dtype: numeric vs. object.
num_features = list(X.select_dtypes(include=[np.number]).columns)
cat_features = list(X.select_dtypes(include=[object]).columns)

# Sanity check: the two groups together account for every column.
assert len(num_features) + len(cat_features) == len(X.columns)

print(f"{len(num_features)=}, {len(cat_features)=}")

In [None]:
# Distinct-value counts of the float64 features, highest first.
(
    data[num_features]
    .select_dtypes(include="float64")
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Ten int64 features with the most distinct values.
(
    data[num_features]
    .select_dtypes(include="int64")
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten categorical features with the most distinct levels.
data[cat_features].nunique().sort_values(ascending=False).head(10)

In [None]:
# Mean and standard deviation of SalePrice per neighborhood, lowest mean first.
(
    data.groupby("Neighborhood")
    .agg({"SalePrice": ["mean", "std"]})
    .sort_values(by=("SalePrice", "mean"), ascending=True)
)

## Observations
* `Qual` features are ordinal
* `Cond` features are ordinal
* More categorical than numerical features.
* Quite a few cat features with cardinality greater than 5. `Neighborhood` is an especially big one. May need to do target encoding.

# Check Train/Test Imbalances

## Numerical Features

In [None]:
# Absolute gap between train and test means for each numeric feature,
# expressed in units of the training standard deviation, largest first.
(
    data[num_features]
    .mean()
    .sub(data_test[num_features].mean())
    .div(data[num_features].std())
    .abs()
    .sort_values(ascending=False)
)

## Categorical Features

In [None]:
from scipy.stats import chi2_contingency

In [None]:
def chi_square_test(train_data, valid_data, cat_features):
    """Chi-squared test of each categorical column's train/valid distribution.

    For every column name in ``cat_features`` the category counts of
    ``train_data`` and ``valid_data`` are placed side by side and passed to
    ``chi2_contingency``.

    Returns a DataFrame with columns ``category``, ``chi_squared`` and
    ``p_value``, ordered by ascending ``p_value``.
    """
    rows = []
    for col in cat_features:
        counts = pd.DataFrame(
            {
                "train": train_data[col].value_counts(),
                "valid": valid_data[col].value_counts(),
            }
        )
        # A category seen on only one side has no count on the other: use 0.
        counts = counts.fillna(0)
        statistic, p_value = chi2_contingency(counts)[:2]
        rows.append((col, statistic, p_value))
    summary = pd.DataFrame(rows, columns=["category", "chi_squared", "p_value"])
    return summary.sort_values("p_value")

In [None]:
# Compare categorical distributions between the train and test sets.
chi_results = chi_square_test(
    train_data=data,
    valid_data=data_test,
    cat_features=cat_features,
)

In [None]:
# Ten features with the smallest p-values.
chi_results.sort_values(by="p_value").head(10)

## Observations
* No large standardized mean differences between train and test numeric data.
* No cat features seem to have significantly different proportions at 1% level.
* Seems like train and validation data sets are balanced across numerical and categorical features.
* May want to stratify across `Neighborhood`.

# Target Analysis

In [None]:
# Summary statistics of SalePrice.
target.describe()

In [None]:
# Kernel density estimate of the raw SalePrice distribution.
g = sns.displot(data=target, kind="kde")

In [None]:
# Kernel density estimate of log(SalePrice).
# Bind this plot's own FacetGrid to `g` so the axis label is applied to this
# figure instead of whatever grid `g` referred to before this cell ran.
g = sns.displot(np.log(target), kind="kde")
_ = g.ax.set(xlabel="Log SalePrice")

## Observations
SalePrice is right skewed, may benefit from Log or Box-Cox transforming.

# Missing values

In [None]:
# Fraction (0-1) of rows missing in each column that has any gaps,
# ordered from most to least missing.
missing_percent = (
    data.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
    / len(data)
)
missing_percent

In [None]:
# Columns in which more than half of the rows are missing.
mostly_missing = missing_percent.loc[missing_percent > 0.5]
mostly_missing

In [None]:
# One box plot of SalePrice per mostly-missing column, with missing entries
# shown as their own "N/A" level.
for c in mostly_missing.index:
    # Fill only the column being plotted: filling the whole frame on every
    # iteration repeats the same work and writes "N/A" strings into
    # unrelated numeric columns.
    sns.catplot(
        data=data[[c, "SalePrice"]].fillna({c: "N/A"}),
        x=c,
        y="SalePrice",
        kind="box",
    )

In [None]:
# Scatter of SalePrice against LotFrontage.
ax = data.plot.scatter(x="LotFrontage", y="SalePrice")

In [None]:
 data[['LotFrontage', 'SalePrice']].corr()

## Observations
* `Qual` features are ordinal
* `Cond` features are ordinal
* `PoolQC`, `MiscFeature`, `Alley`, `Fence`, `MasVnrType` mostly missing and don't seem to really separate the SalePrice so can drop.
* `LotFrontage` has many missing values but seems reasonably correlated with the target, so will impute missing values for it.
* `FireplaceQu` has many missing values, which seem to come from the `NA` category being read as missing. This can be filled in.

In [None]:
# Remove the mostly-missing columns from the categorical feature list
# (membership is checked against the column names in the Series index).
cat_features = [
    name for name in cat_features if name not in mostly_missing.index
]
len(cat_features)

# Duplicates

In [None]:
# True when no row is an exact duplicate of an earlier row.
not data.duplicated().any()

In [None]:
# Number of rows whose categorical values repeat those of an earlier row.
data.duplicated(subset=cat_features).sum()

In [None]:
# Rows sharing all categorical values with at least one other row, sorted
# so matching rows sit next to each other; SalePrice is shown first.
(
    data.loc[
        data.duplicated(subset=cat_features, keep=False),
        ["SalePrice", *cat_features],
    ].sort_values(by=cat_features)
)

In [None]:
# Non-null counts per unique combination of categorical values, ordered by
# the YearBuilt count so the largest groups come first.
(
    data.groupby(by=cat_features)
    .count()
    .sort_values(by="YearBuilt", ascending=False)
)

In [None]:
# Rows whose numeric feature values are identical to those of another row.
data.loc[data.duplicated(subset=num_features, keep=False)]

## Observations
* There are a few cat feature duplicates but they differ on the num features and targets so they look like genuine samples. 
* One num feature duplicate entry looks like `BsmtExposure` changed. Just a single value so probably not a big deal.


# Ordinal vs Nominal

In [None]:
# Numeric features with at most 30 distinct values, highest cardinality first.
(
    data[num_features]
    .nunique()
    .sort_values(ascending=False)
    .loc[lambda n_unique: n_unique <= 30]
)

In [None]:
# Numeric-typed columns that are treated as nominal / ordinal from here on.
nom_features = ["MSSubClass"]
ord_features = ["OverallQual", "OverallCond", "YrSold", "MoSold"]

# Take them out of the numeric feature list, modifying the list in place.
num_features[:] = [
    name for name in num_features if name not in nom_features + ord_features
]

## Observations
* `BedroomAbvGr` not documented.
* `MSSubClass` is nominal feature.
* `OverallQual`, `OverallCond`, `YrSold`, `MoSold` are ordinal.

# Feature Relationships

In [None]:
# Pairwise correlations among SalePrice and the numeric and ordinal
# features, shown as a colour-graded table.
(
    data[["SalePrice", *num_features, *ord_features]]
    .corr()
    .style.background_gradient(cmap="coolwarm")
)

## Observations
* Some high correlations between garage features and years things were built.
* Nothing looks too high to be of concern at this point.

In [None]:
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X, y, discrete_features):
    """Return the mutual information between each column of ``X`` and ``y``.

    ``discrete_features`` is forwarded unchanged to
    ``mutual_info_regression``. The result is a Series named "MI Scores",
    indexed by ``X.columns`` and sorted in ascending order.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    return pd.Series(raw_scores, index=X.columns, name="MI Scores").sort_values()

In [None]:
# Every feature handled as discrete: nominal, then ordinal, then categorical.
discrete_features = [*nom_features, *ord_features, *cat_features]

In [None]:
def preprocess(data, num_feaures, discrete_features):
    """Build a fully numeric feature matrix from ``data``.

    Parameters
    ----------
    data : DataFrame holding at least the requested columns.
    num_feaures : names of the numeric columns. The misspelled parameter
        name is kept so existing callers keep working.
    discrete_features : names of the columns to integer-encode.

    Returns
    -------
    A new DataFrame with the numeric columns (as float, missing values
    replaced by the column mean) followed by the discrete columns (as integer
    codes, with missing values given a code of their own). Rows that still
    contain a missing value are dropped. ``data`` itself is not modified.
    """
    # Use the arguments actually passed in; the body previously read a
    # module-level `num_features` and silently ignored this parameter.
    num_cols = list(num_feaures)
    disc_cols = list(discrete_features)

    X = data[num_cols + disc_cols].astype({c: float for c in num_cols})
    for c in num_cols:
        X[c] = X[c].fillna(X[c].mean())
    for c in disc_cols:
        # use_na_sentinel=False encodes missing values as a regular level
        # instead of -1.
        codes, _ = X[c].factorize(use_na_sentinel=False)
        X[c] = codes.astype(int)
    # Only a numeric column with no observed values can still hold NaN here.
    return X.dropna()

In [None]:
# Rebind X to the imputed, integer-encoded feature matrix.
X = preprocess(data, num_features, discrete_features)

In [None]:
# Display the current contents of X.
X

In [None]:
# (rows, columns) of X.
X.shape

In [None]:
# Tell the estimator which columns are discrete via a boolean mask over
# X's columns. With "auto", scikit-learn treats every column of a dense
# input as continuous, which mis-scores the integer-encoded categoricals.
mi_scores = make_mi_scores(
    X, target[X.index], X.columns.isin(discrete_features)
)

In [None]:
# Horizontal bar chart of the mutual-information scores.
ax = mi_scores.plot.barh(figsize=(10, 10))
# Small tick labels so all feature names fit on the y axis.
ax.tick_params(axis="y", labelsize=7)
_ = ax.set_title("Mutual Information")

In [None]:
# Pairwise plots of the last ten entries of mi_scores together with the target.
g = sns.pairplot(X[mi_scores.index[-10:]].join(target[X.index]))

## Observations
* Some heteroskedasticity in the target vs num feature scatter plots.

# Conclusion
1. Drop columns `PoolQC, MiscFeature, Alley, Fence, MasVnrType`.
2. Impute `LotFrontage` at a minimum.
3. `OverallQual` is both the most correlated and highest MI feature. It looks somewhat bifurcated against SalePrice which might be from an interaction.
4. Might be worth log-transforming the target and positive numerical features.
5. Training and validation data sets don't seem to have any imbalances.
6. Probably should stratify validation on `Neighborhood`.