In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from pathlib import Path
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p, boxcox
from scipy.stats import boxcox_normmax, yeojohnson
from sklearn.preprocessing import PowerTransformer

import missingno as msno

from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression

In [3]:
def load_raw_data(data_dir=None):
    """Load the untouched competition train and test splits.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``. When
            omitted, the Kaggle input directory of the House Prices
            competition is used, so existing zero-argument calls keep working.

    Returns:
        A ``(df_train, df_test)`` tuple of DataFrames, both indexed by ``Id``.
        Note that the second element is the *test* split, not a target vector.
    """
    if data_dir is None:
        data_dir = "../input/house-prices-advanced-regression-techniques/"
    data_dir = Path(data_dir)
    df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
    return df_train, df_test

# De-skewing (normalizing) target data

In [4]:
# NOTE: load_raw_data() returns (train frame, test frame). Despite its name,
# `y_train_r` therefore holds the raw *test* split, not a target vector; the
# target column `SalePrice` is still inside `X_train_r`.
X_train_r, y_train_r = load_raw_data()

In [5]:
count_features = ['BsmtHalfBath', 'BsmtFullBath', 'HalfBath', 'FullBath',
                  'BedroomAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars',
                  'OverallCond', 'OverallQual','ExterQual','ExterCond','BsmtQual','BsmtCond']

In [6]:
X_train_r.columns.tolist()

In [7]:
# Distribution of the raw target. `sns.distplot` is deprecated, so draw the
# same picture (histogram on a density scale with a KDE overlay) via `histplot`.
sns.histplot(X_train_r['SalePrice'], kde=True, stat="density")

print(f"skew: {X_train_r['SalePrice'].skew()}")
print(f"kurtosis: {X_train_r['SalePrice'].kurt()}")

In [8]:

# Log-transform the target to reduce its right skew, then re-check the shape
y_log = np.log(X_train_r['SalePrice'])

# `sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it
sns.histplot(y_log, kde=True, stat="density")
print(f"skew: {y_log.skew()}")
print(f"kurtosis: {y_log.kurt()}")

# Examining Missing Data

In [9]:

X_misses = X_train_r.columns[X_train_r.isnull().any()]
msno.matrix(X_train_r[X_misses], fontsize=12, labels=True)

In [11]:
missing_ct = X_train_r.isna().sum()
missing_ct.loc[missing_ct>0]

In [12]:
# Mutual Information


# Data Preparation

#-------------------------------
# Encode

# The numeric features are already encoded correctly (`float` for
# continuous, `int` for discrete), but the categoricals we'll need to
# do ourselves. Note in particular, that the `MSSubClass` feature is
# read as an `int` type, but is actually a (nominative) categorical.

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

# The ordinal (ordered) categorical features 

# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# OverallQual / OverallCond are rated on a 1..10 scale. `range(10)` stopped at
# 9, so a rating of 10 was not a declared level and the categorical cast in
# `encode` silently turned it into a missing value.
ten_levels = list(range(1, 11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Add a None level for missing values
# DH comment - this concatenates none - the pattern is simple and clever remember for the future
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}

# Create list to store all categorical features
features_ord = [key for key in ordered_levels.keys()]
features_cat = features_ord + features_nom

# DH Comment = the df[].cat.*     functions below are new. remember their usefulness
def encode(df, nominal=None, ordinal=None):
    """Cast the categorical columns of ``df`` to pandas categorical dtypes.

    Args:
        df: Frame to encode. Its columns are reassigned, so the caller's
            object is modified and also returned.
        nominal: Names of the unordered categorical columns. Defaults to the
            module-level ``features_nom``.
        ordinal: Mapping of column name to its list of levels, lowest first.
            Defaults to the module-level ``ordered_levels``.

    Returns:
        ``df`` with every nominal column as an unordered categorical that
        includes a ``"None"`` category, and every ordinal column as an ordered
        categorical with exactly the given levels (values outside the levels
        become missing).
    """
    if nominal is None:
        nominal = features_nom
    if ordinal is None:
        ordinal = ordered_levels

    # Nominal categories
    for name in nominal:
        df[name] = df[name].astype("category")
        # Add a "None" category for missing values. The result must be
        # assigned back: `add_categories` returns a new object, and its
        # `inplace=` flag no longer exists in pandas 2.x.
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    # Ordinal categories: the order of `levels` defines the category order
    for name, levels in ordinal.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))
    return df

def clean(df):
    """Fix known data-entry problems and awkward column names.

    The frame is modified in place and also returned.
    """
    # Normalise the one misspelled exterior covering label
    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    # Garage years that are not <= 2010 are replaced by the year the house
    # was built
    garage_year_ok = df["GarageYrBlt"] <= 2010
    df["GarageYrBlt"] = df["GarageYrBlt"].where(garage_year_ok, df["YearBuilt"])

    # Column names that start with a digit are awkward to work with
    digit_led_names = {
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }
    df.rename(columns=digit_led_names, inplace=True)
    return df

def impute_simple(df):
    """Fill missing values: 0 for numeric columns, "None" for categoricals.

    Columns are reassigned on ``df`` itself, which is also returned.
    """
    numeric_cols = df.select_dtypes("number").columns
    for col in numeric_cols:
        df[col] = df[col].fillna(0)

    categorical_cols = df.select_dtypes("category").columns
    for col in categorical_cols:
        df[col] = df[col].fillna("None")
    return df

# prepare Data

def load_data_combined(data_dir=None):
    """Load the train and test splits stacked into a single frame.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``. When
            omitted, the Kaggle input directory of the House Prices
            competition is used.

    Returns:
        One DataFrame indexed by ``Id`` with the train rows followed by the
        test rows. Columns present in only one split are missing for the
        rows of the other split.
    """
    if data_dir is None:
        data_dir = "../input/house-prices-advanced-regression-techniques/"
    data_dir = Path(data_dir)
    df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
    # Merge the splits so we can process them together
    return pd.concat([df_train, df_test])

def reform_train_test_split(df_combined, data_dir=None):
    """Split a combined frame back into its original train and test rows.

    Split membership is recovered from the ``Id`` column of the original CSV
    files, so ``df_combined`` must be indexed by those ids.

    Args:
        df_combined: Frame indexed by ``Id`` that contains every train and
            test id.
        data_dir: Directory containing ``train.csv`` and ``test.csv``. When
            omitted, the Kaggle input directory of the House Prices
            competition is used.

    Returns:
        A ``(df_train, df_test)`` tuple of row subsets of ``df_combined``.
    """
    if data_dir is None:
        data_dir = "../input/house-prices-advanced-regression-techniques/"
    data_dir = Path(data_dir)

    # Only the ids are needed here, so skip parsing every other column
    train_ids = pd.Index(pd.read_csv(data_dir / "train.csv", usecols=["Id"])["Id"], name="Id")
    test_ids = pd.Index(pd.read_csv(data_dir / "test.csv", usecols=["Id"])["Id"], name="Id")

    # Select each split's rows from the processed frame
    df_train = df_combined.loc[train_ids, :]
    df_test = df_combined.loc[test_ids, :]
    return df_train, df_test

def preprocess_data(df):
    """Run the preprocessing steps in order: clean, then encode, then impute."""
    return impute_simple(encode(clean(df)))
        
def load_and_preproc_data():
    """Load both splits, preprocess them together, and split them again.

    Returns:
        The ``(df_train, df_test)`` pair produced by
        ``reform_train_test_split``.
    """
    combined = preprocess_data(load_data_combined())
    train_part, test_part = reform_train_test_split(combined)
    return train_part, test_part

# Mutual Information
def make_mi_scores(X, y):
    """Score every feature by its mutual information with the target.

    Args:
        X: Feature frame; it is copied, so the caller's frame is untouched.
        y: Target values passed to ``mutual_info_regression``.

    Returns:
        A Series named "MI Scores", indexed by feature name and sorted from
        highest to lowest score.
    """
    features = X.copy()
    # Replace object / categorical columns by their integer factor codes
    for col in features.select_dtypes(["object", "category"]):
        codes, _ = features[col].factorize()
        features[col] = codes
    # Every integer-typed column is treated as a discrete feature
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    scores = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return scores.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores, sorted ascending."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")
    
# Look at the feature scores on the preprocessed training split
X, _ = load_and_preproc_data()
y = X.pop("SalePrice")

# `make_mi_scores` returns a Series (named "MI Scores"), which has no column
# labels; the former `mi_scores.columns = [...]` assignment only attached a
# stray attribute and has been removed.
mi_scores = make_mi_scores(X, y)
mi_scores.loc[mi_scores > 0.01].head(30)

In [13]:
def drop_uninformative(df, mi_scores):
    """Keep only the columns of ``df`` whose MI score is strictly positive."""
    is_informative = mi_scores > 0.0
    return df.loc[:, is_informative]

In [14]:
dfu = drop_uninformative(X, mi_scores)
dfu.columns

In [15]:

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas versions where `DataFrame.corr()` no
# longer drops string columns silently.
corr = X_train_r.select_dtypes("number").corr()
corr = corr.sort_values(["SalePrice"], ascending=False)
corr["SalePrice"]

In [16]:
sns.scatterplot(data=X_train_r, x='GrLivArea', y='SalePrice').set(title="GrLivArea")

In [17]:
sns.boxplot(data=X_train_r, x='OverallQual', y='SalePrice')

In [18]:
sns.scatterplot(data=X_train_r, x='TotalBsmtSF', y='SalePrice', hue='BsmtFinType1')

In [19]:
sns.boxplot(data=X_train_r, x='BsmtFinType1', y='SalePrice')

In [20]:
sns.boxplot(data=X_train_r, x='BsmtFinType2', y='SalePrice')

In [21]:
plt.figure(figsize=(15,8))
ax = sns.scatterplot(data=X_train_r, x='OverallQual', y='SalePrice').set(title='Overall Quality')

In [22]:
ngbr_order = X_train_r.groupby(by=['Neighborhood'])['SalePrice'].median()
ngbr_order.sort_values(inplace=True)
ngbr_order

In [23]:
plt.figure(figsize=(17,9))
ax = sns.boxplot(data=X_train_r, x='Neighborhood', y='SalePrice', order=ngbr_order.index)
ax.set(title="Neighborhood")
plt.xticks(rotation=45)

In [24]:
plt.figure(figsize=(15,8))
ax = sns.scatterplot(data=X_train_r, x='OverallQual', y='SalePrice')
ax.set(title="Overall Quality")

In [25]:
plt.figure(figsize=(15,8))
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ax = sns.boxplot(data=X_train_r, x='ExterQual', y='SalePrice', order=five_levels)
ax.set(title="Exterior Quality")

In [26]:
plt.figure(figsize=(15,8))
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ax = sns.boxplot(data=X_train_r, x='ExterCond', y='SalePrice', order=five_levels)
ax.set(title="Exterior Condition")

In [27]:
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a clustered heatmap of the correlations between numeric columns.

    Args:
        df: Source frame; non-numeric columns are ignored.
        method: Correlation method forwarded to ``DataFrame.corr``.
        annot: Forwarded to ``sns.clustermap`` as its ``annot`` argument.
        **kwargs: Any further keyword arguments for ``sns.clustermap``.
    """
    # Restrict to numeric columns so string columns cannot make `corr` fail
    numeric = df.select_dtypes("number")
    sns.clustermap(
        numeric.corr(method),
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",  # argument of clustermap itself, not the correlation method
        annot=annot,
        **kwargs,
    )
    
corrplot(X_train_r, annot=None)

In [29]:
df = X_train_r
# The 18 numeric columns most positively correlated with SalePrice (the target
# itself included). Non-numeric columns are excluded before `corr`, which
# cannot handle them on newer pandas.
corr_bar_vals = df.select_dtypes("number").corr('pearson')['SalePrice'].sort_values(ascending=False).head(18)
ax = corr_bar_vals.plot.barh(y='Feature', figsize=(15, 8))
# fig = px.bar(data= corr_bar_vals, x=data.index, y=)
# fig.show()

In [None]:
def normalize_numerics(df, skew_cutoff=0.5):
    '''
    Return a copy of ``df`` in which every numeric column whose skewness
    exceeds ``skew_cutoff`` has been Box-Cox transformed (``boxcox1p``), with
    the lambda chosen by ``boxcox_normmax`` on the column shifted by one.
    '''
    result = df.copy()
    numeric_cols = result.select_dtypes(["number"]).columns
    skewness = result[numeric_cols].apply(skew).sort_values(ascending=False)

    # Only columns skewed beyond the cutoff are transformed
    highly_skewed = skewness[skewness > skew_cutoff].index
    for col in highly_skewed:
        best_lambda = boxcox_normmax(result[col] + 1)
        result[col] = boxcox1p(result[col], best_lambda)
    return result

In [None]:
def find_skew_cols(df):
    """Return the skewness of every numeric column, largest first."""
    numeric = df.select_dtypes(["number"])
    per_column_skew = numeric.apply(skew)
    return per_column_skew.sort_values(ascending=False)

In [None]:
def de_skew_yeo_sk(df):
    """Power-transform the skewed, non-count numeric columns of ``df``.

    Columns whose skewness exceeds 0.5 are each fitted and transformed with a
    ``PowerTransformer(standardize=False)``. ``df`` is modified in place.

    Returns:
        ``(df, lambs)`` where ``lambs`` maps each transformed column name to
        the lambda fitted for it.
    """
    # Count-like / rating features are left untouched
    count_features = ['BsmtHalfBath', 'BsmtFullBath', 'HalfBath', 'FullBath',
                      'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
                      'OverallCond', 'OverallQual', 'ExterQual', 'ExterCond',
                      'BsmtQual', 'BsmtCond']

    transformer = PowerTransformer(standardize=False)
    skewness = find_skew_cols(df)
    candidates = skewness[skewness > 0.5].index.tolist()
    targets = [name for name in candidates if name not in count_features]

    lambs = {}
    for name in targets:
        # Refit on each column; the transformer then holds that column's lambda
        df[[name]] = transformer.fit_transform(df[[name]].values)
        lambs[name] = transformer.lambdas_[0]
    return df, lambs

In [None]:
# Round-trip check: transform GrLivArea with a fitted PowerTransformer, then
# invert one transformed value and compare it with the original.
# Note: `df_t` is the test split returned by load_raw_data(); it is unused here.
df_x, df_t = load_raw_data()
pt = PowerTransformer(standardize=False)

# Value for the row labelled 1, captured before the column is overwritten
orig_val = df_x.loc[1,['GrLivArea']]

# Fit on the column and replace it with the transformed values
df_x[['GrLivArea']] = pt.fit_transform(df_x[['GrLivArea']].values)
# Lambda(s) fitted by the transformer, kept for later reuse
liv_lam = pt.lambdas_

# Both names read the same transformed cell (row 1 of GrLivArea)
yj_val = df_x.loc[1,['GrLivArea']]
test_val = df_x.loc[1, ['GrLivArea']]


# Map the transformed value back with the fitted transformer
inv_val = pt.inverse_transform([test_val])
print(f"lambda: {liv_lam}")

print(f"orig_val {orig_val}")
print(f"test_val {test_val}")
print(f"inv_val {int(inv_val[0][0])}")

# Write the inverted value back into row 1 and display it
df_x.loc[1,['GrLivArea']] = inv_val[0][0]
df_x.loc[1,['GrLivArea']]



In [None]:
def modify_yeoj_feat_val(new_val, yj_lamb):
    """Transform a single raw value using previously fitted lambdas.

    A fresh ``PowerTransformer(standardize=False)`` is given ``yj_lamb`` as
    its ``lambdas_`` instead of being fitted, then applied to ``new_val``.
    """
    transformer = PowerTransformer(standardize=False)
    transformer.lambdas_ = list(yj_lamb)
    transformed = transformer.transform([[new_val]])
    return transformed[0][0]
    
def inv_transf_yeoj(yj_val, yj_lamb):
    """Map a single transformed value back using previously fitted lambdas.

    A fresh ``PowerTransformer(standardize=False)`` is given ``yj_lamb`` as
    its ``lambdas_`` instead of being fitted, then inverted on ``yj_val``.
    """
    transformer = PowerTransformer(standardize=False)
    transformer.lambdas_ = list(yj_lamb)
    restored = transformer.inverse_transform([[yj_val]])
    return restored[0][0]

In [None]:
# ---

# new_val = 1500
# print(type(new_val))
# yj_val = modify_yeoj_feat_val(new_val, liv_lam)
# print(yj_val)

# verify_val = round(inv_transf_yeoj(yj_val, liv_lam))
# print(type(round(verify_val)))
# print(f"new value: {new_val}")
# print(f"confirm new value: {verify_val}")
# new_val == verify_val

In [None]:
def de_skew_yeo(df):
    """Apply scipy's ``yeojohnson`` to the skewed, non-count numeric columns.

    Columns whose skewness exceeds 0.5 are overwritten on ``df`` itself with
    their transformed values; ``df`` is also returned.
    """
    # Count-like / rating features are left untouched
    count_features = ['BsmtHalfBath', 'BsmtFullBath', 'HalfBath', 'FullBath',
                      'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
                      'OverallCond', 'OverallQual', 'ExterQual', 'ExterCond',
                      'BsmtQual', 'BsmtCond']

    skewness = find_skew_cols(df)
    candidates = skewness[skewness > 0.5].index.tolist()
    for name in candidates:
        if name in count_features:
            continue
        # yeojohnson returns (transformed values, fitted lambda); keep the values
        transformed, _ = yeojohnson(df[name])
        df[name] = transformed
    return df

In [None]:
# Reload the raw data (the second value is the test split) and inspect LotArea
X_train_r, y_train_r = load_raw_data()
s='LotArea'
col = X_train_r[s]
# `sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it
sns.histplot(col, kde=True, stat="density")
#set(X_train_r[s])

In [None]:
# col2 = boxcox(col, boxcox_normmax(col))
# yeojohnson returns (transformed values, fitted lambda)
col2 = yeojohnson(col)
# `sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it
sns.histplot(col2[0], kde=True, stat="density")

In [None]:
print(f" pre: {col.skew()}")
print(f" post: {pd.Series(col2[0]).skew()}")

In [None]:
# df_train, df_test = load_and_preproc_data()

skews = find_skew_cols(X_train_r)
skews
# skews_o = find_skew_cols(df_train)

# test_skew_cols = [ col for col in skews_o.index.tolist() if col in skews.index.tolist()]
# pd.DataFrame([skews[test_skew_cols], skews_o[test_skew_cols]])
#display(skews[skews > 1.0])
skew_check = skews[skews > 0.5].index.tolist()
skew_check = [f for f in skew_check if f not in count_features]
skew_check

In [None]:
# Verify skewness correction: record each column's skew before and after the
# Yeo-Johnson transform, starting from freshly loaded raw data.
X_train_r, y_train_r = load_raw_data()

pre_adj = []
post_adj = []

for s in skew_check:
    pre_adj.append(X_train_r[s].skew())
    X_train_r[s] = yeojohnson(X_train_r[s])[0]
    post_adj.append(X_train_r[s].skew())

skew_data = {'pre_adj': pre_adj, 'post_adj': post_adj}
# The index is set by the constructor. The former `skew_df.set_index = ...`
# line overwrote the `set_index` method with a list instead of setting anything.
skew_df = pd.DataFrame(data=skew_data, index=skew_check)
# `sort_values` returns a new frame, so the result has to be kept
skew_df = skew_df.sort_values(by='post_adj', ascending=False)

skew_df

# examining which features are still skewed
# still_skewed = skew_df.loc[skew_df['post_adj'] > 0.5].index
# skew_df.loc[skew_df['post_adj'] > 0.5]

In [None]:
# X_train_r, y_train_r = load_raw_data()
X_train_r['TotalBsmtSF']

In [None]:
# verify skewness correction
# X_train_r, y_train_r = load_raw_data()

# pre_adj = []
# post_adj = []

# for s in skew_check: 
#     pre_adj.append(X_train_r[s].skew())
#     X_train_r[s] = de_skew_yeo(X_train_r[s])
#     post_adj.append(X_train_r[s].skew())

# skew_data = {'pre_adj': pre_adj, 'post_adj': post_adj}
# skew_df = pd.DataFrame(data=skew_data, index = skew_check)
# skew_df.set_index=skew_check
# skew_df.columns=['pre_adj', 'post_adj']
# skew_df.sort_values(by='post_adj',ascending=False)

# skew_df

# examining which features are still skewed
# still_skewed = skew_df.loc[skew_df['post_adj'] > 0.5].index
# skew_df.loc[skew_df['post_adj'] > 0.5]

In [None]:
still_skewed = skew_df[skew_df['post_adj']>0.5]
still_skewed = ['Threeseasonporch' if noob=='3SsnPorch' else noob for noob in still_skewed.index.tolist()]

mi_scores.loc[still_skewed]

In [None]:
# PoolArea values of the houses that actually have a pool
df.loc[df['PoolArea'] > 0, 'PoolArea']

# In the original training set there are only 7 houses with pools, all of comparable size.
For this reason the `PoolArea` feature is removed and a new feature, `HasPool`, is added in its place.

In [None]:
# --------------
# new Features

from pandas.api.types import CategoricalDtype

# MiscVal -> drop (mi score = 0)
# X_train.drop(['MiscVal'], axis=1, inplace=True) # this is actually converted to a nominal variable already

# Pool Area -> has pool
def clean_pool(X):
    """Replace the ``PoolArea`` column with an ordered 0/1 ``HasPool`` flag.

    The previous version dropped ``PoolArea`` and then cast a ``HasPool``
    column that was never created, which raised ``KeyError`` on the raw data
    after already mutating the caller's frame.

    Args:
        X: Frame with a ``PoolArea`` column. It is not modified. If it already
            has a ``HasPool`` column, that column is kept and only cast.

    Returns:
        A new frame without ``PoolArea`` and with ``HasPool`` as an ordered
        categorical over ``[0, 1]`` (1 where ``PoolArea`` is greater than 0).
    """
    cat_type_01 = CategoricalDtype(categories=[0, 1], ordered=True)
    if "HasPool" in X.columns:
        has_pool = X["HasPool"]
    else:
        # Derive the flag before the source column is removed
        has_pool = (X["PoolArea"] > 0).astype(int)
    X = X.drop(columns=["PoolArea"])
    X["HasPool"] = has_pool.astype(cat_type_01)
    return X

---
Basic Basement Data

In [None]:
df, df_t = load_raw_data()

---
# For Reference
---

### explore values for finished basement features

In [None]:
df, df_t = load_raw_data()

In [None]:
set(df['BsmtFinType1'])


In [None]:
# Fresh copy of the raw training data (the second value is the test split)
X_train_bs, y_train_bs = load_raw_data()

df_bs = X_train_bs.copy()

# Basement-related columns (quality grade, square footage, bath counts) plus
# the target
basement_qnt_cols = [ 'BsmtQual',
                     'BsmtFinSF1',
                     'BsmtFinSF2',
                     'BsmtUnfSF',
                     'TotalBsmtSF',
                     'BsmtFullBath',
                    'BsmtHalfBath',
                    'SalePrice']

# replace all nan values with 0
df_bs = df_bs[basement_qnt_cols].fillna(0)

df_bs.info()

# Keep only the houses that report any basement area at all
df_bs = df_bs[
                 (df_bs['BsmtFinSF1'] > 0) |
                 (df_bs['BsmtFinSF2']> 0) |
                 (df_bs['BsmtUnfSF'] > 0) |
                 (df_bs['TotalBsmtSF'] > 0)
                ]

# no_bs = df_bs[not(
#                  (df_bs['BsmtFinSF1'] > 0) | 
#                  (df_bs['BsmtFinSF2']> 0) |
#                  (df_bs['BsmtUnfSF'] > 0) |
#                  (df_bs['TotalBsmtSF'] > 0))
#                 ]

# The filter above is expected to keep exactly the rows with TotalBsmtSF > 0,
# i.e. the total-square-feet condition alone is sufficient. It also guarantees
# a non-zero denominator for the percentage below, so check it explicitly
# instead of discarding the result.
assert df_bs.equals(df_bs[df_bs['TotalBsmtSF'] > 0]), "rows with basement area but TotalBsmtSF == 0"
df_bs['BsmtPcntUnf'] = (df_bs['BsmtUnfSF'] / df_bs['TotalBsmtSF'])*100

min_vals = [30, 120, 300, 500, 1000]

for min_val in min_vals:
    print(f" number of homes with more than {min_val} sq.ft. unfinished in the basement: {len(df_bs[df_bs['BsmtUnfSF'] > min_val])}")

# Share of all training houses with more than 1000 sq. ft. unfinished,
# computed from the data rather than from hard-coded counts
n_homes = len(X_train_bs)
n_over_1000 = len(df_bs[df_bs['BsmtUnfSF'] > 1000])
print(f"{(n_over_1000/n_homes)*100} percent of houses in this dataset have more than 1000 sq. ft. unfinished in the basement")

In [None]:
df_bs[ (df_bs['BsmtUnfSF']> 1000) & (df_bs['BsmtQual'].isin(['Gd', 'Ex']))]

In [None]:
remod_elig = df_bs[(df_bs['BsmtUnfSF'] > 1000) &
      (df_bs['BsmtPcntUnf'] > 95) & 
      (df_bs['BsmtQual'].isin(['Gd', 'Ex']))
     ]

set(remod_elig['BsmtQual'])
display(remod_elig)

houses_for_bsmt_remod = remod_elig.index
print(houses_for_bsmt_remod.tolist())