In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of `model` on `X` and `y`.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns of dtype "category" or "object" are replaced
        by integer codes on an internal copy, so the caller's frame is left
        untouched.
    y
        Target values, passed unchanged to `cross_val_score`.
    model : estimator, optional
        Estimator to evaluate. When omitted, a new `XGBRegressor()` is
        created for this call.

    Returns
    -------
    float
        Square root of the mean, over the 5 folds, of the mean squared
        log error.
    """
    if model is None:
        # Build the default per call rather than once at definition time.
        model = XGBRegressor()
    # Work on a copy: the encoding below would otherwise overwrite the
    # caller's categorical columns with integer codes.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # The scorer reports the negated MSLE; flip the sign before the root.
    return np.sqrt(-1 * score.mean())


# Load the data, then separate the "SalePrice" target from the features.
# `df` itself keeps every column, including the target.
df = pd.read_csv("datasets/ames.csv")
y = df["SalePrice"].copy()
X = df.drop(columns="SalePrice")

# 1) Create Mathematical Transforms

In [2]:
# New features computed arithmetically from existing columns of X:
#   LivLotRatio    = GrLivArea / LotArea
#   Spaciousness   = (FirstFlrSF + SecondFlrSF) / TotRmsAbvGrd
#   TotalOutsideSF = sum of the deck and porch area columns
X_1 = pd.DataFrame({
    "LivLotRatio": X["GrLivArea"].div(X["LotArea"]),
    "Spaciousness": X["FirstFlrSF"].add(X["SecondFlrSF"]).div(X["TotRmsAbvGrd"]),
    "TotalOutsideSF": (
        X["WoodDeckSF"]
        .add(X["OpenPorchSF"])
        .add(X["EnclosedPorch"])
        .add(X["Threeseasonporch"])
        .add(X["ScreenPorch"])
    ),
})

# 2) Interaction with a Categorical

In [3]:
# Interaction of BldgType with GrLivArea: one-hot encode BldgType into
# "Bldg_*" indicator columns, then multiply every indicator row-wise
# (axis=0 aligns on the row index) by GrLivArea.
X_2 = pd.get_dummies(X["BldgType"], prefix="Bldg").mul(X["GrLivArea"], axis=0)

# 3) Count Feature

In [4]:
# Deck and porch area columns; a value greater than 0 counts as "present".
features = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]

# PorchTypes = number of the columns above that are positive in each row.
X_3 = pd.DataFrame({"PorchTypes": (X[features] > 0).sum(axis=1)})

# 4) Break Down a Categorical Feature

In [5]:
# Show the distinct MSSubClass labels before splitting them.
df["MSSubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [6]:
# MSClass = the part of each MSSubClass label before its first underscore.
# Split at most once, keep the first piece, and label that column "MSClass".
X_4 = (
    X["MSSubClass"]
    .str.split("_", n=1, expand=True)
    .iloc[:, [0]]
    .set_axis(["MSClass"], axis=1)
)

# 5) Use a Grouped Transform

In [7]:
# MedNhbdArea = median GrLivArea of the row's Neighborhood; `transform`
# broadcasts the per-group median back onto every row of that group.
X_5 = pd.DataFrame({
    "MedNhbdArea": X["GrLivArea"].groupby(X["Neighborhood"]).transform("median"),
})

In [8]:
# Attach every engineered feature frame to the original features (joined on
# the row index), then display the cross-validated score of the result.
X_new = X.join(
    other=[
        X_1,
        X_2,
        X_3,
        X_4,
        X_5,
    ]
)
score_dataset(X_new, y)

0.13954039591355258