In [47]:
### First steps. All imports. Load train and test data. Remove missing target from train data, 
### separate target from predictors in train data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Read the data
X_full = pd.read_csv('/home/dina/input/train.csv', index_col='Id')
X_test_full = pd.read_csv('/home/dina/input/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

In [48]:
X_full.shape


(1460, 79)

In [49]:
# Decide on the approach to deal with missing values. Investigate empty values in full set of training data
missing_val_count_by_column = (X_full.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


Columns _Alley,PoolQC and MiscFeature_ are almost fully comprised of empty values. So dropping them will not have much impact on the result

In [50]:
X_full.drop(['Alley','PoolQC','MiscFeature'], axis=1, inplace=True)
X_test_full.drop(['Alley','PoolQC','MiscFeature'], axis=1, inplace=True)

Now impute missing values for the rest of the columns. Use _strategy=median_ for numerical data and _strategy=most_frequent_ for categorical data as it showed the best results in previous exercises

In [51]:
# Split into numerical and categorical data
X_num = X_full.select_dtypes(exclude=['object'])
X_cat = X_full.select_dtypes('object')

# Impute missing values
num_imputer=SimpleImputer(strategy="median") 
imputed_X_num = pd.DataFrame(num_imputer.fit_transform(X_num))

cat_imputer=SimpleImputer(strategy="most_frequent") 
imputed_X_cat = pd.DataFrame(cat_imputer.fit_transform(X_cat))

# Imputing took away column names. Put them back
imputed_X_num.columns=X_num.columns
imputed_X_cat.columns=X_cat.columns

# Merge imputed numerical and categorical data

imputed_X_full=pd.concat([imputed_X_num,imputed_X_cat], axis=1)
imputed_X_full.head()


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,Fence,SaleType,SaleCondition
0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,Typ,Gd,Attchd,RFn,TA,TA,Y,MnPrv,WD,Normal
1,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,Typ,TA,Attchd,RFn,TA,TA,Y,MnPrv,WD,Normal
2,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,Typ,TA,Attchd,RFn,TA,TA,Y,MnPrv,WD,Normal
3,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,Typ,Gd,Detchd,Unf,TA,TA,Y,MnPrv,WD,Abnorml
4,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,Typ,TA,Attchd,RFn,TA,TA,Y,MnPrv,WD,Normal


Now deal with categorical data. First, remove categorical columns with values that are not present in test data. Then encode categorical data. Use one-hot encoding for low-cardinality columns and ordinal encoding for high-cardinality columns to avoid adding too many records to the dataset

In [52]:
# Get all categorical columns
object_cols = [col for col in imputed_X_full.columns if imputed_X_full[col].dtype == "object"]
print(object_cols)
# Columns that can be safely encoded
good_label_cols = [col for col in object_cols if 
                   set(X_test_full[col]).issubset(set(imputed_X_full[col]))]
bad_label_cols = list(set(object_cols)-set(good_label_cols))
print(bad_label_cols)
reduced_X_full = imputed_X_full.drop(bad_label_cols, axis=1)
reduced_test = X_test_full.drop(bad_label_cols, axis=1)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(reduced_X_full, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
# Do same for training and validation data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
bad_label_cols = list(set(object_cols)-set(good_label_cols))

print(bad_label_cols)
# Remove "bad" columns for train data training and validation subsets and for test data
reduced_X_full.drop(bad_label_cols, axis=1, inplace=True)
reduced_test.drop(bad_label_cols, axis=1, inplace=True)

X_train.drop(bad_label_cols, axis=1, inplace=True)
X_valid.drop(bad_label_cols, axis=1, inplace=True)
#reduced_X_full.head()

object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]


# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be ordinal encoded
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

# Ordinal encode for train data training and validation subsets and for test data
ordinal_encoder = OrdinalEncoder()


reduced_X_full[high_cardinality_cols] = ordinal_encoder.fit_transform(reduced_X_full[high_cardinality_cols])
reduced_test[high_cardinality_cols] = ordinal_encoder.transform(reduced_test[high_cardinality_cols])
X_train[high_cardinality_cols] = ordinal_encoder.transform(X_train[high_cardinality_cols])
X_valid[high_cardinality_cols] = ordinal_encoder.transform(X_valid[high_cardinality_cols])

print(reduced_X_full.shape)
print(reduced_test.shape)
print(X_train.shape)
print(X_valid.shape)



['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition']
['GarageType', 'BsmtCond', 'GarageQual', 'MasVnrType', 'GarageFinish', 'GarageCond', 'KitchenQual', 'FireplaceQu', 'BsmtFinType2', 'BsmtQual', 'MSZoning', 'SaleType', 'Exterior1st', 'BsmtFinType1', 'Functional', 'Fence', 'Exterior2nd', 'BsmtExposure', 'Utilities']
['RoofMatl', 'Condition2']
(1460, 55)
(1459, 55)
(1168, 55)
(292, 55)


In [57]:
low_cardinality_cols
untouched_cols = set(X_valid.columns)-set(low_cardinality_cols)
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_full = pd.DataFrame(OH_encoder.fit_transform(reduced_X_full[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.fit_transform(reduced_test[low_cardinality_cols]))
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.fit_transform(X_valid[low_cardinality_cols]))


OH_cols_full.index = reduced_X_full.index
OH_cols_test.index = reduced_test.index
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

untouched_full = reduced_X_full.drop(low_cardinality_cols, axis=1)
untouched_test = reduced_test.drop(low_cardinality_cols, axis=1)
untouched_train = X_train.drop(low_cardinality_cols, axis=1)
untouched_valid = X_valid.drop(low_cardinality_cols, axis=1)

final_full = pd.concat([untouched_full, OH_cols_full], axis=1)
final_test = pd.concat([untouched_test, OH_cols_test], axis=1)
final_train = pd.concat([untouched_train, OH_cols_train], axis=1)
final_valid = pd.concat([untouched_valid, OH_cols_valid], axis=1)

print(final_full.shape)
print(final_test.shape)
print(final_train.shape)
print(final_valid.shape)

(1460, 125)
(1459, 121)
(1168, 125)
(292, 114)


In [None]:
# Best model
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
# This we learned by having 5 models compared. Only some fields were used for these models, 
# so might make sence to test models again on small samples after we have dealt with missing 
# and categorical values

from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]



In [None]:
# Here is how we compare models
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

In [None]:
# After we are dpne with everything, we train model on the full training dataset

# Fit the model to the training data
my_model.fit(X, y)

# Generate test predictions
preds_test = my_model.predict(X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)