In [1]:
import pandas as pd

In [20]:
from sklearn.model_selection import train_test_split

In [5]:
%%bash
cd data/home-data-for-ml-course/
kaggle competitions download -c home-data-for-ml-course

sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)
test.csv: Skipping, found more recently modified local copy (use --force to force download)


## Read the data

In [18]:
X_full = pd.read_csv('data/home-data-for-ml-course/train.csv', index_col='Id')
X_test_full = pd.read_csv('data/home-data-for-ml-course/test.csv', index_col='Id')

## Remove rows with missing target, separate target from predictors

In [15]:
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

In [19]:
# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

In [21]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

## Step 1: Preliminary investigation

In [28]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(1168, 37)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


## Step 2: Drop columns with missing values

In [46]:
# Get names of columns with missing values
col_na = missing_val_count_by_column[missing_val_count_by_column > 0].index # Your code here

# Drop columns in training and validation data
reduced_X_train = X_train.drop(col_na, axis=1)
reduced_X_valid = X_valid.drop(col_na, axis=1)

## Categorical variables

Not all categorical variables have a clear ordering in the values, but we refer to those that do as ordinal variables. For tree-based models (like decision trees and random forests), **you can expect label encoding to work well with ordinal variables**.

In [None]:
# Drop columns with categorical data
drop_X_train = X_train.drop(X_train.columns[X.dtypes == 'object'], axis=1)

# or better
drop_X_train = X_train.select_dtypes(exclude=['object'])

In [None]:
# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply label encoder 
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

In [None]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

We refer to the number of unique entries of a categorical variable as the **cardinality** of that categorical variable.  For instance, the `'Street'` variable has cardinality 2.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

## Step 3: Imputation

In [51]:
from sklearn.impute import SimpleImputer

In [60]:
# Imputation
imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid))
imputed_X_test = pd.DataFrame(imputer.transform(X_test))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
imputed_X_test.columns = X_test.columns

ValueError: X has 36 features per sample, expected 37

## Step 4: Generate test predictions

In [53]:
from sklearn.ensemble import RandomForestRegressor

In [56]:
from sklearn.metrics import mean_absolute_error

In [57]:
# Define and fit model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(imputed_X_train, y_train)

# Get validation predictions and MAE
preds_valid = model.predict(imputed_X_valid)
print("MAE (Your appraoch):")
print(mean_absolute_error(y_valid, preds_valid))

MAE (Your appraoch):
887.4611643835614


In [59]:
# Save test predictions to file
output = pd.DataFrame({'Id': imputed_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('data/home-data-for-ml-course/submission.csv', index=False)

NameError: name 'imputed_X_test' is not defined