In [1]:
# report which scikit-learn release this kernel is running
import sklearn
installed_sklearn_version = sklearn.__version__
print(installed_sklearn_version)

0.20.2


In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# read the Melbourne housing table from disk
melb_data = pd.read_csv('melb_data.csv')

# Price is the value to predict; every other column is a candidate predictor
melb_target = melb_data['Price']
melb_predictors = melb_data.drop(labels=['Price'], axis=1)

# leave out object-typed columns so only the numeric predictors remain for now
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

# hold out 30% of the rows for scoring; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    melb_numeric_predictors,
    melb_target,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# define a function to check mae
def score_dataset(X_train, X_test, y_train, y_test, random_state=0):
    """Fit a random forest on the training split and return its MAE on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Predictor matrices used for fitting and for evaluation, respectively.
    y_train, y_test : array-like
        Targets that correspond row-for-row to ``X_train`` and ``X_test``.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestRegressor``. A fixed seed makes repeated
        calls on the same data return the same score, so differences between
        data-preparation strategies are not masked by run-to-run noise.
        Pass ``None`` to get an unseeded forest.

    Returns
    -------
    float
        Mean absolute error of the forest's predictions for ``X_test``.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # scikit-learn metrics are documented as metric(y_true, y_pred); MAE is
    # symmetric, but keeping the documented order avoids surprises if the
    # metric is ever swapped for an asymmetric one.
    return mean_absolute_error(y_test, predictions)

# Columns that contain at least one missing value in the training split.
# Computed once and shared by case 1 and case 3 below.
cols_with_missing = [col for col in X_train.columns
                         if X_train[col].isnull().any()]

# case 1: drop the columns that have missing values
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print('Mean absolute error for dropping columns with missing values:')
print(round(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test)))

# case 2: fill missing values using imputation
# The imputer (default strategy: column mean) is fitted on the training split
# only and then applied unchanged to the test split.
my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print('Mean absolute error for filling missing values using imputation:')
print(round(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)))

# case 3: fill missing values using imputation and record what was imputed
# Work on copies so the indicator columns do not leak into X_train / X_test.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Add one boolean "<column>_was_missing" indicator per affected column
# before imputing, so the model can see which values were filled in.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# A fresh imputer is needed because the column set now includes the indicators.
my_imputer = SimpleImputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)
print('Mean absolute error for filling missing values using imputation and recording what was imputed:')
print(round(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test)))

Mean absolute error for dropping columns with missing values:




187115.0
Mean absolute error for filling missing values using imputation:




182532.0
Mean absolute error for filling missing values using imputation and recording what was imputed:




184363.0


In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# help(RandomForestRegressor)
# help(cross_val_score)

Help on function cross_val_score in module sklearn.model_selection._validation:

cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv='warn', n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score='raise-deprecating')
    Evaluate a score by cross-validation
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like
        The data to fit. Can be for example a list, or an array.
    
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    
    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentat

In [13]:
# class: include categorical data

# Read the data
import pandas as pd
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# A house without a SalePrice cannot be used for training, so discard those rows
train_data = train_data.dropna(axis=0, subset=['SalePrice'])

target = train_data['SalePrice']

# Missing values are not the topic of this tutorial, so take the bluntest
# approach: discard every column that has a gap in the training data.
# A more careful treatment is described at
# https://www.kaggle.com/dansbecker/handling-missing-values
missing_flags = train_data.isnull().any()
cols_with_missing = [name for name, has_gap in missing_flags.items() if has_gap]
candidate_train_predictors = train_data.drop(labels=['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = test_data.drop(labels=['Id'] + cols_with_missing, axis=1)

# "Cardinality" is the number of distinct values in a column.
# It is the only criterion used here to pick categorical columns: convenient,
# if somewhat arbitrary.
column_dtypes = candidate_train_predictors.dtypes
low_cardinality_cols = [
    cname for cname, dtype in column_dtypes.items()
    if dtype == "object" and candidate_train_predictors[cname].nunique() < 10
]
numeric_cols = [
    cname for cname, dtype in column_dtypes.items()
    if dtype in ['int64', 'float64']
]

# Categorical columns first, then numeric ones; the same selection is applied
# to both the training and the test predictors.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

# train_predictors.dtypes.sample(10)

# Expand each object column into one indicator column per category
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y, random_state=0):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    Parameters
    ----------
    X : array-like
        Predictor matrix.
    y : array-like
        Target values, one per row of ``X``.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestRegressor``. A fixed seed makes the
        score repeatable, so two predictor sets can be compared without
        run-to-run noise from the forest. Pass ``None`` for an unseeded forest.

    Returns
    -------
    float
        Mean of the per-fold MAE values, as a positive number.
    """
    forest = RandomForestRegressor(n_estimators=50, random_state=random_state)
    neg_mae_per_fold = cross_val_score(forest, X, y,
                                       scoring='neg_mean_absolute_error')
    # Multiply by -1 to get a positive MAE: by sklearn convention this scorer
    # returns the negated value.
    return -1 * neg_mae_per_fold.mean()

# Baseline: the same predictors with every object-typed column removed.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

# Score both variants with the same helper so the numbers are comparable.
mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# int() drops the fractional part of each score before printing.
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))



Mean Absolute Error when Dropping Categoricals: 18441
Mean Abslute Error with One-Hot Encoding: 17969


In [45]:
# Exercise:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

def get_mae(X, y, random_state=0):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    Parameters
    ----------
    X : array-like
        Predictor matrix.
    y : array-like
        Target values, one per row of ``X``.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestRegressor``. A fixed seed makes the
        score repeatable, so two predictor sets can be compared without
        run-to-run noise from the forest. Pass ``None`` for an unseeded forest.

    Returns
    -------
    float
        Mean of the per-fold MAE values, as a positive number.
    """
    forest = RandomForestRegressor(n_estimators=50, random_state=random_state)
    neg_mae_per_fold = cross_val_score(forest, X, y,
                                       scoring='neg_mean_absolute_error')
    # The 'neg_mean_absolute_error' scorer reports MAE negated, so flip the
    # sign to return a positive error.
    return -1 * neg_mae_per_fold.mean()
# load data
melb_data = pd.read_csv('melb_data.csv')

# Drop houses where the target is missing
melb_data.dropna(axis=0, subset=['Price'], inplace=True)

target = melb_data.Price
predictors = melb_data.drop(['Price'], axis=1)

# Separate numerical and categorical columns;
# only use categorical columns that have fewer than 10 unique values
num_cols = [col for col in predictors.columns
                if predictors[col].dtype in ['int64','float64']]
cat_cols = [col for col in predictors.columns
                if predictors[col].nunique() < 10 and
                   predictors[col].dtype == 'object']

# Fill numerical missing values with imputation and record what was missing

num_data = predictors[num_cols]
# Work on a copy so the indicator columns are not added to `predictors`.
imputed_num_data = num_data.copy()

cols_with_missing = [col for col in num_data.columns
                         if num_data[col].isnull().any()]

# One boolean "<column>_was_missing" indicator per column that has gaps,
# added before imputation so the information is not lost.
for col in cols_with_missing:
    imputed_num_data[col + '_was_missing'] = imputed_num_data[col].isnull()

# NOTE: from here on `imputed_num_data` is the array returned by the imputer,
# not a DataFrame, so column names are no longer attached.
my_impute = SimpleImputer()
imputed_num_data = my_impute.fit_transform(imputed_num_data)

# One hot encode categorical data

cat_data = predictors[cat_cols]
one_hot_encoded_cat_data = pd.get_dummies(cat_data)

# Put imputed numerical data and one hot encoded categorical data together.
# Both inputs have one row per house, so they are joined side by side (axis=1).
improved_predictors = np.concatenate((imputed_num_data, one_hot_encoded_cat_data), axis=1)

print('Mean absolute error for including only imputed numerical data:')
print(round(get_mae(imputed_num_data, target)))
print('Mean absolute error for including both imputed numerical data and one hot encoded categorical data:')
print(round(get_mae(improved_predictors, target)))

Mean absolute error for including only imputed numerical data:




196034.0
Mean absolute error for including both imputed numerical data and one hot encoded categorical data:




189420.0
