In [36]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [37]:
# Location of the housing CSV file.
# NOTE(review): this is an absolute, machine-specific path — the cell only runs where that file exists.
houses_file_path = '/Users/dariavasylieva/PycharmProjects/TrainingGround/Housing.csv'
# Read the CSV into a DataFrame
melbourne_data = pd.read_csv(houses_file_path)
# Show summary statistics (count, mean, std, quartiles) for the numeric columns
melbourne_data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


Select data for modeling and examine available columns

In [38]:
# Display the table (pandas abbreviates the middle rows) to see every available column
melbourne_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


Select Prediction Target

In [39]:
# Prediction target: the 'price' column
y = melbourne_data['price']

Select features

In [40]:
# Columns used as model inputs (all four are numeric in the summary above)
melbourne_features = ["area", "bathrooms", "bedrooms", "parking"]
X = melbourne_data[melbourne_features]
# Summary statistics of the selected features
print(X.describe())
# First few rows of the selected features
print(X.head())

               area   bathrooms    bedrooms     parking
count    545.000000  545.000000  545.000000  545.000000
mean    5150.541284    1.286239    2.965138    0.693578
std     2170.141023    0.502470    0.738064    0.861586
min     1650.000000    1.000000    1.000000    0.000000
25%     3600.000000    1.000000    2.000000    0.000000
50%     4600.000000    1.000000    3.000000    0.000000
75%     6360.000000    2.000000    3.000000    1.000000
max    16200.000000    4.000000    6.000000    3.000000
   area  bathrooms  bedrooms  parking
0  7420          2         4        2
1  8960          4         4        3
2  9960          2         3        2
3  7500          2         4        3
4  7420          1         4        2


Split data for training and evaluation

In [41]:
# Split features and target into a training part and a validation part;
# random_state=0 makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# Define the model. A fixed random_state gives the same results on every run
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit on the training part only, so val_X / val_y stay unseen by the model
melbourne_model.fit(train_X, train_y)

Make predictions and evaluate the model

In [42]:
# Predict prices for the validation rows and show them
print("The predictions are:")
val_predictions = melbourne_model.predict(val_X)
print(val_predictions)

# Print the true validation targets (val_y) for reference, then the mean absolute
# error between them and the predictions — the last printed value is the MAE
print("MAE - Mean Absolute Error. val_y is data not used for training - it is data for reference:")
print(val_y)
print("Spoiler Alert: the error is HUGE! Data which model sees for the first time discovered that our model prediction skill is shit... for now.")
print(mean_absolute_error(val_y, val_predictions))

The predictions are:
[ 2730000.          3150000.          4354000.          5600000.
  3234000.          7840000.          5530000.          3500000.
  4515000.          7245000.          6867000.          2800000.
  4354000.          5243000.          6946333.33333333  1933575.
  2450000.          6946333.33333333  3010000.          3587500.
  9240000.          8855000.          5670000.          4340000.
  5600000.          3885000.          8400000.          2940000.
  2660000.          5600000.          2590000.          4200000.
  6090000.          4095000.          3220000.          6419000.
  7962500.          3465000.          3745000.          3447500.
  8575000.          9800000.          9800000.          4200000.
  3850000.          6867000.          4830000.          3566500.
  5565000.          4200000.          3395000.         12250000.
  4340000.          3045000.          2940000.          3675000.
  9240000.          5600000.          3675000.          5523000.
  58

Define a function to compare MAE scores for different numbers of leaf nodes

In [43]:
# Helper for tuning tree size: scoring several leaf-node limits shows where the
# model sits between underfitting and overfitting
def get_mae(the_max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to a given number of leaves.

    A DecisionTreeRegressor with ``max_leaf_nodes=the_max_leaf_nodes`` and
    ``random_state=1`` is fit on (train_X, train_y); its predictions for val_X
    are then scored against val_y with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=the_max_leaf_nodes, random_state=1)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

Try different numbers of leaf nodes to optimize the model

In [44]:
# Score several tree sizes on the validation data to see which gives the smallest error
print("Get MAE for different values for nodes:")
candidate_leaf_nodes = [2, 5, 10, 20, 100]
for max_leaf_nodes in candidate_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the float MAE to a whole number for display
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Get MAE for different values for nodes:
Max leaf nodes: 2  		 Mean Absolute Error:  1128897
Max leaf nodes: 5  		 Mean Absolute Error:  1054973
Max leaf nodes: 10  		 Mean Absolute Error:  961397
Max leaf nodes: 20  		 Mean Absolute Error:  1037581
Max leaf nodes: 100  		 Mean Absolute Error:  1173952


Create an improved model with the optimal number of leaves

In [45]:
# Pick the tree size with the lowest validation MAE instead of hard-coding it.
# (The old comment said 5 leaf nodes while the code used 10; for the sizes
# tried here the minimum is at 10.)
candidate_leaf_nodes = [2, 5, 10, 20, 100]
mae_by_leaf_nodes = {
    n_leaves: get_mae(n_leaves, train_X, val_X, train_y, val_y)
    for n_leaves in candidate_leaf_nodes
}
best_leaf_nodes = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)

# Refit with the selected size and report its validation error
better_model = DecisionTreeRegressor(max_leaf_nodes=best_leaf_nodes, random_state=1)
better_model.fit(train_X, train_y)
predict_values = better_model.predict(val_X)
lower_mae = mean_absolute_error(val_y, predict_values)
print("The lowest achieved MAE:")
print(lower_mae)

The lowest achieved MAE:
961397.8017401386


HANDLING THE MISSING DATA

In [48]:
# Load the training and test tables, using the 'Id' column as the index
X_full = pd.read_csv('home-data-for-ml-course/train.csv', index_col='Id')
X_test_full = pd.read_csv('home-data-for-ml-course/test.csv', index_col='Id')

# Keep only rows that have a target value, then split the target off from the predictors.
# NOTE: this rebinds X and y, which held the housing features/target in the earlier section.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep only non-object columns (the numerical predictors) to keep things simple
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; random_state=0 makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

Observe the data

In [49]:
# Peek at the first rows of the training split
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,774,0,108,0,0,260,0,0,7,2007
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,308,0,0,0,0,0,0,0,8,2009
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,432,0,0,44,0,0,0,0,8,2009
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,857,150,59,0,0,0,0,0,7,2008
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,843,468,81,0,0,0,0,0,1,2006


Preliminary investigation

In [50]:
# (rows, columns) of the training split
X_train.shape

(1168, 36)

In [52]:
# Count missing entries per training column, then show only the columns that have any
missing_val_count_by_col = X_train.isna().sum()
missing_val_count_by_col.loc[missing_val_count_by_col > 0]

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64

Define a function that scores each approach to handling missing data, so we can find out which one works best for this dataset

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a small random forest fit on the given data.

    A RandomForestRegressor with ``n_estimators=10`` and ``random_state=0`` is
    fit on (X_train, y_train); its predictions for X_valid are scored against
    y_valid with mean_absolute_error. The fixed settings keep scores comparable
    between differently prepared versions of the same data.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

Approach 1 - Drop columns with missing values

In [66]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so they keep identical feature sets
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

In [67]:
# Score the data with the incomplete columns removed
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE (Drop columns with missing values):
18866.728767123288


Testing Imputation approach

In [61]:
from sklearn.impute import SimpleImputer

# Impute with SimpleImputer's default settings: the fill values are learned from
# the training split only and then reused unchanged on the validation split.
# The imputer output carries no column names, so they are re-attached when
# wrapping the result in a DataFrame.
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid), columns=X_valid.columns)

In [62]:
# Score the data with missing values filled in by the imputer
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
18062.894611872147


An Extension to Imputation

In [63]:
# Work on copies so the indicator columns are not added to X_train / X_valid
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For each column with gaps, add a boolean flag marking the rows that will be imputed
for col in cols_with_missing:
    flag_name = col + '_was_missing'
    X_train_plus[flag_name] = X_train_plus[col].isnull()
    X_valid_plus[flag_name] = X_valid_plus[col].isnull()

# Impute: fit on the training copy, reuse the fitted imputer on the validation copy.
# Column names are re-attached because the imputer output does not carry them.
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus), columns=X_train_plus.columns
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus), columns=X_valid_plus.columns
)

In [64]:
# Score the imputed data that also carries the '_was_missing' indicator columns
print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An Extension to Imputation):
18148.417180365297


Generating test predictions using Imputation

In [68]:
# Use the plain-imputation frames for the final model
# (these are new names for the same objects, not copies)
final_X_train = imputed_X_train
final_X_valid = imputed_X_valid

Define and fit model

In [69]:
# Random forest with 100 trees (score_dataset uses 10); random_state=0 for repeatable results
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

Get prediction and MAE

In [70]:
# Predict on the imputed validation features (final_X_valid), not on the test set
my_predictions = model.predict(final_X_valid)

In [71]:
# Validation MAE of the final model.
# NOTE(review): the recorded result (18062.894611872147) is identical to the score
# printed for Approach 2, which comes from the 10-tree forest in score_dataset,
# although `model` uses 100 trees — the saved output may be stale; re-run to confirm.
mean_absolute_error(y_valid, my_predictions)

18062.894611872147