# Machine Learning Regression Problem 

## Predicting Price of a House


# Using Decision Trees

In [1]:
import pandas as pd

# Load data
melbourne_file_path = '/home/gitika/My_Home/Python_ML_CNN/Regression/Melbourne_housing_FULL_with_missing.csv'
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# print a summary of the data in Melbourne data
melbourne_data.describe()

#read the title (features) of the data
print(melbourne_data.columns)

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [15]:
# Filter rows with missing price values
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Choose target and input features
y = filtered_melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]
X.head() 

2        2.0
4        3.0
11       3.0
14       2.0
18       2.0
        ... 
34847    3.0
34849    3.0
34853    2.0
34854    2.0
34856    2.0
Name: Rooms, Length: 8886, dtype: float64

In [129]:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
#train_X, val_X, train_y, val_Y = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

# Model Building

- __Define__ : type of model, here we'll use DecisionTreeRegressor model
- __Fit__ : Find patterns in the data
- __Predict__ : Use the test data to predict
- __Validate__ : Determine the accuracy of the models predictions using loss functions (here MAE is used)

In [20]:
from sklearn.tree import DecisionTreeRegressor
# Define model
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit model
melbourne_model.fit(train_X, train_y)

In [21]:
from sklearn.metrics import mean_absolute_error

train_predictions = melbourne_model.predict(train_X)
val_predictions = melbourne_model.predict(val_X)

train_error = mean_absolute_error(train_y, train_predictions)
val_error = mean_absolute_error(val_y, val_predictions)

print('Error on training set: ' "{:.2f}".format(train_error))
print('Error on validation set: ' "{:.2f}".format(val_error))

#Mean absolute error (MAE) = Average[Mod(y-y_prediction)]

Error on training set: 225.66
Error on validation set: 246229.42


 The decision tree model has many options, most important being the tree's depth, which is measure of how many splits it makes before coming to prediction. As the number of leaves (bottom most line) fewer houses are present in each leaf. This makes model overfit on the traing data and underfit on the test data.
![decision-tree.png](attachment:c49271ce-72a8-4765-92d5-4a33ec15498e.png)<!--  -->

![overfitting.png](attachment:7f5875fd-71a7-4e0d-aefc-44b2b5e1d5b4.png)

So, we need to find an optimal number of leaves (layers) in the model.


- __Overfitting:__  capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
- __Underfitting:__  failing to capture relevant patterns, again leading to less accurate predictions.


In [24]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)


In [25]:
# compare MAE with differing values of max_leaf_nodes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500,2000,5000]
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)

best_tree_size 

500

In [26]:
# Fit the model with best_tree_size. Fill in argument to make optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model
final_model.fit(X, y)
y_predictions_final = final_model.predict(X)
final_mae = mean_absolute_error(y, y_predictions_final)
final_mae

133101.7203189413

# Using Random Forests Model

Decision Tree models have inherent disadvantage that choosing deep tree models (lot of leaves) results in overfitting and shallow leaves with underfitted data.

This is where Random Forests are better because:
- Random forest uses many trees and makes predictions by averaging the prediction from each component tree.
- It has better predictive accuracy than single decision tree
- Works well with default parameters

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 177186.7634377009


## Choosing the Best Model

In [63]:
# Define several RandomForestRegressor models and score them
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]
print(len(train_X),len(train_y),len(val_X),len(val_y))
# Function for comparing different models
def score_model(model, X_t=train_X, X_v=val_X, y_t=train_y, y_v=val_y):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))


#ary = models.to_array()
#best_model = np.where(mae==np.min(mae.to_array())

6665 6665 2222 2222
Model 1 MAE: 179355
Model 2 MAE: 177195
Model 3 MAE: 177261
Model 4 MAE: 184390
Model 5 MAE: 213219


In [79]:
#looking at the MAE values choose the best model
best_model = model_2
my_model = best_model

In [95]:
# Fit the model to the training data
my_model.fit(train_X, train_y)

# Generate test predictions
preds_test = my_model.predict(val_X)

print(val_X.index, len(X))
# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': val_X.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)

Index([17359, 17097,  5265, 21286,  9450, 23809, 27724, 11376,  2235, 34372,
       ...
       23416,  1489,  6452, 10302, 11062, 19160,  4891, 15434,   724, 22917],
      dtype='int64', length=2222) 8887


# Handling Missing Value
In the code cells below I have implemented these three approaches. But before choosing any look at the data in hand.
- Approach 1: drop the columns with no value
- Approach 2: Imputation (replace missing value with some number; mean of column)
- Approach 3: Imputate and keep but track which rows values have been missing

In [94]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
# Load data
melbourne_file_path = '/home/gitika/My_Home/Python_ML_CNN/Regression/Melbourne_housing_FULL_with_missing.csv'
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# print a summary of the data in Melbourne data
melbourne_data.describe()

#read the title (features) of the data
print(melbourne_data.columns)

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [96]:
# Filter rows with missing price values
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Choose target and input features
y = filtered_melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
2,2.0,1.0,156.0,79.0,1900.0,-37.8079,144.9934
4,3.0,2.0,134.0,150.0,1900.0,-37.8093,144.9944
11,3.0,2.0,245.0,210.0,1910.0,-37.8024,144.9993
14,2.0,1.0,256.0,107.0,1890.0,-37.806,144.9954
18,2.0,1.0,220.0,75.0,1900.0,-37.801,144.9989


In [98]:
# Get names of columns with missing values
cols_with_missing = [col for col in melbourne_data.columns
                     if melbourne_data[col].isnull().any()]
print('Columns with missing values: {}'.format(cols_with_missing))

# Number of missing values in each column of training data
missing_val_count_by_column = (melbourne_data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Columns with missing values: ['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount']
Rooms                4
Price             7610
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64


In [100]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
#train_X, val_X, train_y, val_Y = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

In [102]:
from sklearn.ensemble import RandomForestRegressor
# Function for scoring each approach
def score_dataset(train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(train_X,train_y)
    preds = model.predict(val_X)
    return mean_absolute_error(val_y, preds)

In [104]:
#========= Approach 1: drop the columns=================================================
"""
In this approach the target feature should not have missing value. So in practice this approach can be used but if target feature has 
Nan then this code will not work. For this data set we have already deleted rows with Nan values. So all approaches will give same MAE. 
"""
# Get names of columns with missing values
cols_with_missing = [col for col in train_X.columns
                     if train_X[col].isnull().any()]
print('Columns with missing values: {}'.format(cols_with_missing))
# Drop columns in training and validation data
reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_val_X = val_X.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_train_X, reduced_val_X, train_y, val_y))





Columns with missing values: []
MAE from Approach 1 (Drop columns with missing values):
184616.92448244823


In [106]:
#======= Approach 2: imputation--> replace missing value with mean value along each column using SimpleImulater==================

from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_val_X = pd.DataFrame(my_imputer.transform(val_X))

#Imputation removed column names; put them back
imputed_train_X.columns = train_X.columns
imputed_val_X.columns = val_X.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_train_X, imputed_val_X, train_y, val_y))

MAE from Approach 2 (Imputation):
184616.92448244823


In [108]:
#======= Approach 3: imputation+missing column informations==================

# Make copy to avoid changing original data (when imputing)
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '_was_missing'] = val_X_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_train_X_plus = pd.DataFrame(my_imputer.fit_transform(train_X_plus))
imputed_val_X_plus = pd.DataFrame(my_imputer.transform(val_X_plus))

# Imputation removed column names; put them back
imputed_train_X_plus.columns = train_X_plus.columns
imputed_val_X_plus.columns = val_X_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y))


MAE from Approach 3 (An Extension to Imputation):
184616.92448244823


In [110]:
# Check the number of cols with missing value
# Shape of training data (num_rows, num_columns)
print(train_X.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (train_X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])


(6664, 7)
Series([], dtype: int64)
