## Decision Tree Regression Homework - Question

### Import libraries

In [None]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor # Import Decision Tree Regressor
from sklearn.model_selection import train_test_split 

#data pre-processing
from sklearn import preprocessing

#Decision tree visualization
from sklearn import tree

#module for accuracy calculation
from sklearn import metrics 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

#Decision tree visualization
from matplotlib import pyplot as plt

#table
from tabulate import tabulate

### Loading data 

In [None]:
# Read the dataset from melb_data.csv (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='melb_data.csv')
df.head(n=5)

## Data preprocessing

* Check for missing values
* Drop rows with empty cells

### Factors that determine the selling price of a house are:

* 'Rooms', 'Type', 'Bathroom', 'Landsize', 'YearBuilt', 'Lattitude', 'Longtitude'

Use a label encoder to transform the `Type` column to integers.

## Split data into features, target

In [None]:
# Names of the columns used as model inputs. The spellings 'Lattitude' and
# 'Longtitude' are kept exactly as written because they are used as column keys.
features = [
    'Rooms',
    'Type',
    'Bathroom',
    'Landsize',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
]

# Feature matrix: every row of df, restricted to the columns listed above.
x = df.loc[:, features]

In [None]:
# Target variable: the 'Price' column of df, taken as a Series.
y = df.loc[:, 'Price']

## Build decision tree regression model

* Split the data into 80% for training and 20% for testing, using `random_state=1`.

In [None]:
# Instantiate the decision tree regressor with a fixed seed (random_state=0).
regressor = DecisionTreeRegressor(random_state=0)

# Train on the training split. x_train and y_train are not created in this
# cell; they must already exist from the train/test split step.
regressor.fit(X=x_train, y=y_train)

### Making predictions using trained model

In [None]:
# Show predictions beside the true targets as a two-column array:
# column 0 holds the predicted values, column 1 the actual test values.
# y_pred and y_test are expected to exist from earlier cells.
predicted_col = y_pred.reshape(len(y_pred), 1)
actual_col = y_test.values.reshape(len(y_test), 1)
print(np.hstack((predicted_col, actual_col)))

## Visualize decision tree

In [None]:
# Create a 25x20 figure and draw the fitted tree on it. Nodes are
# colour-filled and labelled with the feature names.
# NOTE(review): the regressor above is built without a depth limit, so the
# rendered tree may be very large - confirm this is acceptable.
fig_Tree = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    decision_tree=regressor,
    feature_names=features,
    filled=True,
)

In [None]:
# Write the tree figure created above to decision_tree.png
# (path is relative to the current working directory).
fig_Tree.savefig(fname="decision_tree.png")

### Measuring the accuracy of the trained decision tree regressor model

In [None]:
# Evaluate the trained model on the held-out test split and show the metrics
# as a table. Expects x_test / y_test from the train/test split and y_pred
# from the prediction step; none of them are created in this cell.
data = [["r2_score", r2_score(y_test, y_pred)],
        ["mean_squared_error", mean_squared_error(y_test, y_pred)],
        ["mean_absolute_percentage_error", mean_absolute_percentage_error(y_test, y_pred)],
        # score() predicts on x_test itself and returns R^2 against the targets
        # it is given, so it must receive the TRUE targets (y_test). Passing
        # y_pred would compare the model's predictions with themselves and
        # report a meaningless perfect score.
        ["score", regressor.score(x_test, y_test)]]

# Header row: blank label column, then the model the values belong to.
col_names = [" ", "Trained Model"]

# Render the metric table as plain text.
print(tabulate(data, headers=col_names))

### Improving the Decision Tree Regressor's Accuracy by Tuning the Hyper-Parameters

Apply the `GridSearchCV` class to find the best hyperparameters using cross-validation.

In [None]:
# Exhaustive hyper-parameter search for the decision tree regressor using
# 5-fold cross-validation on the training split (x_train / y_train must
# already exist from the train/test split step).
from sklearn.model_selection import GridSearchCV

# Candidate values; GridSearchCV fits every combination of them.
params = {
    'criterion':  ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth':  [None, 2, 4, 6, 8, 10, 12, 14],
    # 'auto' is intentionally not listed: scikit-learn deprecated it in 1.1 and
    # removed it in 1.3, so every candidate using it fails to fit. For a
    # regressor it meant "use all features", which None already covers.
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],
    'splitter': ['best', 'random']
}

# NOTE: this rebinds `regressor` from the first fitted tree to the grid-search
# object. Earlier cells that use `regressor` (e.g. tree.plot_tree) should not
# be re-run after this cell without re-creating the original model first.
regressor = GridSearchCV(
    # Fixed seed so the search is reproducible: splitter='random' and
    # max_features < n_features both draw random numbers during fitting.
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1,
)

regressor.fit(x_train, y_train)

# Best parameter combination found by cross-validation.
print(regressor.best_params_)

#### Recreating the model using optimal parameters above, evaluating our model and checking the accuracy.

* Use `random_state=0`.

### Plotting the optimal decision tree

### Comparing the results of the two models

* Use `r2_score`, `mean_squared_error`, `mean_absolute_percentage_error`, and `score`.