<a href="https://colab.research.google.com/github/ericyoc/ml_portfolio/blob/main/regression_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#regression trees

# Pandas will allow us to create a dataframe of the data
# so it can be used and manipulated
import pandas as pd
# Regression Tree Algorithm
from sklearn.tree import DecisionTreeRegressor
# Split our data into training and testing sets
from sklearn.model_selection import train_test_split

In [2]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [13]:
def _report_tree(tree, X_train, X_test, Y_train, Y_test):
    """Fit ``tree`` on the training split and print its test-set metrics.

    Prints two lines:
      1. the value of ``tree.score(X_test, Y_test)`` (the R^2 value), and
      2. "$" followed by the mean absolute prediction error times 1000.

    Args:
        tree: an unfitted regressor exposing ``fit``, ``score`` and ``predict``.
        X_train, Y_train: training features and target.
        X_test, Y_test: held-out features and target (``Y_test`` must support
            array subtraction yielding an object with ``.abs().mean()``).
    """
    tree.fit(X_train, Y_train)

    # Report the R^2 score explicitly; a bare expression inside a function is
    # silently discarded and never shown in the notebook output.
    print(tree.score(X_test, Y_test))

    prediction = tree.predict(X_test)
    # The factor of 1000 presumably converts MEDV (thousands of dollars) to
    # dollars -- TODO confirm against the dataset description.
    print("$", (prediction - Y_test).abs().mean() * 1000)


def main(data_path="ml_data/real_estate_data.csv", seed=1):
    """Train and evaluate two regression trees on the real-estate dataset.

    Loads the CSV, drops rows with missing values, splits 80/20 into training
    and testing sets, then fits one tree with the ``squared_error`` criterion
    and one with ``absolute_error``, printing R^2 and average error for each.

    Args:
        data_path: path of the CSV file to load; it must contain a ``MEDV``
            column, which is used as the prediction target.
        seed: random seed used for both the train/test split and the trees so
            that repeated runs print the same numbers.
    """
    # Load dataset
    raw = pd.read_csv(data_path)
    print(raw.head())

    print(raw.shape)
    # Most of the data is valid; some rows have missing values,
    # which are handled in pre-processing below.
    print(raw.isna().sum())

    # Data pre-processing: build a new frame instead of mutating in place so
    # the loaded data stays intact.
    data = raw.dropna()
    # The cleaned dataset should now report zero missing values per column.
    print(data.isna().sum())

    # Split the dataset into the features and the target we are predicting
    X = data.drop(columns=["MEDV"])
    Y = data["MEDV"]

    print(X.head())
    print(Y.head())

    # Split data into training and testing datasets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=.2, random_state=seed
    )

    # Regression tree using squared error. All other hyperparameters are left
    # at their scikit-learn defaults; random_state is fixed because features
    # are permuted at each split, so an unseeded tree can differ between runs.
    _report_tree(
        DecisionTreeRegressor(criterion='squared_error', random_state=seed),
        X_train, X_test, Y_train, Y_test,
    )

    # Regression tree using absolute error (MAE); report its R^2 value and
    # average error the same way.
    _report_tree(
        DecisionTreeRegressor(criterion='absolute_error', random_state=seed),
        X_train, X_test, Y_train, Y_test,
    )

In [14]:
# Entry-point guard: call main() only when __name__ is "__main__".
if __name__ == "__main__":
    main()

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

   LSTAT  MEDV  
0   4.98  24.0  
1   9.14  21.6  
2   4.03  34.7  
3   2.94  33.4  
4    NaN  36.2  
(506, 13)
CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
LSTAT      20
MEDV        0
dtype: int64
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
LSTAT      0
MEDV       0
dtype: int64
      CRIM    ZN  INDUS  CHAS 