In [1]:
"""
Students can follow this skeleton code to implement their decision-tree regression method.
They should fill in the TODO parts to make the code run.
"""
from random import seed
from random import randrange
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error 

In [2]:
# Read a CSV file into a NumPy array
def load_csv(filename, cols=None, header=None):
    """Load a CSV file and return its contents, minus the first column, as an array.

    Args:
        filename: path or file-like object handed to ``pd.read_csv``.
        cols: optional column names, forwarded as ``names``.
        header: header row specification, forwarded as ``header``.

    Returns:
        numpy.ndarray holding every column of the parsed frame except the first.
    """
    frame = pd.read_csv(filename, names=cols, header=header)

    # TODO: data cleansing - fill in missing values
    # TODO: outlier processing

    # the first parsed column is dropped before converting to an array
    return frame.iloc[:, 1:].to_numpy()

In [3]:
# Score predictions with the mean absolute error (an error measure, not a percentage)
def accuracy_metric(actual, predicted):
    """Return sklearn's ``mean_absolute_error`` between ``actual`` and ``predicted``."""
    mae = mean_absolute_error(actual, predicted)
    return mae

In [4]:
# Partition a dataset into two branches by thresholding one attribute
def branch_split(index, value, dataset):
    """Separate ``dataset`` into a left and a right branch.

    Args:
        index: position of the attribute to compare within each row.
        value: threshold the attribute is compared against.
        dataset: iterable of indexable rows.

    Returns:
        ``(left, right)`` lists: rows whose attribute is strictly greater than
        ``value`` go right, every other row goes left. Row order is preserved.
    """
    not_above, above = [], []
    for sample in dataset:
        bucket = above if sample[index] > value else not_above
        bucket.append(sample)
    return not_above, above

# Score a candidate split by how much it reduces the spread of the target values
def std_index(groups, total):
    """Return the reduction in summed squared deviation achieved by a split.

    The target is the last element of every row. For each set of targets the
    spread is ``np.var(values) * len(values)``, i.e. the sum of squared
    deviations from that set's mean. The score is the spread of all targets
    together minus the spread of the left and right groups taken separately.

    Args:
        groups: ``(left, right)`` pair of row sequences.
        total: number of rows in the parent dataset. Not needed by the
            computation; kept so existing callers keep working.

    Returns:
        float: the spread reduction, or ``0.`` when either group is empty
        (such a split separates nothing).
    """
    left_rows, right_rows = groups[0], groups[1]
    # check emptiness before touching any row so an empty branch cannot raise
    if len(left_rows) == 0 or len(right_rows) == 0:
        return 0.
    # take the target column of every row, not the last row of the group
    left = np.array([row[-1] for row in left_rows], dtype=float)
    right = np.array([row[-1] for row in right_rows], dtype=float)
    left_and_right = np.concatenate((left, right))
    total_std = np.var(left_and_right) * len(left_and_right)
    left_std = np.var(left) * len(left)
    right_std = np.var(right) * len(right)
    return float(total_std - (left_std + right_std))

# Select the best split point for a dataset
def get_split(dataset, c_idx=None):
    """Find the attribute/threshold pair with the highest ``std_index`` score.

    Every distinct value of every candidate column is tried as a threshold.

    Args:
        dataset: sequence of rows; the last element of each row is the target.
        c_idx: iterable of column indices that may be split on. ``None`` means
            every column except the last one.

    Returns:
        dict with keys ``'index'`` (chosen column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` pair from ``branch_split``). When no
        candidate scores above zero, ``'index'`` and ``'value'`` stay at ``-1``
        and ``'groups'`` is ``(all rows, [])`` so the caller can still build a
        leaf from the rows.
    """
    dt_length = len(dataset)
    # fallback result: an unsplit node that still carries its rows
    b_index, b_value, b_score, b_groups = -1, -1., 0., (list(dataset), [])
    if c_idx is None:
        c_idx = range(len(dataset[0]) - 1)
    for index in c_idx:
        tried = set()
        for row in dataset:
            value = row[index]
            # an already-tried threshold yields the same groups and score;
            # with the strict '>' below the first occurrence wins anyway
            if value in tried:
                continue
            tried.add(value)
            groups = branch_split(index, value, dataset)
            std_score = std_index(groups, dt_length)
            if std_score > b_score:
                # remember the new best score so later candidates must beat it
                b_index, b_value, b_score, b_groups = index, value, std_score, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

In [6]:
# Build the value stored in a leaf
# this is the prediction returned when a row reaches the leaf
def to_terminal(group):
    """Return the mean of the target (last element) over the rows in ``group``."""
    targets = []
    for sample in group:
        targets.append(sample[-1])
    # aggregation used for regression: average of the targets
    return np.mean(targets)
 
# Turn one side of a split into a leaf value or a recursively grown subtree
def _grow_child(rows, max_depth, min_size, depth, c_idx):
    # an explicitly empty feature list means there is nothing left to split on
    out_of_features = c_idx is not None and len(c_idx) == 0
    if len(rows) <= min_size or out_of_features:
        return to_terminal(rows)
    child = get_split(rows, c_idx)
    # no usable split reported for these rows: fall back to a leaf
    if child.get('groups') is None:
        return to_terminal(rows)
    split(child, max_depth, min_size, depth, c_idx)
    return child


# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth, c_idx=None):
    """Recursively expand ``node`` in place into subtrees and leaves.

    ``node['groups']`` is consumed and replaced by ``node['left']`` and
    ``node['right']``, each either a nested node dict or a leaf value from
    ``to_terminal``.

    Args:
        node: dict produced by ``get_split``.
        max_depth: depth at which both children are forced to be leaves.
        min_size: a side with at most this many rows becomes a leaf.
        depth: depth of ``node`` (the root is built with depth 1).
        c_idx: column indices still allowed for splitting. The column used by
            ``node`` is excluded for its descendants. ``None`` places no
            restriction, so every feature column stays available at every
            level. The list passed in is never modified.
    """
    # Build a fresh list instead of calling remove() on the shared one: the
    # caller's list and the sibling branch must keep their own candidates.
    if c_idx is None:
        remaining = None
    else:
        remaining = [idx for idx in c_idx if idx != node['index']]

    left, right = node.pop('groups')
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # the left subtree is completed before the right one is started
    node['left'] = _grow_child(left, max_depth, min_size, depth + 1, remaining)
    node['right'] = _grow_child(right, max_depth, min_size, depth + 1, remaining)

# Grow a full decision tree from the training rows
def build_tree(train, max_depth, min_size, c_idx=None):
    """Create the root split with ``get_split`` and expand it with ``split``.

    Returns the root node dict; expansion starts at depth 1.
    """
    tree = get_split(train, c_idx)
    split(tree, max_depth, min_size, depth=1, c_idx=c_idx)
    return tree
 
# Make a prediction with a decision tree
def predict(node, row):
    """Walk the tree from ``node`` down to a leaf and return the leaf value.

    Routing mirrors ``branch_split``: a row goes right only when its attribute
    is strictly greater than the node's threshold, otherwise left. Rows that
    sit exactly on the threshold therefore reach the same child they were
    grouped into during training.

    Args:
        node: node dict with ``'index'``, ``'value'``, ``'left'``, ``'right'``.
        row: indexable sequence of attribute values.

    Returns:
        The value stored in the leaf that ``row`` ends up in.
    """
    # children that are dicts are inner nodes; anything else is a leaf value
    while isinstance(node, dict):
        branch = 'right' if row[node['index']] > node['value'] else 'left'
        node = node[branch]
    return node

# Fit one tree on the training rows and predict every test row
def decision_tree(train, test, max_depth, min_size, c_idx=None):
    """Build a tree with ``build_tree`` and return a list of predictions.

    The returned list has one entry per row of ``test``, in the same order.
    """
    fitted = build_tree(train, max_depth, min_size, c_idx)
    return [predict(fitted, sample) for sample in test]

In [None]:
# random forest implementation
# 1. resample the train set for every tree
# 2. pick a random subset of features for every tree to split on
# 3. aggregate the per-tree predictions into the final result
def random_forest(n_trees, train_set, test_set, max_depth, min_size, n_features=None):
    """Train ``n_trees`` regression trees and average their predictions.

    Each tree is fitted on rows drawn with replacement from ``train_set`` and
    may only split on its own random subset of feature columns.

    Args:
        n_trees: number of trees to build (at least 1).
        train_set: sequence of training rows; the last element is the target.
        test_set: sequence of rows to predict.
        max_depth: forwarded to ``decision_tree``.
        min_size: forwarded to ``decision_tree``.
        n_features: number of feature columns offered to each tree. ``None``
            falls back to 5, the value that used to be hard-coded here. The
            count is capped at the number of feature columns available.

    Returns:
        numpy.ndarray with one averaged prediction per row of ``test_set``.

    Raises:
        ValueError: if ``n_trees`` is smaller than 1.
    """
    if n_trees < 1:
        raise ValueError("n_trees must be at least 1")

    n_rows = len(train_set)
    # every column except the last (the target) is a candidate feature
    total_features = len(train_set[0]) - 1
    if n_features is None:
        n_features = 5
    n_features = max(1, min(n_features, total_features))

    # local generator with a fixed seed: results are reproducible without
    # resetting NumPy's global random state on every call
    rng = np.random.default_rng(12)
    predictions = []
    for _ in range(n_trees):
        row_ids = rng.integers(0, n_rows, size=n_rows)
        sample = [train_set[i] for i in row_ids]
        # plain Python ints, drawn without repetition
        features = rng.choice(total_features, size=n_features, replace=False).tolist()
        predictions.append(decision_tree(sample, test_set, max_depth, min_size, features))
    # ensemble step: mean over trees for each test row
    return np.mean(predictions, axis=0)

In [None]:
# Load and prepare the data.
# The string below is a reference list of the dataset's columns; it is not used by the code.
"""
- CRIM     per capita crime rate by town
- ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS    proportion of non-retail business acres per town
- CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX      nitric oxides concentration (parts per 10 million)
- RM       average number of rooms per dwelling
- AGE      proportion of owner-occupied units built prior to 1940
- DIS      weighted distances to five Boston employment centres
- RAD      index of accessibility to radial highways
- TAX      full-value property-tax rate per $10,000
- PTRATIO  pupil-teacher ratio by town
- B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT    % lower status of the population
- MEDV     Median value of owner-occupied homes in $1000's
"""
# Column names handed to load_csv, which forwards them to pd.read_csv as `names`.
cols = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"]
# Choose one of: housing_100x.csv, housing_100x_missing.csv,
# housing_100x_outlier.csv, housing_100x_outlier_missing.csv
# housing_100x.csv: full data without missing values or outliers
# housing_100x_missing.csv: 5% of the attributes intentionally removed
# housing_100x_outlier.csv: 5% outlier samples intentionally added
# housing_100x_outlier_missing.csv: both missing attributes and outliers added
filename = 'data/housing_100x.csv' 
# header=0 is forwarded to pd.read_csv; load_csv drops the first parsed column
# and returns a NumPy array.
# NOTE(review): confirm the file has a leading index column, otherwise the
# first named column is the one being dropped.
dataset = load_csv(filename, cols=cols, header=0)

In [None]:
# Evaluate a single decision tree on a hold-out split.
# Disabled k-fold variant (evaluate_algorithm is not defined in the visible cells):
# n_folds = 5
max_depth = 5
min_size = 10
# scores = evaluate_algorithm(dataset, n_folds, max_depth, min_size)
# First 80% of the rows, in file order, form the train set; the rest is the test set.
train_length = int(len(dataset)*0.8)
train_set = dataset[:train_length,:]
test_set = dataset[train_length:,]
# No c_idx is passed, so decision_tree runs with c_idx=None.
predicted = decision_tree(train_set, test_set, max_depth, min_size)

In [None]:
# Number of trees in the forest.
n_trees = 10
# The last positional argument is n_features.
# This overwrites `predicted` from the single-tree cell.
predicted = random_forest(n_trees, train_set, test_set, max_depth, min_size, 5)

In [None]:
# The target is the last column of the test rows.
labels = test_set[:,-1]
# accuracy_metric returns the mean absolute error, so lower is better.
scores = accuracy_metric(labels, predicted)
print(scores)