# Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Helper Functions

In [2]:
def create_leaf(y, ml_task):
    
    if ml_task == "regression":
        leaf = float(np.mean(y))
    else:
        counts = y.value_counts().reset_index()
        leaf = counts.iloc[0,0]
    
    return leaf


def get_potential_splits(data):
    
    X = data.drop(columns='target')
    potential_splits = {}
    columns = X.columns.tolist()
    for column in columns:

        values = X[[column]]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values - 1
    
    return potential_splits


def calculate_gini(y):
    
    counts = y.value_counts().to_numpy()
    probabilities = counts / counts.sum()
    gini = np.sum(probabilities*(1-probabilities))
     
    return gini


def calculate_mse(y):
    
    if len(y) == 0:
        mse = 0
    else:
        mse = np.mean((y - np.mean(y)) **2)
    
    return mse


def total_impurity(data_left, data_right, metric_function):\

    n = len(data_left) + len(data_right)
    prop_left = len(data_left) / n
    prop_right = len(data_right) / n

    overall_metric =  (prop_left * metric_function(data_left['target']) 
                     + prop_right * metric_function(data_right['target']))
    
    return overall_metric


def split_data(data, column_types, split_column, split_value):
    
    type_of_feature = column_types[split_column]

    if type_of_feature == "continuous":
        data_left = data[data[split_column] <= split_value]
        data_right = data[data[split_column] >  split_value]
    
    else:
        data_left = data[data[split_column] == split_value]
        data_right = data[data[split_column] != split_value]
    
    return data_left, data_right


def determine_best_split(data, column_types, potential_splits, ml_task):

    best_overall_metric = np.inf
    for column, splits in potential_splits.items():
        for split in splits:
            
            data_left, data_right = split_data(data, column_types, split_column=column, split_value=split)
            
            if ml_task == "regression":
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_mse)
            else:
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_gini)
            
            if node_impurity <= best_overall_metric:
                best_overall_metric = node_impurity
                best_split_column = column
                best_split_value = split
    
    return best_split_column, best_split_value

# Algorithm

In [3]:
def decision_tree_algorithm(df, column_types, ml_task, min_samples=2, max_depth=5):
    
    leaves = []
    path = 'root'
    datasets = [(df,path)]
    split_conditions = []
    for current_depth in range(max_depth+1):
        next_set = []
        for dataset in datasets:
            data = dataset[0]
            path = dataset[1]
            
            if (len(data.target.unique()) == 1) or (len(data) < min_samples):
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue

            potential_splits = get_potential_splits(data)
            split_column, split_value = determine_best_split(data, column_types, potential_splits, ml_task)
            data_left, data_right = split_data(data, column_types, split_column, split_value)

            if len(data_left) == 0 or len(data_right) == 0:
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue
            print(len(data_left),len(data_right))
            split_conditions.append((path,split_column,split_value))
            next_set.append((data_left,path+',l'))
            next_set.append((data_right,path+',r'))

        datasets = next_set

    for dataset in datasets:
        data = dataset[0]
        path = dataset[1]
        leaf = create_leaf(data[['target']], ml_task)
        leaves.append((path,leaf))

    return leaves, split_conditions

# Make predictions with decision tree

def make_predictions(df, column_types, leaves, split_conditions):

    df['path'] = 'root'
    df['value'] = 0
    
    for split_condition in split_conditions:
        path = split_condition[0]
        column = split_condition[1]
        value = split_condition[2]

        if column_types[column] == "continuous":
            df.loc[(df['path']==path)&(df[column]<= value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]> value),'path'] = path+',r'
        else:
            df.loc[(df['path']==path)&(df[column]== value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]!= value),'path'] = path+',r'

    df['prediction'] = df['path'].map(dict(leaves))

    return df


def calculate_accuracy(df, column_types, ml_task, leaves, split_conditions):
    predictions = make_predictions(df, column_types, leaves, split_conditions).prediction
    
    if ml_task == 'regression':    
        metric = sum((predictions - df.target)**2) / len(predictions)
    else:
        predictions_correct = predictions == df.target
        metric = predictions_correct.mean()
    
    return  metric

# Data Loading

In [4]:
## Read csvs
train_df = pd.read_csv('data/classification/train.csv', index_col=0)

In [5]:
train_df = train_df.fillna(0)
train_df = train_df[['images','urls','chars_in_subject','chars_in_body','label']].rename(columns={'label':'target'})
train, val = train_test_split(train_df, test_size = 0.2)
column_types = {'images':'continuous','urls':'continuous','chars_in_subject':'continuous','chars_in_body':'continuous'}
ml_task = 'classifier'

# Model Training

In [6]:
leaves, split_conditions = decision_tree_algorithm(train, column_types, ml_task, min_samples=2, max_depth=3)

29803 34337
9770 20033
18820 15517
6206 3564
4427 15606
10009 8811
2664 12853
1322 4884
3002 562
1136 3291
6187 9419
9092 917
2807 6004
2505 159
4815 8038


In [7]:
dict(leaves)

{'root,l,l,l,l': 1,
 'root,l,l,l,r': 1,
 'root,l,l,r,l': 1,
 'root,l,l,r,r': 1,
 'root,l,r,l,l': 1,
 'root,l,r,l,r': 1,
 'root,l,r,r,l': 3,
 'root,l,r,r,r': 1,
 'root,r,l,l,l': 0,
 'root,r,l,l,r': 0,
 'root,r,l,r,l': 1,
 'root,r,l,r,r': 1,
 'root,r,r,l,l': 1,
 'root,r,r,l,r': 1,
 'root,r,r,r,l': 1,
 'root,r,r,r,r': 1}

In [8]:
split_conditions

[('root', 'images', 1),
 ('root,l', 'chars_in_subject', 29.0),
 ('root,r', 'chars_in_body', 31735),
 ('root,l,l', 'urls', 4),
 ('root,l,r', 'chars_in_body', 1250),
 ('root,r,l', 'chars_in_subject', 40.0),
 ('root,r,r', 'urls', 40),
 ('root,l,l,l', 'chars_in_subject', 11.0),
 ('root,l,l,r', 'images', 0),
 ('root,l,r,l', 'chars_in_body', 6),
 ('root,l,r,r', 'urls', 3),
 ('root,r,l,l', 'urls', 51),
 ('root,r,l,r', 'images', 3),
 ('root,r,r,l', 'chars_in_body', 1050392),
 ('root,r,r,r', 'chars_in_subject', 51.0)]

# Prediction

In [9]:
predict = make_predictions(val, column_types, leaves, split_conditions)
predict

Unnamed: 0,images,urls,chars_in_subject,chars_in_body,target,path,value,prediction
52062,9,35,35.0,13897,0,"root,r,l,l,l",0,0
53990,0,0,22.0,132103,0,"root,l,l,l,r",0,1
50011,0,0,30.0,1332,1,"root,l,r,r,l",0,3
55164,5,20,29.0,37157,2,"root,r,r,l,l",0,1
16100,1,30,49.0,18795,1,"root,l,r,r,r",0,1
...,...,...,...,...,...,...,...,...
38064,0,12,21.0,5676,1,"root,l,l,r,l",0,1
17495,0,30,3.0,6020,1,"root,l,l,r,l",0,1
20103,0,6,19.0,4142,1,"root,l,l,r,l",0,1
4725,0,0,57.0,688,1,"root,l,r,l,r",0,1


In [10]:
accuracy = calculate_accuracy(val, column_types, ml_task, leaves, split_conditions)
accuracy

0.47050386630082314