In [None]:
import logging
# Give the root logger a stream handler only when it has none yet, so that
# re-running this cell does not stack handlers and duplicate every log line.
logger = logging.getLogger()
if len(logger.handlers) == 0:
    logger.addHandler(logging.StreamHandler())

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_common import PERSONALITY_FEATURES, EMOTIONS_FEATURES, EMOTIONS_LABELS, DAA, CLUSTER

from kmeans import KMeans
from kmeans_printer import print_per_game_kmeans

# Dataset selector: swap the commented line to analyse the other paradox.
#paradox = 'ellsberg'
paradox = 'allais'

# Read the CSV for the chosen paradox and keep only the personality features,
# the emotion features and the DAA column, in that order.
df = pd.read_csv(f'data/data_{paradox}.csv')[PERSONALITY_FEATURES + EMOTIONS_FEATURES + [DAA]]

In [None]:
# Clustering configuration: the value handed to KMeans as `random_state`, and K
# (presumably the number of clusters -- confirm against the kmeans module).
random_state, K = 69, 2

# Cluster the rows using the emotion features only, then print/plot the outcome.
km = KMeans(df, K, features=EMOTIONS_FEATURES, random_state=random_state)
print_per_game_kmeans(km, EMOTIONS_LABELS, plot_graph=True)

In [None]:
from symmetric_uncertainty import SUT

class DecisionNode:
    """A node of a binary decision tree that grows its own subtree on construction.

    Every node wraps its rows in a ``SUT`` object and scores them through
    ``SUT.metric`` (presumably a symmetric-uncertainty score, judging by the
    module name -- confirm against ``symmetric_uncertainty``). A candidate split
    partitions the rows into ``param <= cutoff`` and ``param > cutoff`` for one of
    ``PERSONALITY_FEATURES``; its gain is the larger of the two children's metric
    improvements over this node's own metric.
    """

    # Next id to hand out. Shared by every node ever created, so callers reset it
    # to 0 before building a new tree when ids should start from 0 again.
    ID = 0

    def __init__(self, df, min_gain, min_samples_leaf, index=None, columns=None):
        """Create the node and immediately try to split it (recursively).

        Args:
            df: rows that belong to this node.
            min_gain: smallest gain for which the node is split.
            min_samples_leaf: minimum number of rows each child must receive.
            index: forwarded to ``SUT`` as its second argument.
            columns: forwarded to ``SUT`` as its third argument.
        """
        self.__log = logging.getLogger('dt')

        self.__id = DecisionNode.ID
        DecisionNode.ID += 1
        self.__sut = SUT(df, index, columns)

        self.__min_samples_leaf = min_samples_leaf
        # A node is only considered for splitting when it holds at least three
        # times the minimum leaf size.
        self.__min_samples_split = 3 * self.__min_samples_leaf
        self.__min_gain = min_gain

        self.__param = None
        self.__cutoff = None
        self.__gain = 0
        self.__left = None
        self.__right = None

        self.__fit()

    @property
    def id(self):
        """Identifier assigned to this node at construction time."""
        return self.__id

    @property
    def sut(self):
        """The ``SUT`` object wrapping this node's rows."""
        return self.__sut

    @property
    def is_leaf(self):
        """True when the node has neither a left nor a right child."""
        return self.__left is None and self.__right is None

    @property
    def leaf_nodes(self):
        """Ids of all leaves below (or at) this node, left subtree first."""
        if self.is_leaf:
            return [self.id]
        return self.left.leaf_nodes + self.right.leaf_nodes

    @property
    def param(self):
        """Feature of the best split found for this node, or None.

        Note that this is set even when the best split was rejected for having
        too low a gain; check ``is_leaf`` to know whether the split was applied.
        """
        return self.__param

    @property
    def cutoff(self):
        """Threshold of the best split found for this node, or None."""
        return self.__cutoff

    @property
    def left(self):
        """Child holding the rows with ``param <= cutoff``, or None."""
        return self.__left

    @property
    def right(self):
        """Child holding the rows with ``param > cutoff``, or None."""
        return self.__right

    @property
    def metric(self):
        """Metric of this node's own rows, as reported by ``SUT.metric``."""
        return self.__sut.metric

    @property
    def subtree_metric(self):
        """Own metric for a leaf, otherwise the sum of the two children's metrics.

        NOTE(review): only the direct children's own metrics are summed; this
        does not recurse into their ``subtree_metric``. Confirm that is intended.
        """
        if self.is_leaf:
            return self.metric
        return self.left.metric + self.right.metric

    def label(self, df, column):
        """Write node ids into ``df[column]`` for the rows covered by this subtree.

        The node writes its own id first and the children overwrite it afterwards,
        so every row ends up carrying the id of the deepest node that contains it.
        """
        df.loc[self.sut.df.index, column] = self.__id
        if self.left is not None:
            self.left.label(df, column)
        if self.right is not None:
            self.right.label(df, column)

    def __find_cutoff(self, param, cutoff):
        """Partition this node's rows into (``param <= cutoff``, ``param > cutoff``)."""
        rows = self.__sut.df
        left_df = rows[rows[param] <= cutoff]
        right_df = rows[rows[param] > cutoff]
        return left_df, right_df

    def __find_cutoff_gain(self, current, param, cutoff):
        """Gain of splitting on ``param`` at ``cutoff`` relative to ``current``.

        Returns 0 when either side would hold fewer than ``min_samples_leaf``
        rows; otherwise the larger of the two children's metric improvements.
        """
        left_df, right_df = self.__find_cutoff(param, cutoff)

        left_len = len(left_df)
        right_len = len(right_df)

        if left_len < self.__min_samples_leaf or right_len < self.__min_samples_leaf:
            self.__log.info(f'\t\t\tNot enough samples in leaf: {left_len}, {right_len}')
            return 0

        left_sut = SUT(left_df, self.__sut.index, self.__sut.columns)
        self.__log.debug('\t\t\t' + '\n\t\t\t'.join(left_sut.df_p.to_string().split('\n')))

        left_metric = left_sut.metric
        self.__log.info(f'\t\t\tL: {left_metric}')

        right_sut = SUT(right_df, self.__sut.index, self.__sut.columns)
        self.__log.debug('\t\t\t' + '\n\t\t\t'.join(right_sut.df_p.to_string().split('\n')))

        right_metric = right_sut.metric
        self.__log.info(f'\t\t\tR: {right_metric}')

        return max(left_metric - current, right_metric - current)

    def __find_best_cutoff_index(self, current, param):
        """Best ``(cutoff, gain)`` for ``param``.

        Candidate cutoffs are the midpoints between consecutive distinct values.
        Returns ``(None, None)`` when ``param`` has fewer than two distinct values.
        """
        best_cutoff = best_gain = None

        values = sorted(self.__sut.df[param].unique())
        cutoffs = [(low + high) / 2 for low, high in zip(values, values[1:])]
        self.__log.info(f'\t{values}')
        # Shuffling only affects which cutoff wins among equal gains.
        np.random.shuffle(cutoffs)
        for cutoff in cutoffs:
            self.__log.info(f'\t\t{cutoff}')
            gain = self.__find_cutoff_gain(current, param, cutoff)
            self.__log.info(f'\t\t\tGain: {gain}')
            # Compare against None explicitly: a best gain of exactly 0 is a real
            # candidate and must not be displaced by a later, negative gain.
            if best_gain is None or gain > best_gain:
                best_cutoff, best_gain = cutoff, gain

        return best_cutoff, best_gain

    def __find_best_cutoff(self, current):
        """Best ``(param, cutoff, gain)`` over all ``PERSONALITY_FEATURES``.

        Features whose best gain is None or 0 are ignored. Returns
        ``(None, None, None)`` when no feature qualifies.
        """
        best_param = best_cutoff = best_gain = None

        params = PERSONALITY_FEATURES.copy()
        # Shuffling only affects which feature wins among equal gains.
        np.random.shuffle(params)
        for param in params:
            self.__log.info(f'\t{param}')
            cutoff, gain = self.__find_best_cutoff_index(current, param)
            self.__log.info(f'\tBest gain for {param}: {gain}')

            # None: no candidate cutoff at all; 0: no cutoff changed the metric
            # (or none left enough rows on both sides).
            if not gain:
                continue

            if best_gain is None or gain > best_gain:
                best_param, best_cutoff, best_gain = param, cutoff, gain

        return best_param, best_cutoff, best_gain

    def __fit(self):
        """Split this node when it is large enough and the best gain is sufficient."""
        current = self.__sut.metric
        self.__log.warning(f'Node #{self.__id} [{current}]')

        if self.sut.len < self.__min_samples_split:
            self.__log.warning('\tNot splitting node, not enough samples')
            return

        self.__param, self.__cutoff, self.__gain = self.__find_best_cutoff(current)
        if not self.__gain or self.__gain < self.__min_gain:
            self.__log.warning('\tNot splitting node, gain too low')
            return

        left_df, right_df = self.__find_cutoff(self.__param, self.__cutoff)

        self.__log.warning(f'\tBest gain: {self.__gain}\n')
        self.__log.warning('\tLeft:\t' + '\n\t'.join(str(SUT(left_df, self.__sut.index, self.__sut.columns)).split('\n')))
        self.__log.warning('\tRight:\t' + '\n\t'.join(str(SUT(right_df, self.__sut.index, self.__sut.columns)).split('\n')))

        # Constructing the children recursively grows both subtrees.
        self.__left = DecisionNode(left_df,
                                   min_gain=self.__min_gain,
                                   min_samples_leaf=self.__min_samples_leaf,
                                   index=self.__sut.index,
                                   columns=self.__sut.columns)
        self.__right = DecisionNode(right_df,
                                    min_gain=self.__min_gain,
                                    min_samples_leaf=self.__min_samples_leaf,
                                    index=self.__sut.index,
                                    columns=self.__sut.columns)

class DecisionTree:
    """Thin wrapper that builds a tree of DecisionNode objects and exposes its root."""

    def __init__(self, df, min_gain, min_samples_leaf):
        """Build the whole tree; constructing the root node grows it recursively.

        Args:
            df: rows to build the tree from.
            min_gain: forwarded to every node as its minimum splitting gain.
            min_samples_leaf: forwarded to every node as its minimum child size.
        """
        self.__root = DecisionNode(df, min_gain=min_gain, min_samples_leaf=min_samples_leaf)

    @property
    def root(self):
        """The root node of the tree."""
        return self.__root

    def label(self, df, column):
        """Fill ``df[column]`` with node ids via the root, then cast it to int."""
        self.__root.label(df, column)
        labelled = df[column]
        df[column] = labelled.astype(int)

In [None]:
# Attach each row's k-means label, then build the tree input without the
# emotion feature columns.
df[CLUSTER] = km.labels
dt_df = df.drop(EMOTIONS_FEATURES, axis=1)

logging.getLogger('dt').setLevel('WARNING')

# DecisionNode shuffles its candidates with the legacy global
# `np.random.shuffle`, so it is the global RandomState that has to be seeded.
# `np.random.default_rng(seed=...)` only creates a separate Generator object
# and leaves the global state untouched, so it did not make the run repeatable.
np.random.seed(1337)

# Restart node numbering so the root is node 0 every time this cell is run.
DecisionNode.ID = 0
dt = DecisionTree(dt_df, min_gain=0.080, min_samples_leaf=20)

In [None]:
# Name of the column that receives, for every row, the id of the tree node
# the row ends up in.
node = 'Node'
dt.label(df, node)

# Show the leaf ids and how many rows landed in each node.
leaf_ids = dt.root.leaf_nodes
print(leaf_ids)
print(df[node].value_counts())

In [None]:
from sklearn.decomposition import PCA
from pca import print_decomposition

# Axis limits shared by every PCA plot in the notebook.
lim = (-4, 4)

# Project the emotion features onto their first two principal components and
# plot the projection, labelling the points by tree node.
pca_model = PCA(n_components=2, svd_solver='full')
decomp = pca_model.fit_transform(df[EMOTIONS_FEATURES])
print_decomposition(decomp, df, labels=df[node], xlim=lim, ylim=lim)

In [None]:
from pca_edt import print_df_pca

# Copy of the data with the two PCA coordinates appended as columns.
decomp_df = df.assign(PCAx=decomp[:, 0], PCAy=decomp[:, 1])

# One plot per leaf: only the rows that ended up in that leaf, labelled by
# their k-means cluster.
for n in dt.root.leaf_nodes:
    title = f'--- Node #{n} ---'
    node_df = decomp_df.loc[decomp_df[node] == n]
    print_df_pca(node_df, title, col_x='PCAx', col_y='PCAy', col_lbl=CLUSTER, xlim=lim, ylim=lim)

In [None]:
class DotPrinter:
    """Renders a decision tree as Graphviz DOT source text."""

    def __init__(self, dt):
        """Remember the tree to render; ``dt.root`` is read when printing."""
        self.__dt = dt

    def __node_children(self, node, left):
        """DOT statements for one child of ``node``: its box, then the edge to it.

        The left edge carries the ``<=`` criterion as an ``xlabel``, the right
        edge carries the ``>`` criterion as a ``label``.
        """
        if left:
            child, edge_attr, comparison = node.left, 'xlabel', '<='
        else:
            child, edge_attr, comparison = node.right, 'label', '>'

        criteria = f'{node.param} {comparison} {node.cutoff}'
        box_text = self.__node_label(child)

        box_stmt = f'{child.id} [label="{box_text}",shape=box,style=filled,color=".7 .3 1."];'
        edge_stmt = f'{node.id} -> {child.id} [{edge_attr}="{criteria}"];'
        return box_stmt + '\n' + edge_stmt

    def __node_label(self, node):
        """Text shown inside a node's box: id, sample count, metric and counts.

        NOTE(review): the first three separators are the two-character DOT escape
        (backslash + n) while the last two are real newline characters; this mix
        is kept exactly as it was.
        """
        sut = node.sut
        n_samples = sut.len
        su = sut.metric
        counts = sut.df_counts
        pieces = [
            f'id = {node.id}\\n',
            f'samples = {n_samples}\\n',
            f'su = {su:.5f}\\n',
            'cluster   0   1\n',
            f'not-daa {counts.loc[0].tolist()}\n',
            f'daa       {counts.loc[1].tolist()}',
        ]
        return ''.join(pieces)

    def __node_content(self, node):
        """DOT statements for everything below ``node`` (empty string for a leaf)."""
        if node.is_leaf:
            return ''

        # Left branch (child + its subtree) first, then the right branch.
        sections = []
        for is_left, child in ((True, node.left), (False, node.right)):
            sections.append(self.__node_children(node, left=is_left))
            sections.append(self.__node_content(child))
        return '\n'.join(sections)

    def print(self):
        """Return the complete DOT document for the tree as a string."""
        root = self.__dt.root
        root_text = self.__node_label(root)
        body = self.__node_content(root)

        document = [
            'digraph BST {',
            'node [fontname="Tahoma"]',
            f'{root.id} [label="{root_text}",shape=box,style=filled,color=".6 .2 1."];',
            body,
            '}',
        ]
        return '\n'.join(document)

#### Write report: Generate a DOT file and a PNG of the emotion decision tree

In [None]:
import subprocess

# Output file names: <paradox>_model_of_models.<random_state>.dot / .png
report_basename = f'{paradox}_model_of_models'
report_dot = f'{report_basename}.{random_state}.dot'
report_png = f'{report_basename}.{random_state}.png'

# Write the DOT source of the tree.
dot_output = DotPrinter(dt)
with open(report_dot, 'w') as fout:
    fout.write(dot_output.print())

# Render the DOT file to a PNG with the Graphviz `dot` executable; raises
# CalledProcessError when the command exits with a non-zero status.
subprocess.check_call(['dot', report_dot, '-Tpng', '-o' + report_png])