# Exercise Self information and Entropy
Show that -log(p(A)) satisfies all 3 properties required by self-information

In [3]:
import math

## I(1) = 0

In [6]:
# Property 1: a certain event (p = 1) carries no information, i.e. I(1) = -log(1) = 0.
-math.log(1)

-0.0

## P(B) > P(A) --> I(A) > I(B)

In [46]:
# Prints the negated natural logarithm of 10 and of 5; the larger argument yields the smaller value.
# NOTE(review): 10 and 5 are not probabilities (self-information expects p in (0, 1]) and the
# labels read "log of" although -log is printed — consider demonstrating with e.g. p = 0.1 vs. p = 0.2.
print(f"""log of 10: \t{-math.log(10)} 
log of 5:\t{-math.log(5)}""")

log of 10: 	-2.302585092994046 
log of 5:	-1.6094379124341003


## I(A,B) = I(A) + I(B)

In [13]:
# Property 3: for independent events the joint probability is the product p(A) * p(B),
# and the logarithm turns that product into a sum, so I(A, B) = I(A) + I(B).
# The two sides are rounded differently in floating point, so they are compared with
# math.isclose rather than ==; 0.3 and 0.5 are used because the arguments must be probabilities.
math.isclose(-math.log(0.3 * 0.5), -math.log(0.3) - math.log(0.5))

True

## Decision Tree - Exercises

This notebook contains exercises concerning decision trees.

In [47]:
import numpy as np

# Toy data set of 14 shapes; every array below holds the class labels of one set of samples.
# The full set A is split once per binary attribute (consistent with the feature
# matrix used in Exercise 4):
#   attribute1: corner ({true, false}) -> B1 (triangles, rectangles) / B2 (circles)
#   attribute2: blue   ({true, false}) -> C1 / C2

# Integer class labels (categorical, not one-hot encoded).
circle, triangle, rectangle = 1, 2, 3

# Full set: 6 circles, 3 triangles, 5 rectangles.
A = np.repeat([circle, triangle, rectangle], [6, 3, 5])

# Split by attribute1: 3 triangles + 5 rectangles vs. 6 circles.
B1 = np.repeat([triangle, rectangle], [3, 5])
B2 = np.repeat([circle], [6])

# Split by attribute2: 3 triangles + 2 circles vs. 4 circles + 5 rectangles.
C1 = np.repeat([triangle, circle], [3, 2])
C2 = np.repeat([circle, rectangle], [4, 5])

### Exercise 1 - Entropy:
Implement the entropy in python. Compare your implementation with your calculated values.

**Use the following stub for your implementation:**

```python
def entropy(Y: np.array):
    """ Compute the entropy of a given array of class-labels.
    
    Parameters
    ----------
    Y: np.array
        A one dimensional numpy array containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values.

    Returns
    ----------
    Entropy of Y.
    """
    pass
```

In [73]:
def entropy(Y: np.ndarray):
    """ Compute the entropy of a given array of class-labels.

    The entropy is H(Y) = -sum_c p(c) * log2(p(c)), where p(c) is the relative
    frequency of class label c among all entries of Y.

    Parameters
    ----------
    Y: np.ndarray
        A one dimensional numpy array (or any array-like, e.g. a list) containing
        class labels. Class labels are assumed not to be one-hot encoded but
        categorical integer values.

    Returns
    ----------
    Entropy of Y in bits. An empty input has an entropy of 0.0.
    """
    labels = np.asarray(Y)
    if labels.size == 0:
        return 0.0

    # Only labels that actually occur are counted, so every probability is > 0
    # and log2 never sees a zero.
    _, counts = np.unique(labels, return_counts=True)
    # np.unique counts over all entries, hence normalise by the total size.
    probabilities = counts / labels.size

    # Subtract from 0.0 instead of negating so a single-class input gives 0.0, not -0.0.
    return 0.0 - np.sum(probabilities * np.log2(probabilities))

In [74]:
# Print the entropy of the full label set A and of each subset (B1, B2, C1, C2), one per line.
print(f"""Entropies: 
A: \t{entropy(A)} 
B1: \t{entropy(B1)} 
B2: \t{entropy(B2)} 
C1: \t{entropy(C1)} 
C2: \t{entropy(C2)} """)

Entropies: 
A: 	1.5306189948485174 
B1: 	0.954434002924965 
B2: 	0.0 
C1: 	0.9709505944546686 
C2: 	0.9910760598382222 


### Exercise 2 - Information Gain:

Implement the conditional entropy and information gain. Compare your implementation with your calculated results.

**Use the following stubs for your implementation:**

```python
def conditional_entropy(Sa: list):
    """ Compute the conditional entropy.

    Compute the conditional entropy for a list of numpy arrays with given class labels. Each list entry
    is assumed to contain the class labels of a set of data that was created by splitting a training set
    of data according to an attribute.

    Parameters
    ----------
    Sa: [np.array]
        A list of one dimensional numpy arrays each containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values.

    Returns
    ----------
    Conditional entropy of Sa.
    """
    pass
```

```python
def information_gain(T : np.array, Sa : list):
    """ Compute the information gain.

    Parameters
    ----------
    T: np.array
        A one dimensional numpy array containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values.

    Sa: [np.array]
        A list of one dimensional numpy arrays each containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values.

    Returns
    ----------
    Bits saved when encoding Sa instead of T.
    """
    pass
```



In [90]:
def conditional_entropy(Sa: list):
    """ Compute the conditional entropy.

    Compute the conditional entropy for a list of numpy arrays with given class labels. Each list entry
    is assumed to contain the class labels of a set of data that was created by splitting a training set
    of data according to an attribute. The result is the average of the subsets' entropies, each
    weighted by the fraction of all samples that fall into that subset.

    Parameters
    ----------
    Sa: [np.array]
        A list of one dimensional numpy arrays each containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values. The list may
        contain any number of subsets.

    Returns
    ----------
    Conditional entropy of Sa. If Sa contains no samples at all, 0.0 is returned.
    """
    # Total number of samples over *all* subsets (a split can have more than two branches).
    n_total = sum(len(subset) for subset in Sa)
    if n_total == 0:
        return 0.0

    # Weight every subset's entropy by its share of the samples.
    return sum((len(subset) / n_total) * entropy(subset) for subset in Sa)

In [91]:
# Candidate split of A into B1 (triangles and rectangles) and B2 (circles only).
Sa = [B1, B2]

In [92]:
# Size-weighted average entropy of the subsets in Sa.
conditional_entropy(Sa)

0.5453908588142657

In [93]:
def information_gain(T : np.array, Sa : list):
    """ Compute the information gain of a split.

    The information gain is the entropy of the unsplit labels T minus the
    conditional entropy of the subsets Sa.

    Parameters
    ----------
    T: np.array
        A one dimensional numpy array containing the class labels before the
        split. Class labels are assumed not to be one-hot encoded but
        categorical integer values.

    Sa: [np.array]
        A list of one dimensional numpy arrays, each containing the class labels
        of one subset after the split. Class labels are assumed not to be
        one-hot encoded but categorical integer values.

    Returns
    ----------
    Bits saved when encoding Sa instead of T.
    """
    entropy_before_split = entropy(T)
    entropy_after_split = conditional_entropy(Sa)
    return entropy_before_split - entropy_after_split

In [94]:
# Entropy of A minus the conditional entropy of the split Sa.
information_gain(A, Sa)

0.9852281360342516

### Exercise 3 - Gini Impurity:

Implement the Gini Impurity and compare your results with your calculations.

**Use the following stubs for your implementation:**

```python
def gini_impurity(Y:np.array):
   """ Compute the gini impurity.

    Parameters
    ----------
    Y: np.array
        A one dimensional numpy array containing class labels. Class labels
        are assumed not to be one-hot encoded but categorical integer values.

    Returns
    ----------
    Gini impurity of the set with labels Y.
    """
    pass
```


### Exercise 4 - Decision Tree in scikit-learn:

Let's now use a full implementation of decision trees from scikit-learn.

* Build a decision tree using **gini impurity** and a decision tree using **entropy** based on the implementation in sklearn (https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).
* Interpret and compare the results by using the ```plot_tree()``` method.

In [None]:
from sklearn import tree
# Class labels, in the same order as the rows of X: 6 circles, 3 triangles, 5 rectangles.
Y = np.repeat([circle, triangle, rectangle], [6, 3, 5])

# Attributes (aka features), one row per sample: column 0 = corners, column 1 = blue.
# sklearn does not support categorical attributes, so each boolean attribute is
# encoded numerically with 0 as False and 1 as True.
X = np.array(
    [[0, 1]] * 2      # circles 1-2
    + [[0, 0]] * 4    # circles 3-6
    + [[1, 1]] * 3    # triangles 1-3
    + [[1, 0]] * 5    # rectangles 1-5
)

### Exercise 5 - Experiment with Decision Trees in scikit-learn:

In this exercise it is your job to experiment with the decision tree implementation of sklearn. I provided you with a ```gen_data()``` method
which is capable of generating (random) data. Use the ```accuracy_score()``` method from sklearn.metrics for evaluation. Train a classifier for this synthetic data and try to answer the following questions:

* How does the accuracy change depending on the number of samples?
* Create separate datasets for training and testing. What is the difference between training and test accuracy? Why is it different?
* Restrict the depth of the decision tree, what changes?
* Have a closer look at the decisions in a tree. Explain the results. (Note: You can increase the size of the plotted tree using this line of code ```plt.figure(figsize=(20,20))``` before calling the ```tree.plot_tree``` method)

In [None]:
import matplotlib.pyplot as plt

def gen_data(num_samples=10, seed=None):
    """ Generate random 2-D points with binary labels and scatter-plot them.

    The points are drawn uniformly from [0, 10) in both dimensions. Labels are
    assigned purely by position in the array (first half 1, second half 0) and
    are therefore independent of the point coordinates.

    Parameters
    ----------
    num_samples: int
        Number of points to generate.

    seed: int or None
        If given, the points are drawn from a dedicated generator seeded with this
        value, which makes the result reproducible. If None (default), numpy's
        global random state is used.

    Returns
    ----------
    Tuple (X, Y): X has shape (num_samples, 2) and holds the points, Y has shape
    (num_samples,) and holds the labels. As a side effect a new matplotlib figure
    with one scatter series per class is created.
    """
    scale = 10
    offset = 0
    half = num_samples // 2

    # Both numpy's legacy module-level API and a Generator provide uniform(low, high, size).
    rng = np.random if seed is None else np.random.default_rng(seed)

    X = scale * rng.uniform(0, 1, (num_samples, 2)) + offset
    Y = np.zeros(num_samples)
    Y[:half] = 1

    plt.figure()
    plt.scatter(X[:half, 0], X[:half, 1])
    # Slice up to the end so that the last sample is plotted as well.
    plt.scatter(X[half:, 0], X[half:, 1])
    plt.legend(['Class-1', 'Class-2'])
    return X, Y

### Exercise 6 - Condition monitoring of hydraulic systems data set

Source: https://archive.ics.uci.edu/ml/datasets/Condition+monitoring+of+hydraulic+systems

* Load the dataset and use a DecisionTree to classify it.
* Make sure you are using a test- and train-split.
* Try to predict the different type of faults using decision trees.
* Explain what attributes the trees select.


#### Data Set Information:

The data set was experimentally obtained with a hydraulic test rig. This test rig consists of a primary working and a secondary cooling-filtration circuit which are connected via the oil tank [1], [2]. The system cyclically repeats constant load cycles (duration 60 seconds) and measures process values such as pressures, volume flows and temperatures while the condition of four hydraulic components (cooler, valve, pump and accumulator) is quantitatively varied.


#### Attribute Information:

The data set was experimentally obtained with a hydraulic test rig. This test rig consists of a primary working and a secondary cooling-filtration circuit which are connected via the oil tank [1], [2]. The system cyclically repeats constant load cycles (duration 60 seconds) and measures process values such as pressures, volume flows and temperatures while the condition of four hydraulic components (cooler, valve, pump and accumulator) is quantitatively varied.

**Attributes are:**
```
Attribute   Sensor	    Physical quantity		        Unit	    Sampling rate
X[0]           PS1		Pressure			bar		100 Hz
X[1]           PS2		Pressure			bar		100 Hz
X[2]           PS3		Pressure			bar		100 Hz
X[3]           PS4		Pressure			bar		100 Hz
X[4]           PS5		Pressure			bar		100 Hz
X[5]           PS6		Pressure			bar		100 Hz
X[6]           EPS1             Motor power			W		100 Hz
X[7]           FS1		Volume flow			l/min		10 Hz
X[8]           FS2		Volume flow			l/min		10 Hz
X[9]           TS1		Temperature			°C		1 Hz
X[10]          TS2		Temperature			°C		1 Hz
X[11]          TS3		Temperature			°C		1 Hz
X[12]          TS4		Temperature			°C		1 Hz
X[13]          VS1		Vibration			mm/s		1 Hz
X[14]          CE		Cooling efficiency (virtual)	%		1 Hz
X[15]          CP		Cooling power (virtual)		kW		1 Hz
X[16]          SE		Efficiency factor		%		1 Hz
```

The target conditions are:

**1: Cooler condition / %:**
* 3: close to total failure
* 20: reduced efficiency
* 100: full efficiency

**2: Valve condition / %:**
* 100: optimal switching behavior
* 90: small lag
* 80: severe lag
* 73: close to total failure

**3: Internal pump leakage:**
* 0: no leakage
* 1: weak leakage
* 2: severe leakage

**4: Hydraulic accumulator / bar:**
* 130: optimal pressure
* 115: slightly reduced pressure
* 100: severely reduced pressure
* 90: close to total failure

    

In [None]:
# Download the pickled data set into ../data (needs the `wget` command-line tool and network access).
!wget https://github.com/shegenbart/Jupyter-Exercises/raw/main/data/condition_monitoring.pickle -P ../data

import pickle
import numpy as  np
from dataclasses import dataclass

@dataclass
class Dataset:
    """ Container for the hydraulic condition-monitoring data set.

    NOTE(review): the field descriptions below are inferred from the field names
    and the notebook's data set description; confirm them against the loaded pickle.
    """
    # Free-text description of the data set.
    Description: str
    # Presumably the names of the attributes (sensors), one per column of X.
    Attributes: list
    # Presumably the possible target values of each monitored component.
    Targets_cooler: list
    Targets_valve: list
    Targets_leakage: list
    Targets_accu: list
    # Presumably the attribute matrix with one row per load cycle.
    X: np.ndarray
    # Presumably one label per row of X for each monitored component.
    Y_cooler: np.ndarray
    Y_valve: np.ndarray
    Y_leakage: np.ndarray
    Y_accu: np.ndarray

def load_dataset(filename):
    """ Read a pickled object from a file and return it.

    Parameters
    ----------
    filename: str
        Path of the pickle file to read.

    Returns
    ----------
    The object stored in the pickle file.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only load files
    # from a source you trust.
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Load the data set from the file downloaded into ../data at the top of this cell.
# NOTE(review): presumably the pickle holds a Dataset instance, in which case the Dataset
# class above must be defined before loading — confirm.
data = load_dataset('../data/condition_monitoring.pickle')