In [1]:
import pandas as pd
import math

# Preprocessing and evaluating dataset

In [2]:
with open("./data/breast-cancer.names") as file:
    for line in file:
        print(line)

Citation Request:

   This breast cancer domain was obtained from the University Medical Centre,

   Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 

   M. Soklic for providing the data.  Please include this citation if you plan

   to use this database.



1. Title: Breast cancer data (Michalski has used this)



2. Sources: 

   -- Matjaz Zwitter & Milan Soklic (physicians)

      Institute of Oncology 

      University Medical Center

      Ljubljana, Yugoslavia

   -- Donors: Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)

   -- Date: 11 July 1988



3. Past Usage: (Several: here are some)

     -- Michalski,R.S., Mozetic,I., Hong,J., & Lavrac,N. (1986). The 

        Multi-Purpose Incremental Learning System AQ15 and its Testing 

        Application to Three Medical Domains.  In Proceedings of the 

        Fifth National Conference on Artificial Intelligence, 1041-1045,

        Philadelphia, PA: Morgan Kaufmann.

        -- accuracy

In [3]:
df = pd.read_csv("./data/breast-cancer.data", header=None)
df = df.rename(columns={
    0: "Label", 
    1: "Age", 
    2: "Menopause",
    3: "Tumor Size",
    4: "Inv Nodes",
    5: "Node Caps",
    6: "Malignance Degree",
    7: "Breast",
    8: "Breast Quadrant",
    9: "Irradiated"
})
df.head(15)

Unnamed: 0,Label,Age,Menopause,Tumor Size,Inv Nodes,Node Caps,Malignance Degree,Breast,Breast Quadrant,Irradiated
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
5,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
6,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
7,no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no
8,no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no
9,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no


In [4]:
df.columns
# df["Age"].value_counts()
# df["Menopause"].value_counts()
# df["Tumor Size"].value_counts()
# df["Inv Nodes"].value_counts()
# df["Node Caps"].value_counts()
# df["Malignance Degree"].value_counts()
# df["Breast"].value_counts()
# df["Breast Quadrant"].value_counts()
# df["Irradiated"].value_counts()
df = df[df["Breast Quadrant"] != "?"]
df = df[df["Node Caps"] != "?"]

# Assumptions and extracting columns
Chosen three attributes: age, tumor size, and degree.

* Age:
    * Young if between 20-39
    * Aging if between 40-59
    * Old if 60 or more
* Tumor size:
    * Small if between 0-19
    * Medium if between 20-39
    * Large if 40 or more
* Degree:
    * 1
    * 2
    * 3

In [5]:
def get_age(age_range):
    age = int(age_range.split("-")[1])
    try:
        if age < 40:
            return "Young"
        if age < 60:
            return "Aging"
    except ValueError:
        pass
    return "Old"


def get_tumor_size(size_range):
    size = int(size_range.split("-")[1])
    try:
        if size < 20:
            return "Small"
        if size < 40:
            return "Medium"
    except ValueError:
        pass
    return "Large"


def get_breast_quad(quad):
    try:
        if quad == "left_low":
            return "Left Low"
        if quad == "left_up":
            return "Left Up"
        if quad == "right_up":
            return "Right Up"
        if quad == "right_low":
            return "Right Low"
    except ValueError:
        pass
    return "Central"


def get_degree(degree):
    try:
        if degree == 1:
            return "Degree 1"
        if degree == 2:
            return "Degree 2"
    except ValueError:
        pass
    return "Degree 3"


def get_label(label):
    if label == "no-recurrence-events":
        return 1
    return 0


df["Age"] = df["Age"].apply(lambda x: get_age(x))
df["Tumor Size"] = df["Tumor Size"].apply(lambda x: get_tumor_size(x))
df["Breast Quadrant"] = df["Breast Quadrant"].apply(lambda x: get_breast_quad(x))
df["Malignance Degree"] = df["Malignance Degree"].apply(lambda x: get_degree(x))
df["Label"] = df["Label"].apply(lambda x: get_label(x))

In [6]:
filtered_df = df.filter(["Label", "Age", "Tumor Size", "Malignance Degree"], axis=1)
filtered_df

Unnamed: 0,Label,Age,Tumor Size,Malignance Degree
0,1,Young,Medium,Degree 3
1,1,Aging,Medium,Degree 2
2,1,Aging,Medium,Degree 2
3,1,Old,Small,Degree 2
4,1,Aging,Small,Degree 2
5,1,Old,Small,Degree 2
6,1,Aging,Medium,Degree 2
7,1,Old,Medium,Degree 1
8,1,Aging,Large,Degree 2
9,1,Aging,Medium,Degree 2


# Train/test split

In [7]:
seed = 200

df_training_data = filtered_df.sample(frac=0.7, random_state=seed)
df_test_data = filtered_df.drop(df_training_data.index).sample(frac=1.0, random_state=seed)
training_data, test_data = df_training_data.to_numpy().tolist(), df_test_data.to_numpy().tolist()

X_train = [i[1:] for i in training_data]
Y_train = [i[0] for i in training_data]

X_test = [i[1:] for i in test_data]
Y_test = [i[0] for i in test_data]

for d in training_data:
    print(d)

[0, 'Aging', 'Medium', 'Degree 3']
[1, 'Aging', 'Large', 'Degree 3']
[1, 'Aging', 'Medium', 'Degree 2']
[1, 'Old', 'Large', 'Degree 2']
[1, 'Young', 'Medium', 'Degree 2']
[1, 'Old', 'Large', 'Degree 3']
[1, 'Aging', 'Medium', 'Degree 2']
[1, 'Young', 'Small', 'Degree 3']
[1, 'Aging', 'Small', 'Degree 2']
[1, 'Aging', 'Medium', 'Degree 2']
[0, 'Aging', 'Medium', 'Degree 3']
[1, 'Aging', 'Small', 'Degree 1']
[1, 'Young', 'Large', 'Degree 2']
[0, 'Old', 'Medium', 'Degree 2']
[0, 'Young', 'Medium', 'Degree 1']
[0, 'Aging', 'Medium', 'Degree 1']
[1, 'Aging', 'Small', 'Degree 1']
[1, 'Aging', 'Medium', 'Degree 3']
[1, 'Aging', 'Medium', 'Degree 1']
[0, 'Young', 'Medium', 'Degree 3']
[1, 'Young', 'Medium', 'Degree 2']
[1, 'Young', 'Medium', 'Degree 2']
[1, 'Old', 'Medium', 'Degree 2']
[1, 'Old', 'Medium', 'Degree 2']
[0, 'Aging', 'Medium', 'Degree 3']
[1, 'Old', 'Small', 'Degree 1']
[1, 'Old', 'Medium', 'Degree 1']
[1, 'Aging', 'Small', 'Degree 1']
[0, 'Aging', 'Medium', 'Degree 3']
[1, 'Agin

# Decision tree functions

### Entropy
* $pos$ = number of first label
* $neg$ = number of second label
* $p_{+} = \frac{pos}{pos + neg}$
* $p_{-} = \frac{neg}{pos + neg}$
* If either $pos$ or $neg$ is 0, return 0
* Else, return $H(S) = - (p_{+}\log_2 p_{+}) - (p_{-}\log_2 p_{-})$

### Get highest gain
1. `classes` tells how many classes there are in total for the node in question. If there is three classes, `classes` will be `[1, 2, 3]`
2. Use the `gain` function for each individual class as described below
3. Return the index of the minimum entropy value (this is the highest gain value)
    * This index will be used to further split the child nodes

### Gain
1. Loop through all values of the split dictionary and measure the entropy for each attribute in the class *(i.e. "Aging")*
2. Each subset in the main set consists of two values: its *entropy* and its *length*
3. Returns the gain of the class that was passed to it:
     * $\text{Gain}(S, A) = \sum_{v \in \text{values}(A)}^{} \frac{\lvert{S_v}\rvert H(S_v)}{\lvert{S}\rvert}$


### Split
* Splits the node into its class values
* `{Age: [Young, Aging, Old], Degree: [1, 2, 3], Tumor Size: [Small, Medium, Large]}`
* All unique attributes in the argument `node` will be stored in a set `all_attributes`
    * i.e. `{"Young", "Aging", "Old"}`
* Loops through each element in the node and adds them to their own list

### Build tree
This function is recursive, and returns when the node is either empty or pure.
1. Takes a node as a parameter
2. If the node is empty or pure:
    1. Find the most common label
    2. Add it to the classifier as a return value
    3. Return from the recursive function
3. Else:
    1. Find the class that has the highest gain
    2. Split on said class
    3. Add the necessary `if` sentence to the classifier
    4. Call `build_tree` again on each child of the class (will propagate down each branch)

In [8]:
def entropy(node):
    pos = len([i for i in node if i[0] == 0])
    neg = len([i for i in node if i[0] == 1])
    total = pos + neg
    if min(pos, neg) == 0:
        return 0
    p_pos, p_neg = (pos / total), (neg / total)
    entropy = - p_pos * math.log(p_pos, 2) - p_neg * math.log(p_neg, 2)
    return entropy


def get_highest_gain(node):
    before = entropy(node)
    # print("First node", node[0])
    classes = [i for i in range(1, len(node[0]))]
    # print("Classes", classes)
    entropies = [gain(node, c) for c in classes]
    # print("Entropies", entropies)
    return entropies.index(min(entropies)) + 1


def gain(node, attribute):
    main_set = [(entropy(i), len(i)) for i in split(node, attribute).values()]
    # print("Entropy, length of split values", main_set)
    n_all = sum([subset[1] for subset in main_set])
    gain = sum([(subset[0] * subset[1]) / n_all for subset in main_set])
    return gain


def split(node, attribute_index, remove=False):
    retvals = {}
    all_attributes = set([n[attribute_index] for n in node])
    # print("All attributes", all_attributes)
    for n in node:
        c = n[attribute_index]
        a_list = retvals.get(c, [])
        if remove:
            n.pop(node)
        a_list.append(n)
        retvals[c] = a_list
    return retvals

In [9]:
def is_pure(node):
    classes = [i for i in range(1, len(node[0]))]
    for c in classes:
        if len(set([i[c] for i in node])) > 1:
            return False
    return True


def is_empty(node):
    return len(node[0]) <= 1


def most_common(node):
    label_list = [i[0] for i in node]
    return max(set(label_list), key=label_list.count)


def confidence(node):
    most_common_value = most_common(node)
    return len([i[0] for i in node if i[0] == most_common_value]) / len(node)

In [14]:
actual_classifier = "def classify(data):"


def build_tree(node, spaces="    "):
    global actual_classifier
    if is_empty(node) or is_pure(node):
        # print(f"Empty: {is_empty(node)}, Pure: {is_pure(node)}")
        most_common_value = most_common(node)
        print(f"{spaces}then {most_common_value}")
        print(f"{spaces}# confidence {confidence(node):.2f}")
        actual_classifier += f"\n{spaces}return {most_common_value}" 
        return
    
    highest = get_highest_gain(node)
    d = split(node, highest)
    for key, value in d.items():
        print(f"{spaces}if {key}:")
        actual_classifier += f"\n{spaces}if data[{highest}] == \"{key}\":"
        build_tree(value, spaces + "  ")
              

In [15]:
build_tree(training_data)

    if Degree 3:
      if Medium:
        if Aging:
          then 0
          # confidence 0.57
        if Young:
          then 0
          # confidence 0.86
        if Old:
          then 0
          # confidence 0.50
      if Large:
        if Aging:
          then 1
          # confidence 0.67
        if Old:
          then 0
          # confidence 0.50
        if Young:
          then 1
          # confidence 1.00
      if Small:
        if Young:
          then 0
          # confidence 0.50
        if Aging:
          then 0
          # confidence 0.50
        if Old:
          then 0
          # confidence 0.50
    if Degree 2:
      if Medium:
        if Aging:
          then 1
          # confidence 0.69
        if Young:
          then 1
          # confidence 1.00
        if Old:
          then 1
          # confidence 0.67
      if Large:
        if Old:
          then 1
          # confidence 0.75
        if Young:
          then 1
          # confidence 1.00
        if A

In [12]:
print(actual_classifier)

def classify(data):
    if data[3] == "Degree 3":
      if data[2] == "Medium":
        if data[1] == "Aging":
          return 0
        if data[1] == "Young":
          return 0
        if data[1] == "Old":
          return 0
      if data[2] == "Large":
        if data[1] == "Aging":
          return 1
        if data[1] == "Old":
          return 0
        if data[1] == "Young":
          return 1
      if data[2] == "Small":
        if data[1] == "Young":
          return 0
        if data[1] == "Aging":
          return 0
        if data[1] == "Old":
          return 0
    if data[3] == "Degree 2":
      if data[2] == "Medium":
        if data[1] == "Aging":
          return 1
        if data[1] == "Young":
          return 1
        if data[1] == "Old":
          return 1
      if data[2] == "Large":
        if data[1] == "Old":
          return 1
        if data[1] == "Young":
          return 1
        if data[1] == "Aging":
          return 1
      if data[2] == "Small":
    

# Test classifier

In [13]:
exec(actual_classifier)
correct, wrong = 0, 0

for data in test_data:
    if data[0] == classify(data):
        correct += 1
    else:
        wrong += 1

    
accuracy = int((correct / (correct + wrong)) * 100)    
print(f"Correct classifications {correct}")
print(f"Wrong classifications {wrong}")
print(f"Accuracy {accuracy}%")

Correct classifications 58
Wrong classifications 25
Accuracy 69%
