DecisionTree.py
import math

def find(item, lst):
    # return True if the predicate `item` holds for any element of `lst`
    for i in lst:
        if item(i):
            return True
    return False
# find the most common value of the target attribute
def majority(attributes, data, target):
    valFreq = {}
    index = attributes.index(target)
    # count how often each value occurs
    for record in data:
        if record[index] in valFreq:
            valFreq[record[index]] += 1
        else:
            valFreq[record[index]] = 1
    maxCount = 0
    major = ""
    for key in valFreq.keys():
        if valFreq[key] > maxCount:
            maxCount = valFreq[key]
            major = key
    return major
# entropy calculation
def entropy(attributes, data, targetAttr):
    valFreq = {}
    dataEntropy = 0.0
    # locate the column of the target attribute
    i = 0
    for entry in attributes:
        if targetAttr == entry:
            break
        i += 1
    # count the frequency of each value of that attribute
    for entry in data:
        if entry[i] in valFreq:
            valFreq[entry[i]] += 1.0
        else:
            valFreq[entry[i]] = 1.0
    # compute the entropy according to the formula
    for freq in valFreq.values():
        dataEntropy += (-freq / len(data)) * math.log(freq / len(data), 2)
    return dataEntropy
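# For reference: this is the standard Shannon entropy of the value distribution,
# Entropy(S) = -sum_v p(v) * log2(p(v)); e.g. a 3-vs-2 split over 5 records gives
# -(0.6*log2(0.6) + 0.4*log2(0.4)) ≈ 0.971 bits.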
def gain(attributes, data, attr, targetAttr):
    # information gain of splitting `data` on `attr` with respect to `targetAttr`
    valFreq = {}
    subsetEntropy = 0.0
    i = attributes.index(attr)
    # count the frequency of each value of `attr`
    for entry in data:
        if entry[i] in valFreq:
            valFreq[entry[i]] += 1.0
        else:
            valFreq[entry[i]] = 1.0
    # weighted entropy of the subsets induced by each value of `attr`
    for val in valFreq.keys():
        valProb = valFreq[val] / sum(valFreq.values())
        dataSubset = [entry for entry in data if entry[i] == val]
        subsetEntropy += valProb * entropy(attributes, dataSubset, targetAttr)
    return entropy(attributes, data, targetAttr) - subsetEntropy
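# For reference: Gain(S, A) = Entropy(S) - sum_v |S_v|/|S| * Entropy(S_v),
# where S_v is the subset of S in which attribute A takes the value v (ID3).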
# choose the attribute with the highest information gain
def chooseAttr(data, attributes, target):
    best = attributes[0]
    maxGain = 0
    for attr in attributes:
        # the target itself must not be considered as a split candidate
        if attr == target:
            continue
        newGain = gain(attributes, data, attr, target)
        if newGain > maxGain:
            maxGain = newGain
            best = attr
    return best
# get the values appearing in the "column" of the given attribute
def getValues(data, attributes, attr):
    index = attributes.index(attr)
    values = []
    for entry in data:
        if entry[index] not in values:
            values.append(entry[index])
    return values
def getExamples(data, attributes, best, val):
    # collect the records whose `best` attribute equals `val`, with that column removed
    examples = []
    index = attributes.index(best)
    for entry in data:
        # keep only entries with the matching value
        if entry[index] == val:
            newEntry = []
            # copy the entry, skipping the column of the best attribute
            for i in range(0, len(entry)):
                if i != index:
                    newEntry.append(entry[i])
            examples.append(newEntry)
    return examples
def makeTree(data, attributes, target, recursion):
    recursion += 1
    data = data[:]
    vals = [record[attributes.index(target)] for record in data]
    default = majority(attributes, data, target)
    # Check whether there is any data left and whether the attribute list is non-empty.
    # If either condition fails, return the default (majority) value.
    if not data or (len(attributes) - 1) <= 0:
        return default
    # If all records share the same target value, return that value.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # choose the best attribute to split on
        best = chooseAttr(data, attributes, target)
        # create the tree node for that attribute; its branches are filled in below
        tree = {best: {}}
        # create a subtree for each value of the best attribute
        for val in getValues(data, attributes, best):
            examples = getExamples(data, attributes, best, val)
            newAttr = attributes[:]
            newAttr.remove(best)
            subtree = makeTree(examples, newAttr, target, recursion)
            # attach the new subtree to the node we just created
            tree[best][val] = subtree
        return tree
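
# A minimal usage sketch (not part of the original script): the attribute names,
# the toy records, and the "play" target below are made-up illustration data,
# roughly in the spirit of the classic play-tennis example.
if __name__ == "__main__":
    attributes = ["outlook", "humidity", "play"]
    data = [
        ["sunny", "high", "no"],
        ["sunny", "normal", "yes"],
        ["overcast", "high", "yes"],
        ["rainy", "high", "no"],
        ["rainy", "normal", "yes"],
    ]
    # entropy of the target column and the gain of each candidate split
    print(entropy(attributes, data, "play"))
    print(gain(attributes, data, "outlook", "play"))
    print(gain(attributes, data, "humidity", "play"))
    # build the full decision tree as nested dictionaries
    print(makeTree(data, attributes, "play", 0))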