## Penerapan Algoritma C45 Menggunakan Python

Oleh Kelompok 5


Diberikan data dengan 4 Kategori yaitu:
1. Berat Badan
2. Jenis Kelamin
3. Tekanan Darah
4. Kadar Gula

dengan variabel tujuan atau target yaitu:
- Tidak Turunan
- Turunan

In [1]:
import pandas as pd
import copy
from anytree import Node as Noda
from anytree import RenderTree
from anytree.exporter import DotExporter


In [2]:
# Penentuan Fungsi" Bantuan:

class Hitung:
    @staticmethod
    def ln(x):
        n = 1000.0
        return n * ((x ** (1/n)) - 1)
    
    @staticmethod
    def log2(x):
        return Hitung.ln(x)/Hitung.ln(2)

class OlahData:
    @staticmethod
    def getUniqueSet(dataset,tujuan:str=None,done:list=[]):
        unique = {}
        
        for column in dataset['columns']:
            unique[column] = set({})
        
        for data in dataset['data']:
            for col,dt in zip(dataset['columns'],data):
                unique[col].add(dt)
        
        for key,val in unique.items():
            unique[key] = sorted(list(val))
        
        # Tujuan itu targetnya
        if tujuan:
            del unique[tujuan]
        else:
            unique.popitem()
        
        for done_col in done:
            del unique[done_col]
        
        return unique  




In [3]:

def getEntropy(daftar_target:list,jumlah:int) -> float:
    """Mencari Nilai Entropi Suatu Daftar Jumlah Target Suatu Kategori

    Args:
        daftar_target (list): Daftar Banyaknya kemunculan suatu kategori pada kriteria

    Returns:
        float: Besar Suatu Entropi
    """
    entropi =  0
    
    for target_qty in daftar_target:
        if target_qty ==0:
            return 0
        fraction = target_qty/jumlah
        entropi += -1*(Hitung.log2(fraction))*fraction
    
    return entropi


def getGain(daftar_entropi:list,entropi_utama:float,banyak_data_utama:int) ->float:
    """Mencari banyaknya gain

    Args:
        daftar_entropi (list): [banyak data, entropi]
        nilai_entropi_utama (float): nilai entropi
        banyak_data_utama (int): _description_

    Returns:
        float: _description_
    """
    
    gain = 0
    
    for qty, entropi in daftar_entropi:
        gain += (qty/banyak_data_utama)*entropi
    
    return entropi_utama - gain
    

# Pencarian Node Pertama

def akumulasi(BIGDATASET:dict) -> dict:
    """Menghitung banyak kemunculan suatu kriteria atau kategori terhadap tujuan

    Args:
        startpoint : {'nama': 'total', 'total': [15, 5, 20, 0.8106059411573272]},
        dataset (list): Suatu set yang berisi tiap baris dari data

    Returns:
        dict: {
            "meta":{"tujuan":[...ket, entropi]},
            "data":{
                "kriteria1":{
                    kategori1:[],
                    kategori2:[],
                    "gain":
                }
            }
        }
    """
    dataset = BIGDATASET['data']
    hasil = {
        "meta":{},
        "start_point":{
            'nama':"total",
            "total":[]},
        "data":{}
    }

    
    tmp = [ datum[-1] for datum in dataset]
    hasil['meta']['tujuan'] = list(set(tmp)) + ['total','entropi']
    tujuan_list = hasil['meta']['tujuan'][:-2]
    data = hasil['data']
    index_tujuan = -1
        
        
    
    
    # Menghitung total
    
    
    start_point  = BIGDATASET['start_point']
    if start_point:
        hasil['start_point']['nama'] = '\n'.join(start_point)
        
    # Menghitung y/n saja
    for tujuan_total in hasil['meta']['tujuan'][:-2]:
        hasil['start_point']['total'].append(tmp.count(tujuan_total))
        
    total = sum(hasil['start_point']['total'])
    entropi = getEntropy(hasil['start_point']['total'],total)
    
    hasil['start_point']['total'].extend([total,entropi])
    
    
    # Melakukan Iterasi terhadap Dataset
    for nama_kriteria,kriteria in BIGDATASET['unique'].items():

        index = BIGDATASET['columns'].index(nama_kriteria)
        
        data[nama_kriteria] = {} 
        
        l_entropi_kriteria = []
        
        for kategori in kriteria:
            
            data[nama_kriteria][kategori] = []
            
            total = 0
            for idx_pencatatan, tujuan in enumerate(tujuan_list):
                collection = [kategori,tujuan]
                data[nama_kriteria][kategori].append(0)
                
                
                for datum in BIGDATASET['data']:
                    if collection == [datum[index],datum[index_tujuan]]:
                        data[nama_kriteria][kategori][idx_pencatatan] +=1
                
                total += data[nama_kriteria][kategori][idx_pencatatan]
            
            # Menambahkan total dan entropi di belakang
            daftar_target = data[nama_kriteria][kategori]
            
            
            entropi = getEntropy(daftar_target, total)
            l_entropi_kriteria.append([total,entropi])
            

            
            data[nama_kriteria][kategori].extend([total,entropi])
            
        index +=1
        
        ENTROPI_UTAMA = hasil['start_point']['total']
        data[nama_kriteria]['gain'] = getGain(l_entropi_kriteria, ENTROPI_UTAMA[-1],ENTROPI_UTAMA[-2])


    return hasil
        
         
    

In [4]:
def find_next(hasil_akumulasi:dict)->dict:
    data = hasil_akumulasi['data']
    
    if len(data) == 1:
        hasil_akumulasi['next_head'] = None
        return hasil_akumulasi
    
    big_gain = 0
    curr_gain = ""
    
    
    for kriteria, values in data.items():
        for kategori, counts in values.items():
            if kategori=='gain':
                if counts > big_gain:
                    curr_gain = kriteria
                    big_gain = counts
    
    big_entropy = 0 
    curr_entropy = ""
    elim = []
    for kategori, count in data[curr_gain].items():
        if kategori == 'gain':
            continue
        
        if count[-1] > big_entropy:
            curr_entropy = kategori
            big_entropy = count[-1]
        
        elif count[-1] == 0.0:
            elim.append(kategori)
        
    

    hasil_akumulasi = hasil_akumulasi.copy()
    hasil_akumulasi['next_head'] = {
        'index':[curr_gain,curr_entropy],
        'eliminate':elim
    }
    
    return hasil_akumulasi
                    

In [5]:
def normalisasi_dict(hasil_akumulasi:dict)-> dict:
    """Mengubah menjadi bentuk dictionary dataframe

    Args:
        hasil_akumulasi (dict): {
            "meta":...,
            "utama:...,
            "data":{
                "kat":
            }
        }

    Returns:
        dict: {
            "row":,
            "data":[]
        }
    """
    hasil = {'column':["idx",],
             "data":[]
             }
    
    for key in hasil_akumulasi['meta']['tujuan']:
        hasil['column'].append(key)
    
    hasil['column'].append("gain")
    
    # START POINT
    start = hasil_akumulasi['start_point']
    baris0 = [start['nama']] + start['total']+['']
    hasil['data'].append(baris0)
    
    
    for key,dic_value in hasil_akumulasi['data'].items():
        baris = [key] + len(list(dic_value.values())[0])*[""] + [dic_value['gain']]
        hasil['data'].append(baris)
        
        for category,value in dic_value.items():
            if category == 'gain':
                continue
            baris = [category] + value + ['']
            hasil['data'].append(baris)
            

    
    return hasil     


In [6]:
def eliminasi(hasil_akumulasi:dict,dataset:dict)-> dict:
    """Mereduksi dataset yang ada sesuai dengan nilai gain yang diperoleh

    Args:
        hasil_akumulasi (dict): _description_
        dataset (list): _description_

    Returns:
        list: _description_
    """
    dataset = copy.deepcopy(dataset)
    
    id = hasil_akumulasi['next_head']['index']
    dataset['start_point'] = id
    
    
    
    col = hasil_akumulasi['next_head']['index'][0]
    idx_col_eliminasi  = dataset['columns'].index(col)
    elim = hasil_akumulasi['next_head']['index'][1]
    
    data_copy = []

    for i,v in enumerate(dataset['data']):
        if v[idx_col_eliminasi] == elim:
            
            data_copy.append(v)
    
    dataset['data'] = data_copy
    dataset['unique'] = OlahData.getUniqueSet(dataset)
    
    if len(dataset['unique'][id[0]])<=1:
        dataset['unique'] = OlahData.getUniqueSet(dataset,done=[id[0]])
    
    return dataset

In [7]:
# PENERAPAN ALGORITMA C45
class Node:
    
    def __init__(self,nama,values:str,step,parent=None,isHead=False):
        self.nama = nama
        self.value = values
        self.tahapan = step
        self.parent = parent
        self.isHead = isHead
    
    
    def __str__(self):
        if self.isHead:
            return f"{self.nama}"
        else:
            return f"{self.nama} \n{self.value}"
    def __repr__(self):
        return f"{self.nama} : {self.value}"

class Step:
    
    def __init__(self,dataset_akumulasi,parent=None):
        
        self.dataset = dataset_akumulasi
        self.head = self._cariHead(self.dataset,parent)
        self.parent = parent

        tmp = normalisasi_dict(dataset_akumulasi)
        self.df = pd.DataFrame(tmp['data'],columns=tmp['column']).set_index('idx')
        self.node_list = self._addNode(dataset_akumulasi,self.head)

    
    def _addNode(self, dataset_lama,parent):
        daftar_node = []
        tujuan_list = dataset_lama['meta']['tujuan']
        idx = dataset_lama['next_head']['index']
        eliminasi = dataset_lama['next_head']['eliminate']

        for key,datum in dataset_lama['data'][idx[0]].items():
            value = ""
            if key in eliminasi:
                for i,d in enumerate(datum[:-2]):
                    if d != 0:
                        value += f"{tujuan_list[i]} ({d})"
            
                node = Node(key,value,self,parent)
                daftar_node.append(node)
        return daftar_node
    def _cariHead(self,dataset,parent): 
        idx = dataset['next_head']['index']
        gain = dataset['data'][idx[0]]['gain']
        val = "{} - g({:0.4f})".format(idx[1],gain)
        return Node(idx[0],val,self,parent,True)

    def __str__(self):
        return f"{self.head}"
    
    def __repr__(self):
        return f"Head : {self.head}"

In [8]:
class C45:
    """C45 Tugasnya nanti :
        a. Kerjakan C45 //init
            1. Koleksi Step
        
        b. Koleksi Node yang ada
        c. Buat TreeGraph
        e. Buat Tree
            1. Return Headnya
    """
    def __init__(self,DATASET:dict):
        self.dataset = DATASET
        self.koleksi_step = self._koleksiStep()


    def _koleksiStep(self):
        step_obj = []
        parent = None
        dataset=self.dataset
        while True:
            c45_table = akumulasi(dataset)
            try:
                c45_table = find_next(c45_table)
            except KeyError:
                return step_obj
                
            if len(step_obj) ==0:
                step = Step(c45_table,None)
                parent = step.head
            else:
                step = Step(c45_table,parent)
            step_obj.append(step)
            
            dataset = eliminasi(c45_table,dataset)            

                

    def buatTree(self,nama_file,output:str='png'):
        
        nodes = []

        parent_head = None
        parent_step = None
        for step in self.koleksi_step:
            if parent_step:
                tulisan =  f"{str(parent_step.head.value)}\n{str(step.head)}"
                parent_node = Noda(tulisan,parent=parent_head)
            else:
                parent_node = Noda(str(step.head),parent=parent_head)
            nodes.append(parent_node)
            for node in step.node_list:
                noda = Noda(str(node),parent=parent_node)
                nodes.append(noda)

            parent_head = parent_node
            parent_step = step
        
        if output=='png':
            DotExporter(nodes[0]).to_picture(nama_file+".png")
        elif output =='dot':
            DotExporter(nodes[0]).to_dotfile(nama_file+".dot")
    

## Memperoleh hasil c45

In [9]:
df = pd.read_csv('Riwayat_Diabetes.csv',sep=';')

BIGDATASET = {
    'columns':list(df.columns),
    "data":df.to_numpy().tolist(),
    "start_point":None,
    
}
# Pengolahan Data Untuk Mencari Set Kategori yang unik
BIGDATASET['unique'] = OlahData.getUniqueSet(BIGDATASET)

In [10]:
#MENGAMBIL DATASET MENJADI C45
Tugas = C45(BIGDATASET)


In [11]:
Tugas.koleksi_step[0].dataset

{'meta': {'tujuan': ['Turunan', 'Tidak Turunan', 'total', 'entropi']},
 'start_point': {'nama': 'total', 'total': [15, 5, 20, 0.8106059411573272]},
 'data': {'Berat Badan': {'Average': [6, 0, 6, 0],
   'Over Weight': [5, 2, 7, 0.8624399537332654],
   'Under Weight': [4, 3, 7, 0.9845358993822122],
   'gain': 0.16416439256691007},
  'Jenis Kelamin': {'LakiLaki': [8, 3, 11, 0.8446729652841833],
   'Perempuan': [7, 2, 9, 0.7635419341577332],
   'gain': 0.002441939880046373},
  'Tekanan Darah': {'Normal': [4, 2, 6, 0.9176085880841003],
   'Rendah': [6, 0, 6, 0],
   'Tinggi': [5, 3, 8, 0.953743659099993],
   'gain': 0.15382590109209993},
  'Kadar Gula': {'Normal': [7, 0, 7, 0],
   'Rendah': [4, 0, 4, 0],
   'Tinggi': [4, 5, 9, 0.9903835395904672],
   'gain': 0.36493334834161695}},
 'next_head': {'index': ['Kadar Gula', 'Tinggi'],
  'eliminate': ['Normal', 'Rendah']}}

In [13]:
# KE JPG
# Tugas.buatTree('hasil') 
Tugas.buatTree('hasil',output='dot')
# https://dreampuf.github.io/GraphvizOnline


In [14]:
steps = Tugas.koleksi_step
print("Banyaknya tingkatan ada : ",len(steps))
print("Yaitu :")
for i,step in enumerate(steps):
    print(i,". ",step)


Banyaknya tingkatan ada :  3
Yaitu :
0 .  Kadar Gula
1 .  Berat Badan
2 .  Tekanan Darah


In [16]:
print("Dataset Awal")
# Melihat Setiap Stepnya
steps[0].df

Dataset Awal


Unnamed: 0_level_0,Turunan,Tidak Turunan,total,entropi,gain
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
total,15.0,5.0,20.0,0.810606,
Berat Badan,,,,,0.164164
Average,6.0,0.0,6.0,0.0,
Over Weight,5.0,2.0,7.0,0.86244,
Under Weight,4.0,3.0,7.0,0.984536,
Jenis Kelamin,,,,,0.002442
LakiLaki,8.0,3.0,11.0,0.844673,
Perempuan,7.0,2.0,9.0,0.763542,
Tekanan Darah,,,,,0.153826
Normal,4.0,2.0,6.0,0.917609,


In [17]:
print("Tingkatan 1: Kadar Gula Tinggi")
steps[1].df

Tingkatan 1: Kadar Gula Tinggi


Unnamed: 0_level_0,Turunan,Tidak Turunan,total,entropi,gain
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kadar Gula\nTinggi,4.0,5.0,9.0,0.990384,
Berat Badan,,,,,0.546247
Average,2.0,0.0,2.0,0.0,
Over Weight,2.0,2.0,4.0,0.999307,
Under Weight,0.0,3.0,3.0,0.0,
Jenis Kelamin,,,,,0.007214
LakiLaki,2.0,3.0,5.0,0.970259,
Perempuan,2.0,2.0,4.0,0.999307,
Tekanan Darah,,,,,0.324245
Normal,1.0,2.0,3.0,0.917609,


In [18]:
print("Tingkatan 2: Berat badan Overweight")
steps[2].df

Tingkatan 2: Berat badan Overweight


Unnamed: 0_level_0,Turunan,Tidak Turunan,total,entropi,gain
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Berat Badan\nOver Weight,2.0,2.0,4.0,0.999307,
Jenis Kelamin,,,,,0.0
LakiLaki,1.0,1.0,2.0,0.999307,
Perempuan,1.0,1.0,2.0,0.999307,
Tekanan Darah,,,,,0.999307
Rendah,2.0,0.0,2.0,0.0,
Tinggi,0.0,2.0,2.0,0.0,
Kadar Gula,,,,,0.0
Tinggi,2.0,2.0,4.0,0.999307,


In [19]:
# Setelah ini tidak ada tingkatan lagi karena cuma ada 3 steps
print(steps)

[Head : Kadar Gula, Head : Berat Badan, Head : Tekanan Darah]
