In [52]:
import pandas as pd
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
from collections import namedtuple #Untuk dict ke objek

def get_first_word(words:str)->str:
    """Build an acronym from a space-separated name.

    Every token that parses as an integer is kept whole; every other token
    contributes only its first character.

    Args:
        words: Text to abbreviate. It is split on single spaces.

    Returns:
        str: The concatenated pieces, e.g. ``"Kota Magelang"`` -> ``"KM"``
        and ``"Area 51"`` -> ``"A51"``. An empty input yields ``""``.
    """
    pieces = []
    for token in words.split(' '):
        if not token:
            # Repeated, leading or trailing spaces produce empty tokens;
            # indexing them would raise IndexError, so skip them.
            continue
        try:
            int(token)
        except ValueError:
            # Not a number: only the initial is kept.
            pieces.append(token[0])
        else:
            pieces.append(token)
    return "".join(pieces)

def buat_dataset(filename,index_name,sep=','):
    """Read a delimited file and package it as a plain dictionary.

    Args:
        filename: File to load; forwarded to ``pd.read_csv``.
        index_name: Name of the column that holds the row labels. It is
            taken out of the table before the values are exported.
        sep: Field delimiter forwarded to ``pd.read_csv``.

    Returns:
        dict: With the keys
            ``'nama_full'`` - the row labels,
            ``'kolom'``     - names of the remaining columns,
            ``'akronim'``   - ``get_first_word`` applied to each label,
            ``'data'``      - the remaining values as a list of rows.
    """
    tabel = pd.read_csv(filename, sep=sep)

    # Pull the label column out, then work with a frame that lacks it.
    label = list(tabel[index_name])
    nilai = tabel.drop(columns=[index_name])

    akronim = []
    for nama in label:
        akronim.append(get_first_word(nama))

    return {
        'nama_full': label,
        'kolom': list(nilai.columns),
        'akronim': akronim,
        'data': nilai.to_numpy().tolist(),
    }



In [53]:
import time
def randomN(x:int,N:int,seed=None)->list:
    """Return N distinct random integers drawn from ``range(x)``.

    Args:
        x: Exclusive upper bound; every result lies in ``0 .. x-1``.
        N: Number of distinct values to draw.
        seed: Optional seed for a reproducible draw. ``None`` (the default)
            lets ``random.Random`` pick its own seed.

    Returns:
        list: N unique integers in random order; empty when ``N <= 0``.

    Raises:
        ValueError: If N exceeds x, because N distinct values cannot be
            drawn from only x candidates.
    """
    import random  # local import so the function does not rely on other cells

    if N <= 0:
        return []
    if N > x:
        raise ValueError(f"N ({N}) tidak boleh lebih besar dari x ({x})")
    # sample() guarantees uniqueness without polling the clock.
    return random.Random(seed).sample(range(x), N)


In [54]:
def hitung_jarak(point_start:list,point:list)->float:
    """Euclidean distance between two points of equal dimension.

    Args:
        point_start: Coordinates of the reference point.
        point: Coordinates of the point being measured.

    Returns:
        float: Square root of the summed squared coordinate differences
        (``0.0`` for two empty sequences).

    Raises:
        ValueError: If the two sequences differ in length; the message
            carries both lengths.
    """
    if len(point_start) != len(point):
        # Put the details in the exception so they travel with the traceback.
        raise ValueError(
            "Panjang Acuan dan poin tidak sesuai: "
            f"{len(point_start)} != {len(point)}"
        )
    return sum((a - b) ** 2 for a, b in zip(point_start, point)) ** 0.5

In [55]:
class Cluster:
    """Members of one cluster: their names and their values per column."""

    def __init__(self,cluster:int,data,nama):
        """Store the cluster contents.

        Args:
            cluster: Identifier of the cluster.
            data: Mapping of column key -> list of values of the members.
            nama: List with the member names.
        """
        self.id, self.data, self.nama = cluster, data, nama

    def _label(self):
        """Text shared by ``repr()`` and ``str()``."""
        return "Cluster {} ({}item)".format(self.id, len(self.nama))

    @property
    def df(self):
        """Members as a DataFrame; also prints the cluster summary.

        Columns are ``nama``, one ``axis<key>`` column per entry of
        ``self.data`` and a constant ``cluster`` column holding the id.
        """
        kolom = {"nama": self.nama}
        kolom.update(("axis" + str(kunci), nilai) for kunci, nilai in self.data.items())
        kolom["cluster"] = len(self.nama) * [self.id]
        print(self)
        return pd.DataFrame(kolom)

    def __repr__(self):
        return self._label()

    def __str__(self):
        return self._label()

In [56]:
def combination(lst, n):
    """Return every n-element combination of ``lst`` as a list of lists.

    Elements keep their order from ``lst`` and combinations are produced in
    lexicographic order of positions, e.g.
    ``combination([0, 1, 2], 2) -> [[0, 1], [0, 2], [1, 2]]``.

    Args:
        lst: Items to choose from.
        n: Size of each combination.

    Returns:
        list: List of lists. ``[[]]`` when ``n == 0``; ``[]`` when n is
        negative or larger than ``len(lst)``.
    """
    from itertools import combinations  # standard-library implementation

    if n < 0:
        # itertools raises ValueError for a negative size; callers of this
        # function get an empty result instead.
        return []
    return [list(pilihan) for pilihan in combinations(lst, n)]

In [57]:
class Iterasi:
    """Result of one K-Means iteration.

    Groups the dataset rows by their nearest centroid, derives the centroids
    for the following iteration and computes the WCV, BCV and BCV/WCV ratio.
    """
    _COLOR = "#f403a8"  # colour of the centroid markers in both plots

    def __init__(self,dataset,start_point,hasil_hitung:dict,nama_full=True):
        """Evaluate one iteration.

        Args:
            dataset: Mapping with the keys 'nama_full', 'akronim', 'kolom'
                and 'data' (a list of rows).
            start_point: Centroid coordinates used in this iteration, one
                sequence per cluster.
            hasil_hitung: Mapping with one entry per cluster plus the keys
                'terdekat' (1-based id of the nearest cluster for each row)
                and 'WCV' (one value per row; they are summed here).
            nama_full: Label rows with 'nama_full' when True, otherwise with
                'akronim'.
        """
        self.dataset = dataset
        self.start_point = start_point
        self.data = hasil_hitung
        self.nama = self.dataset["nama_full"] if nama_full else self.dataset["akronim"]
        # List of Cluster objects, index = cluster id - 1.
        self.cluster = self._grouping()
        # Centroids proposed for the next iteration.
        self.next_point = self._cari_next_point()
        self.wcv = sum(self.data['WCV'])
        self.bcv = self._hitungbcv()
        # A WCV of 0 means every row sits exactly on its centroid; report an
        # infinite ratio instead of raising ZeroDivisionError.
        self.ratio = self.bcv/self.wcv if self.wcv != 0 else float("inf")

    def _hitungbcv(self):
        """Sum of the distances between every pair of centroids."""
        ds = self.start_point
        pos = combination(list(range(len(ds))), 2)  # index pairs: [0,1], [0,2], [1,2], ...
        return sum(hitung_jarak(ds[a], ds[b]) for a, b in pos)

    def _grouping(self)->list:
        """Distribute the rows over Cluster objects according to 'terdekat'.

        Returns:
            list: One Cluster per cluster id, in id order.
        """
        data = self.dataset['data']
        # Every key of self.data is a per-cluster entry except 'terdekat' and 'WCV'.
        banyak_cluster = len(self.data)-2
        banyak_kolom = range(len(data[0]))

        hasil = [
            Cluster(nomor, {k: [] for k in banyak_kolom}, [])
            for nomor in range(1, banyak_cluster+1)
        ]

        for i,id_cluster in enumerate(self.data['terdekat']):
            cluster = hasil[id_cluster-1]  # ids are 1-based
            cluster.nama.append(self.nama[i])
            for k in banyak_kolom:
                cluster.data[k].append(data[i][k])

        return hasil

    @property
    def df(self):
        """The per-row results as a DataFrame sorted by assigned cluster."""
        return pd.DataFrame(self.data).sort_values(by=["terdekat"])

    def _cari_next_point(self):
        """Mean of every cluster, column by column.

        Returns:
            list: One coordinate list per cluster. A cluster without members
            has no mean and gets 0 for every column.
        """
        next_point = []
        for cluster in self.cluster:
            N = len(cluster.nama)
            if N == 0:
                coordinate = [0 for _ in cluster.data]
            else:
                coordinate = [sum(axis)/N for axis in cluster.data.values()]
            next_point.append(coordinate)
        return next_point

    def to_pict(self,kolom=[0,1]):
        """Show a 2D scatter plot of the clusters and the centroids.

        Args:
            kolom: Two column indices used as x and y axis. The default list
                is only read, never modified.
        """
        clm = [self.dataset['kolom'][x] for x in kolom]

        col = 'rgbcmyk'
        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        for i, cluster in enumerate(self.cluster):
            color = col[i%len(col)]  # cycle through the palette
            d = cluster.data
            ax1.scatter(d[kolom[0]], d[kolom[1]], s=10,
            c=color, label="Cluster "+str(cluster.id))

        sp_x = [titik[kolom[0]] for titik in self.start_point]
        sp_y = [titik[kolom[1]] for titik in self.start_point]

        ax1.scatter(sp_x, sp_y, s=15,
        c=Iterasi._COLOR, label="Centroid")

        plt.xlabel(clm[0])
        plt.ylabel(clm[1])
        plt.title("Cluster 2D")
        plt.legend(loc='upper left')
        plt.show()

    def to_3d(self,kolom=[0,1,2]):
        """Show a 3D scatter plot of the clusters and the centroids.

        Args:
            kolom: Three column indices used as x, y and z axis. The default
                list is only read, never modified.

        Returns:
            ``""`` when a requested column index does not exist (a notice is
            printed and nothing is drawn); otherwise ``None``.
        """
        try:
            clm = [self.dataset['kolom'][x] for x in kolom]
        except IndexError:
            print("JUMLAH KOLOM KURANG")
            print("PLOT 3D GAGAL")
            return ""

        col = 'rgbcmyk'
        plt.figure(figsize = (16, 9))
        ax1 = plt.axes(projection ="3d")
        for i, cluster in enumerate(self.cluster):
            color = col[i%len(col)]  # cycle through the palette

            d = cluster.data

            ax1.scatter(d[kolom[0]], d[kolom[1]],d[kolom[2]],
            color=color, label="Cluster "+str(cluster.id))

        sp_x = [titik[kolom[0]] for titik in self.start_point]
        sp_y = [titik[kolom[1]] for titik in self.start_point]
        sp_z = [titik[kolom[2]] for titik in self.start_point]

        ax1.scatter(sp_x, sp_y, sp_z, s=15,
        c=Iterasi._COLOR, label="Centroid")

        plt.title("Cluster 3D")
        ax1.set_xlabel(clm[0])
        ax1.set_ylabel(clm[1])
        ax1.set_zlabel(clm[2])

        plt.legend(loc='upper left')
        plt.show()

    def __repr__(self):
        return str(self.cluster)

In [58]:
def proses_jarak(dataset,banyak_cluster:int,start_point=None):
    """Assign every row to its nearest centroid and wrap the result in an Iterasi.

    Args:
        dataset: Dataset mapping; its 'data' entry (list of rows) is read
            here and the whole mapping is handed to Iterasi.
        banyak_cluster: Number of clusters. When ``start_point`` is given its
            length is used instead, so the result columns always match the
            centroids.
        start_point: Optional centroid coordinates, e.g. ``[[v1], [v2], [v3]]``.
            When None, ``banyak_cluster`` distinct rows chosen through
            ``randomN`` serve as centroids.

    Returns:
        Iterasi: Built from the dataset, the centroids used and a mapping
        with the keys 'Cluster1'..'ClusterK' (distance of each row to that
        centroid), 'terdekat' (1-based id of the nearest centroid) and 'WCV'
        (squared distance to the nearest centroid).

    Raises:
        ValueError: If there are no centroids, or if random centroids are
            requested for more clusters than there are rows.
    """
    data = dataset['data']
    if start_point is not None:
        list_start = start_point
        banyak_cluster = len(list_start)
    else:
        if banyak_cluster > len(data):
            # Distinct rows cannot be drawn for more clusters than rows.
            raise ValueError(
                f"Banyak cluster ({banyak_cluster}) melebihi banyak data ({len(data)})"
            )
        list_start = [data[x] for x in randomN(len(data),banyak_cluster)]

    if len(list_start) == 0:
        raise ValueError("Minimal harus ada satu cluster")

    # Key order matters: it becomes the column order of Iterasi.df.
    hasil = {'Cluster'+str(x): [] for x in range(1,banyak_cluster+1)}
    hasil['terdekat'] = []
    hasil['WCV'] = []

    for point in data:
        shortest = None
        terdekat = None
        for i,acuan in enumerate(list_start):
            jarak = hitung_jarak(acuan,point)
            if shortest is None or shortest>jarak:
                shortest = jarak
                terdekat = i+1
            hasil['Cluster'+str(i+1)].append(jarak)

        hasil['terdekat'].append(terdekat)
        hasil['WCV'].append(shortest**2)

    return Iterasi(dataset,list_start,hasil)


In [59]:
def proses(dataset,banyak_cluster,parent=None,start_point=None):
    """Run a single K-Means iteration, optionally continuing a previous one.

    Args:
        dataset: Dataset mapping forwarded to ``proses_jarak``.
        banyak_cluster: Number of clusters forwarded to ``proses_jarak``.
        parent: Previous iteration. When truthy, its ``next_point`` supplies
            the centroids and ``start_point`` is ignored.
        start_point: Centroids to use when there is no parent; a falsy value
            is forwarded as None.

    Returns:
        The iteration object returned by ``proses_jarak``. A notice with both
        ratios is printed when its ratio does not exceed the parent's.
    """
    if parent:
        titik_awal = parent.next_point
    else:
        titik_awal = start_point or None

    hasil_iterasi = proses_jarak(dataset, banyak_cluster, titik_awal)

    tidak_membaik = parent != None and hasil_iterasi.ratio <= parent.ratio
    if tidak_membaik:
        print("Rasio BCV/WCV Iterasi ini Lebih kecil atau sama dengan Rasio Sebelumnya ")
        print("Rasio BCV/WCV Iterasi ini :", hasil_iterasi.ratio)
        print("Rasio BCV/WCV Sebelumnya :", parent.ratio)

    return hasil_iterasi

In [60]:
def SemuaProses(dataset,banyak_cluster,start_point=None):
    """Repeat K-Means iterations until the BCV/WCV ratio stops improving.

    Args:
        dataset: Dataset mapping forwarded to ``proses``.
        banyak_cluster: Number of clusters; replaced by ``len(start_point)``
            when ``start_point`` is truthy.
        start_point: Optional initial centroids for the first iteration.

    Returns:
        namedtuple 'KMeans' with the fields ``iter_1`` .. ``iter_<n>``, one per
        iteration in execution order. The last field is the iteration whose
        ratio failed to improve on its predecessor.
    """
    if start_point:
        banyak_cluster = len(start_point)
    dump = {}
    parent = None
    i = 1
    while True:
        pr = proses(dataset,banyak_cluster,parent,start_point)
        dump["iter_"+str(i)] = pr

        # "not >" rather than "<=": a NaN ratio compares False either way,
        # and must end the loop instead of running forever.
        if parent is not None and not (pr.ratio > parent.ratio):
            print(f"Terdapat {i} Iterasi")
            return namedtuple('KMeans',dump.keys())(*dump.values())
        parent = pr
        i +=1



In [61]:
# Number of clusters requested from the K-Means run below.
banyak_cluster = 3
# Load the CSV; the 'Kabupaten' column supplies the row labels.
Dataset = buat_dataset('kmeans_panen.csv','Kabupaten',',')

In [66]:
# Iterate until the BCV/WCV ratio stops improving; each iteration is exposed
# as KMeans.iter_<n>. No start_point is passed, so the initial centroids are random.
KMeans = SemuaProses(Dataset,banyak_cluster)

Rasio BCV/WCV Iterasi ini Lebih kecil atau sama dengan Rasio Sebelumnya 
Rasio BCV/WCV Iterasi ini : 0.06162129878972426
Rasio BCV/WCV Sebelumnya : 0.06162129878972426
Terdapat 5 Iterasi


In [63]:
# Per-row distance table of the first iteration, sorted by assigned cluster.
KMeans.iter_1.df

Unnamed: 0,Cluster1,Cluster2,Cluster3,terdekat,WCV
34,3.937331,6.60177,6.189108,1,15.502572
16,0.76993,2.686373,2.220476,1,0.592792
22,3.398541,3.539507,3.832445,1,11.550084
25,1.061656,2.865162,2.650037,1,1.127113
26,0.0,2.913818,2.355944,1,0.0
11,1.711801,1.940154,1.714724,1,2.930262
33,3.781137,6.045282,5.625613,1,14.297
8,2.147441,3.054522,3.279538,1,4.611503
29,3.795417,6.446043,6.056049,1,14.405188
6,2.424677,2.541961,2.542526,1,5.879056


In [65]:
# Members of the first cluster in the third iteration (the property also prints the cluster summary).
KMeans.iter_3.cluster[0].df

Cluster 1 (13item)


Unnamed: 0,nama,axis0,axis1,axis2,axis3,axis4,axis5,axis6,cluster
0,Banjarnegara,0.25439,-0.03113,-0.57567,0.1299,-1.48741,0.45335,-1.38647,1
1,Magelang,-1.63861,-0.00254,-1.55115,0.93173,-1.48741,-0.07023,1.22662,1
2,Boyolali,-0.41586,-0.44125,0.24454,-0.0348,-1.48741,0.02301,-0.56775,1
3,Rembang,0.16086,-0.35964,-0.344,-0.67554,0.6165,-0.80966,-0.03958,1
4,Temanggung,-1.3769,-0.38905,1.26636,0.74168,-1.48741,0.29859,0.66557,1
5,Pekalongan,-0.68629,-0.82817,-0.22003,0.20049,0.12786,-0.69715,-0.48145,1
6,Pemalang,-0.02524,-0.6517,-0.64078,-0.40042,0.09467,-0.90593,-0.20546,1
7,Kota Magelang,-1.90661,-2.15423,-1.55115,-2.32262,-1.48741,-1.33796,-1.38647,1
8,Kota Surakarta,-0.11975,-2.15423,-1.55115,-0.33164,-1.48741,-0.94571,-1.38647,1
9,Kota Salatiga,-1.39241,-0.8435,0.69159,-2.32262,-1.48741,1.32413,-0.24667,1
