# Implementasi Algoritma LVQ Pada Prediksi Diagnosis Kanker

## Pembersihan Data (Penanganan Missing Values)

In [58]:
import pandas as pd
# Plain read of the CSV with pandas' default parsing. The name `df` is left
# in place for any later cell that uses it.
df = pd.read_csv('breast-cancer-wisconsin.csv')

# Spellings that should be parsed as missing values
missing_value_format = ['N.A', 'na', 'n.a.', 'n/a', '?', '-']

# Second read of the same file, this time mapping those spellings to NaN
df2 = pd.read_csv('breast-cancer-wisconsin.csv', na_values=missing_value_format)
df2.isnull().sum()  # per-column NaN count; not rendered, since it is not the cell's last expression

# Keep only the rows that have a value in 'Bare Nuclei', then renumber the
# index from 0 because the removed rows leave gaps in it
df2 = df2.dropna(subset=['Bare Nuclei'], axis=0).reset_index(drop=True)

df2

Unnamed: 0,Code Number,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
678,776715,3,1,1,1,3,2.0,1,1,1,2
679,841769,2,1,1,1,2,1.0,1,1,1,2
680,888820,5,10,10,3,7,3.0,8,10,2,4
681,897471,4,8,6,4,3,4.0,10,6,1,4


# Algoritma LVQ

In [59]:
import pandas as pd
import numpy as np

# 'Code Number' is left out of the frame handed to the LVQ functions below
sample_dataset = df2.drop(columns=['Code Number'])

# Initialise the LVQ prototype (codebook) vectors
def initiate_vectors(df2, n_vectors, y='Class', random_state=None):
    """Build ``n_vectors`` random starting prototypes from ``df2``.

    For every prototype, each feature value is drawn independently at random
    from the corresponding column of ``df2``; a prototype is therefore a mix
    of observed values, not necessarily an existing row.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Data containing the feature columns and the label column ``y``.
    n_vectors : int
        Number of prototypes to create. Must be at least the number of
        distinct labels in ``df2[y]``.
    y : str, default 'Class'
        Name of the label column; every other column is treated as a feature.
    random_state : int or None, default None
        Seed for a reproducible draw. ``None`` keeps pandas' default
        (unseeded) sampling.

    Returns
    -------
    pandas.DataFrame
        One row per prototype, indexed ``vector_1 .. vector_n``, with the
        feature columns followed by ``y``. Labels are assigned in order of
        first appearance in ``df2[y]`` and repeat cyclically when
        ``n_vectors`` exceeds the number of classes.

    Raises
    ------
    ValueError
        If ``n_vectors`` is smaller than the number of distinct labels.
    """
    features = df2.columns[df2.columns != y]
    classes = df2[y].unique()
    if n_vectors < len(classes):
        raise ValueError(
            f'n_vectors ({n_vectors}) must be at least the number of '
            f'classes ({len(classes)})'
        )

    # A single shared RandomState advances between draws; passing the bare
    # integer to every .sample() call would pick the same position each time.
    rng = None if random_state is None else np.random.RandomState(random_state)

    bobot = []
    for _ in range(n_vectors):
        # Sample from the frame that was passed in, not from a notebook global.
        sample_vectors = df2[features].apply(
            lambda col: float(col.sample(random_state=rng).iloc[0])
        )
        bobot.append(sample_vectors)

    bobot_df = pd.concat(bobot, axis=1)
    bobot_df.columns = [f'vector_{i}' for i in range(1, n_vectors + 1)]
    bobot_df = bobot_df.T
    # Cycle through the class labels so several prototypes per class work too.
    bobot_df[y] = np.resize(classes, n_vectors)

    return bobot_df
  
# Two starting prototypes are requested
random_vectors = initiate_vectors(df2=sample_dataset, n_vectors=2)
print('Random vectors:')
display(random_vectors)

def train_bobot(learning_rate, n_epochs, initial_bobot, training_df, y_name):
    """Train LVQ prototype vectors on ``training_df``.

    For every training row the closest prototype by Euclidean distance (the
    best matching unit, BMU) is moved towards the row when their labels agree
    and away from it when they differ.

    Parameters
    ----------
    learning_rate : float
        Initial step size. The step used in an epoch decays linearly:
        ``learning_rate * (1 - epoch / n_epochs)``.
    n_epochs : int
        Number of full passes over ``training_df``.
    initial_bobot : pandas.DataFrame
        Starting prototypes, one per row, holding the feature columns of
        ``training_df`` plus ``y_name``. It is not modified.
    training_df : pandas.DataFrame
        Training rows: feature columns plus the label column ``y_name``.
    y_name : str
        Name of the label column in both frames.

    Returns
    -------
    pandas.DataFrame
        Copy of ``initial_bobot`` with updated feature values.

    Raises
    ------
    KeyError
        If a feature column of ``training_df`` is absent from
        ``initial_bobot``.
    """
    feature_names = [col for col in training_df.columns if col != y_name]

    best_matching_vector = initial_bobot.copy()  # leave the caller's frame untouched

    # Columns are picked by label in both frames, so the two frames do not
    # need to share a column order. copy=True guarantees a writable buffer.
    samples = training_df[feature_names].to_numpy(dtype=float)
    sample_labels = training_df[y_name].to_numpy()
    weights = best_matching_vector[feature_names].to_numpy(dtype=float, copy=True)
    weight_labels = best_matching_vector[y_name].to_numpy()

    for epoch in range(n_epochs):
        # Linearly decaying step size
        rate = learning_rate * (1 - (epoch / float(n_epochs)))

        for sample, label in zip(samples, sample_labels):
            # Euclidean distance to every prototype; nansum skips missing
            # values, as the pandas sum it replaces did.
            distances = np.sqrt(np.nansum((weights - sample) ** 2, axis=1))
            bmu = int(np.argmin(distances))  # first minimum wins on ties

            error = sample - weights[bmu]

            # Attract the BMU on a label match, repel it otherwise
            if weight_labels[bmu] == label:
                weights[bmu] = weights[bmu] + error * rate
            else:
                weights[bmu] = weights[bmu] - error * rate

    best_matching_vector[feature_names] = weights
    return best_matching_vector
    
# Learning rate 0.3, 10 epochs, starting from the random prototypes
trained_vectors = train_bobot(
    learning_rate=0.3,
    n_epochs=10,
    initial_bobot=random_vectors,
    training_df=sample_dataset,
    y_name='Class',
)
print('Trained vectors:')
display(trained_vectors)

Random vectors:


Unnamed: 0,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
vector_1,3.0,1.0,10.0,4.0,2.0,1.0,3.0,1.0,1.0,2
vector_2,4.0,10.0,1.0,1.0,10.0,1.0,9.0,10.0,10.0,4


Trained vectors:


Unnamed: 0,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
vector_1,2.745132,1.161237,1.329779,1.251063,2.116943,1.114776,1.600549,1.151565,1.22451,2
vector_2,6.57303,7.785649,7.37754,6.53239,5.115006,7.110616,7.334341,6.20108,2.353568,4


## Prediksi

In [60]:
def predict_lvq(test_df, trained_vectors_df, y_name):
    """Label every row of ``test_df`` with the class of its nearest prototype.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to classify. Every column other than ``y_name`` is treated as a
        feature; the ``y_name`` column may be present (its values are
        ignored) or absent.
    trained_vectors_df : pandas.DataFrame
        Prototypes, one per row, holding the same feature columns plus
        ``y_name``.
    y_name : str
        Name of the label column.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``test_df``, in row order.

    Raises
    ------
    KeyError
        If a feature column of ``test_df`` is absent from
        ``trained_vectors_df``.
    """
    # Features are matched by column label, so the two frames do not need to
    # share a column order.
    feature_names = [col for col in test_df.columns if col != y_name]

    samples = test_df[feature_names].to_numpy(dtype=float)
    weights = trained_vectors_df[feature_names].to_numpy(dtype=float)

    # (n_rows, n_prototypes) Euclidean distances; nansum skips missing
    # values, as the pandas sum it replaces did.
    diffs = samples[:, np.newaxis, :] - weights[np.newaxis, :, :]
    distances = np.sqrt(np.nansum(diffs ** 2, axis=2))
    nearest = distances.argmin(axis=1)  # first minimum wins on ties

    return trained_vectors_df[y_name].to_numpy()[nearest]
    
# Rows to classify, read from the Excel sheet
data_test = pd.read_excel('testing.xlsx')
print('Data test:')
display(data_test)

# One predicted label per test row, taken from the nearest trained prototype
hasil = predict_lvq(test_df=data_test, trained_vectors_df=trained_vectors, y_name='Class')
print('Hasil prediksi class nya adalah:')
print(hasil)
print('Keterangan: 2 untuk benign, 4 untuk malignant')

Data test:


Unnamed: 0,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,6,4,7,9,2,4.0,7,3,1,
1,7,6,5,3,4,10.0,1,1,1,
2,9,2,3,1,1,6.0,3,5,2,
3,4,2,10,9,8,1.0,8,10,2,
4,4,3,5,7,2,3.0,7,3,2,
5,1,9,3,4,4,8.0,3,8,1,
6,4,8,5,2,4,5.0,8,5,1,
7,6,10,10,9,9,1.0,5,10,1,
8,6,10,4,9,2,9.0,8,9,2,
9,6,7,4,1,9,3.0,5,10,2,


Hasil prediksi class nya adalah:
[4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4]
Keterangan: 2 untuk benign, 4 untuk malignant


## Hasil Akhir

In [61]:
# Wrap the predictions in a one-column frame (the column is named 0) and
# place it next to the test rows
hasil_df = pd.DataFrame(hasil)
gabung_df = pd.concat([data_test, hasil_df], axis=1)

# Replace the `Class` column that came with the test data by the predicted
# one. rename() returns a new frame, so the result is stored: `hasil_akhir`
# itself then carries the `Class` column, not only the rendered output.
hasil_akhir = gabung_df.drop('Class', axis=1).rename(columns={0: 'Class'})
hasil_akhir

Unnamed: 0,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,6,4,7,9,2,4.0,7,3,1,4
1,7,6,5,3,4,10.0,1,1,1,4
2,9,2,3,1,1,6.0,3,5,2,2
3,4,2,10,9,8,1.0,8,10,2,4
4,4,3,5,7,2,3.0,7,3,2,4
5,1,9,3,4,4,8.0,3,8,1,4
6,4,8,5,2,4,5.0,8,5,1,4
7,6,10,10,9,9,1.0,5,10,1,4
8,6,10,4,9,2,9.0,8,9,2,4
9,6,7,4,1,9,3.0,5,10,2,4
