# <center>Radial Basis Function (RBF) Implementation</center>
<center>Ankush Bhayekar (gq8442)</center>

In [None]:
import numpy as np
import pandas as pd
import math

#### The breast cancer data CSV file was downloaded from:
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
# Load the breast cancer CSV, keeping only the columns at positions 1..30.
# NOTE(review): if the file has the usual layout (id, diagnosis, 30 features),
# positions 1..30 are the diagnosis plus the first 29 features, so the last
# feature column is left out -- confirm whether range(1, 32) was intended.
data = pd.read_csv('data.csv', usecols=list(range(1, 31)))

In [None]:
# Encode the textual diagnosis label (M = malignant, B = benign) as a single
# indicator column: drop_first discards the first category, leaving one column
# that is set for malignant rows and unset otherwise.
diagnosis = pd.get_dummies(data['diagnosis'], drop_first=True)
# Replace the original text column with the indicator, appended as the last
# column of the frame.
data = pd.concat([data.drop(columns=['diagnosis']), diagnosis], axis=1)

In [None]:
# Convert the DataFrame to a NumPy array of floats.
# An explicit dtype is requested because the frame mixes float feature columns
# with the diagnosis indicator column; when that indicator is boolean (the
# default dummy dtype in newer pandas releases) a plain to_numpy() falls back
# to dtype=object, which is slow and not accepted by every NumPy routine.
data_np = data.to_numpy(dtype=float)

*For training, 60% of the data set is used; 20% is held out as test data.*

In [None]:
# Split sizes in rows, each rounded to the nearest ten.
train_per = int(round(data_np.shape[0] * 0.6, -1))
test_per = int(round(data_np.shape[0] * 0.2, -1))
# Size of a validation split; computed here but not used by the slices below.
vald_per = int(round(data_np.shape[0] * 0.2, -1))

# The first train_per rows are the training set. The label is the last column,
# every other column is a feature.
y_train = data_np[0:train_per, -1]
X_train = data_np[0:train_per, 0:-1]
# The following test_per rows are the test set. The slice starts exactly at
# train_per (slice ends are exclusive), so no row between the two sets is
# skipped and the test set really holds test_per rows.
y_test = data_np[train_per:train_per + test_per, -1]
X_test = data_np[train_per:train_per + test_per, 0:-1]

In [None]:
def euclid_dist(centr, x):
    """Return the Euclidean distance between a centroid and a data point.

    Both arguments are indexable sequences of numbers. Only the first
    ``len(centr)`` coordinates of ``x`` are compared.
    """
    squared_gaps = ((centr[i] - x[i]) ** 2 for i in range(len(centr)))
    return np.sqrt(sum(squared_gaps))


def k_means_centroid(X, k, max_iters):
    """Cluster the rows of ``X`` with k-means and return centres and spreads.

    Starting from ``k`` distinct rows of ``X`` chosen at random, every row is
    assigned to its nearest centroid and each centroid is then replaced by the
    mean of the rows assigned to it. This repeats until the centroids stop
    changing or ``max_iters`` passes have been made.

    Args:
        X: 2-D array-like with one sample per row. It is converted to a float
            array, so boolean or object-typed numeric input is accepted.
        k: Number of initial centroids, drawn without replacement from the
            rows of ``X``.
        max_iters: Maximum number of assign/update passes; must be >= 1.

    Returns:
        A tuple ``(centroids, spreads)``. ``centroids`` is an array with one
        row per non-empty cluster. ``spreads`` is a list holding, for each of
        those clusters in the same order, ``np.std`` taken over all values of
        its member rows. Clusters that receive no rows are dropped, so fewer
        than ``k`` entries may come back.

    Raises:
        ValueError: If ``max_iters`` is smaller than 1 (no clustering pass
            could run, so there would be nothing to return).
    """
    if max_iters < 1:
        raise ValueError("max_iters must be at least 1")

    X = np.asarray(X, dtype=float)
    clust_centroids = X[np.random.choice(len(X), k, replace=False)]

    for _ in range(max_iters):
        # Distance from every row to every centroid in one shot:
        # shape (n_rows, n_centroids).
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - clust_centroids[np.newaxis, :, :], axis=2
        )
        nearest = np.argmin(distances, axis=1)

        # Only centroids that attracted at least one row survive; np.unique
        # returns them in ascending order, which keeps the original ordering.
        occupied = np.unique(nearest)
        cluster_list = [X[nearest == label] for label in occupied]

        prior_centroids = clust_centroids
        clust_centroids = np.array([members.mean(axis=0) for members in cluster_list])

        # Compare the centroids themselves rather than the sum of their
        # coordinates: two different centroid sets can share the same sum.
        # array_equal is also False when a cluster was dropped (shape change).
        if np.array_equal(prior_centroids, clust_centroids):
            break

    return clust_centroids, [np.std(members) for members in cluster_list]

In [None]:
class rbf_net:
    """Radial-basis-function network classifier.

    The hidden layer consists of Gaussian units centred on k-means centroids
    of the training data; the output weights are obtained by linear least
    squares against one-hot encoded labels. ``netfit`` trains the network,
    predicts the test split and prints accuracy, precision, recall and F1.
    """

    def __init__(self, X_train, y_train, X_tr, y_tr, num_of_classes, k, clusters_std=True):
        """Store the data splits and hyper-parameters.

        Args:
            X_train: Training features, one sample per row.
            y_train: Training labels (integer class indices).
            X_tr: Test features. Note the naming: this *parameter* carries the
                test split and is stored as ``self.X_test``.
            y_tr: Test labels, stored as ``self.y_test``.
            num_of_classes: Width of the one-hot label encoding.
            k: Number of k-means centroids requested for the hidden layer.
            clusters_std: If True, each unit uses the spread returned by
                k-means for its cluster; if False, all units share a single
                spread derived from the largest distance between centroids.
        """
        # The attribute names below are independent of the parameter names:
        # self.X_tr / self.y_tr hold the TRAINING split.
        self.X_tr = X_train
        self.y_tr = y_train
        self.X_test = X_tr
        self.y_test = y_tr
        self.clusters_std = clusters_std
        self.no_of_class = num_of_classes
        self.k = k

    def eucd_rbf(self, x, sent, std):
        """Return the Gaussian activation ``exp(-||x - sent||^2 / (2 std^2))``.

        The value is 1 when ``x`` coincides with the centre ``sent`` and decays
        towards 0 as the distance grows.
        """
        diff = np.asarray(x, dtype=float) - np.asarray(sent, dtype=float)
        # Floor the spread so a zero spread (e.g. a cluster whose members are
        # all identical) does not cause a division by zero.
        spread = max(float(std), np.finfo(float).eps)
        return np.exp(-np.dot(diff, diff) / (2.0 * spread ** 2))

    def rbflayer(self, X, grp_cent, stand_dev):
        """Return the hidden-layer activations for every row of ``X``.

        The result has one row per sample and one column per (centre, spread)
        pair; centres and spreads are paired positionally.
        """
        layer_list = []
        for x in X:
            layer_list.append([self.eucd_rbf(x, or_g, std) for (or_g, std) in zip(grp_cent, stand_dev)])
        return np.array(layer_list)

    def dummy_var(self, x, no_of_class):
        """One-hot encode the labels ``x`` into ``no_of_class`` columns.

        Each label is truncated to an int and used as the column index, so it
        must lie in ``range(no_of_class)``.
        """
        dum_arr = np.zeros((len(x), no_of_class))
        for idx, label in enumerate(x):
            dum_arr[idx][int(label)] = 1
        return dum_arr

    @staticmethod
    def _binary_metrics(y_true, y_pred):
        """Return accuracy, precision, recall and F1 with class 1 as positive.

        Ratios whose denominator is zero are reported as 0.0 instead of
        raising ZeroDivisionError.
        """
        y_true = np.asarray(y_true).astype(int)
        y_pred = np.asarray(y_pred).astype(int)

        tp = int(np.sum((y_pred == 1) & (y_true == 1)))
        fp = int(np.sum((y_pred == 1) & (y_true == 0)))
        fn = int(np.sum((y_pred == 0) & (y_true == 1)))
        correct = int(np.sum(y_pred == y_true))

        def ratio(num, den):
            return num / den if den else 0.0

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        return {
            'accuracy': ratio(correct, len(y_true)),
            'precision': precision,
            'recall': recall,
            'f1': ratio(2 * precision * recall, precision + recall),
        }

    def netfit(self):
        """Train on the stored training split, predict the test split, print metrics.

        Sets ``clust_centroids``, ``stand_dev``, ``weights``, ``y_pred`` and
        ``metrics`` on the instance.
        """
        self.clust_centroids, self.stand_dev = k_means_centroid(self.X_tr, self.k, max_iters=1000)

        if not self.clusters_std:
            # Use one shared spread for all centres: d_max / sqrt(2 * M), where
            # d_max is the largest centre-to-centre distance and M the number
            # of centres actually returned (k-means may return fewer than k).
            centres = np.asarray(self.clust_centroids, dtype=float)
            n_centres = len(centres)
            gaps = np.linalg.norm(centres[:, np.newaxis, :] - centres[np.newaxis, :, :], axis=2)
            self.stand_dev = np.repeat(gaps.max() / np.sqrt(2 * n_centres), n_centres)

        rbfnet_x = self.rbflayer(self.X_tr, self.clust_centroids, self.stand_dev)
        # Least-squares output weights. pinv(A) equals pinv(A.T @ A) @ A.T but
        # avoids squaring the condition number of the activation matrix.
        self.weights = np.linalg.pinv(rbfnet_x) @ self.dummy_var(self.y_tr, self.no_of_class)

        rbf_test = self.rbflayer(self.X_test, self.clust_centroids, self.stand_dev)

        # The predicted class is the output column with the highest score.
        self.y_pred = np.argmax(rbf_test @ self.weights, axis=1)

        self.metrics = self._binary_metrics(self.y_test, self.y_pred)

        print('\n***** Confusion Matrix ******\n')
        print('Accuracy: ', round(self.metrics['accuracy'], 3))
        print('Precision: ', round(self.metrics['precision'], 3))
        print('Recall: ', round(self.metrics['recall'], 3))
        print('F-1 Score: ', round(self.metrics['f1'], 3))

In [None]:
# The diagnosis label is binary (class 0 or 1), so the one-hot target needs
# exactly two columns. Extra all-zero target columns would receive zero
# weights, and their constant score of 0 could win the argmax whenever both
# real class scores are negative, yielding predictions of non-existent classes.
rbf_class = rbf_net(X_train, y_train, X_test, y_test, num_of_classes=2, k=10, clusters_std=False)
rbf_class.netfit()