In [8]:
#Final Project Phase 3
#Program implements k-means clustering algorithm without using SKLearn package and calculates error rates of my clustering algorithm
#date: 12/5/2020
#author: Glenn Haag

import numpy as np
import pandas as pd
import math
import random
import warnings



def initialize(df):
    """Pick two different rows of ``df`` to act as the starting cluster means.

    Parameters
    ----------
    df : pandas.DataFrame
        The data set to draw from. It must hold at least two rows that differ
        in some column, otherwise no valid pair of starting means exists.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(u2, u4)``: independent copies of two distinct rows, so that later
        in-place updates of the means cannot write back into ``df``.

    Raises
    ------
    ValueError
        If ``df`` does not contain two distinct rows.
    """
    #without two different rows the rejection loop below could never terminate
    if len(df.drop_duplicates()) < 2:
        raise ValueError("initialize() needs a dataframe with at least two distinct rows")

    #author's note: most seeds gave 15 or 16 correct predicted classes; this seed
    #gave 16, so it is kept fixed to make every run reproducible
    random.seed(33333)

    #highest valid row position; derived from the data instead of a hard-coded 698
    last_row = len(df.index) - 1

    #pick a random row (by position, so any index labelling works) as the initial u2
    u2 = df.iloc[random.randint(0, last_row)].copy()

    #start with u4 equal to u2 so the loop below runs at least once
    u4 = u2

    #keep drawing until u4 is a different data point than u2
    while u4.equals(u2):
        u4 = df.iloc[random.randint(0, last_row)].copy()

    return u2, u4

def assignment(u2, u4, df):
    """Assign every row of ``df`` to the nearer of the two cluster means.

    Distances are Euclidean over the nine feature columns found at positions
    1 through 9 of ``df`` and at the same positions of each mean.

    Parameters
    ----------
    u2, u4 : pandas.Series (or any sequence convertible with ``numpy.asarray``)
        Current means of cluster 2 and cluster 4.
    df : pandas.DataFrame
        Data to classify; only column positions 1-9 are read.

    Returns
    -------
    list of int
        One entry per row of ``df``: ``2`` when the row is strictly closer to
        ``u2``, otherwise ``4``. Ties therefore go to cluster 4.
    """
    #pull the nine characteristics out once, by position, as a float matrix
    features = df.iloc[:, 1:10].to_numpy(dtype=float)

    #read the means by position as well; integer keys on a label-indexed Series
    #(u2[k+1]) are a deprecated fallback, so convert to plain arrays instead
    mean2 = np.asarray(u2)[1:10].astype(float)
    mean4 = np.asarray(u4)[1:10].astype(float)

    #accumulate the squared differences one characteristic at a time, in the same
    #order as a per-row loop would, but for all rows at once
    sq_dist2 = np.zeros(len(features))
    sq_dist4 = np.zeros(len(features))
    for k in range(9):
        sq_dist2 += (mean2[k] - features[:, k]) ** 2
        sq_dist4 += (mean4[k] - features[:, k]) ** 2

    #finish the distance calculation by taking the square root of each sum
    dist2 = np.sqrt(sq_dist2)
    dist4 = np.sqrt(sq_dist4)

    #cluster 2 only wins when it is strictly closer; equidistant points go to
    #cluster 4 because the author prefers to err on the side of "malignant"
    #when the data point is ambiguous
    return np.where(dist2 < dist4, 2, 4).tolist()
    
def recalculation(df, u2, u4):
    """Recompute both cluster means from the current predicted classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set. Column positions 1-9 hold the features and column position
        11 holds the predicted class (2 or 4) of each row.
    u2, u4 : pandas.Series (or a mutable sequence supporting slice assignment)
        Current means of cluster 2 and cluster 4. Positions 1-9 are overwritten
        in place; every other position is left alone.

    Returns
    -------
    tuple
        The same ``u2`` and ``u4`` objects that were passed in, after updating.
        A cluster that currently has no members keeps its previous mean.
    """
    #features and predicted classes, read once by position
    features = df.iloc[:, 1:10].to_numpy(dtype=float)
    predicted = df.iloc[:, 11].to_numpy()

    for mean, label in ((u2, 2), (u4, 4)):
        #all the data points currently assigned to this cluster
        members = features[predicted == label]

        #protect against divide by 0, in the very rare case all points are put
        #in the other cluster: the old mean is simply kept
        if len(members) == 0:
            continue

        #new mean of each column = column sum divided by the number of points
        new_mean = members.sum(axis=0) / len(members)

        #write back by position; integer keys on a label-indexed Series
        #(u2[n+1] = ...) are a deprecated fallback
        if hasattr(mean, "iloc"):
            mean.iloc[1:10] = new_mean
        else:
            mean[1:10] = new_mean

    #return the (mutated) means
    return u2, u4

#~~~~~~~~~THIS FUNCTION CORRESPONDS TO FP PART 3~~~~~~~~~~
def errorRate(df):
    """Compare predicted and actual classes and compute the three error rates.

    Parameters
    ----------
    df : pandas.DataFrame
        Column position 10 holds the actual class and column position 11 the
        predicted class; benign is 2 and malign is 4.

    Returns
    -------
    tuple of float
        ``(errorB, errorM, total_error)`` where

        * ``errorB`` = rows predicted 4 but actually 2, divided by the number
          of rows predicted 2,
        * ``errorM`` = rows predicted 2 but actually 4, divided by the number
          of rows predicted 4,
        * ``total_error`` = both error counts together, divided by the number
          of rows predicted 2 or 4.

        A rate whose denominator is zero (for example when every point lands
        in one cluster) is returned as NaN instead of raising
        ``ZeroDivisionError``.

    Side effects
    ------------
    Prints the actual benign/malign totals and the predicted benign/malign
    totals, one pair per line.
    """
    #NOTE(review): each numerator counts mistakes in the *other* predicted
    #class than its denominator; the original comment says these formulas come
    #from the assignment prompt, so they are kept as-is - confirm against it.

    actual = df.iloc[:, 10].to_numpy()
    predicted = df.iloc[:, 11].to_numpy()

    pred_benign = predicted == 2
    pred_malign = predicted == 4

    #algorithm thinks it's malign when it's really benign
    errorB_count = int((pred_malign & (actual == 2)).sum())

    #algorithm thinks it's benign when it's really malign
    errorM_count = int((pred_benign & (actual == 4)).sum())

    #totals predicted by the algorithm
    totalPredB = int(pred_benign.sum())
    totalPredM = int(pred_malign.sum())

    #actual totals; anything that is not class 2 is counted as malign
    totalB = int((actual == 2).sum())
    totalM = len(actual) - totalB

    def rate(errors, total):
        #an empty denominator means the rate is undefined, not zero
        return errors / total if total else float("nan")

    #calculate the error rates
    errorB = rate(errorB_count, totalPredB)
    errorM = rate(errorM_count, totalPredM)
    total_error = rate(errorB_count + errorM_count, totalPredB + totalPredM)

    print(totalB, totalM)
    print(totalPredB, totalPredM)

    #return the error rates to be displayed in the main function
    return errorB, errorM, total_error
    
def main():
    """Cluster the breast-cancer data with two-mean k-means and report results.

    Reads ``breast_cancer_wisconsin-1.csv`` (``?`` marks a missing value),
    fills the gaps in column ``a7`` with that column's median, iterates
    assignment / recalculation until the predicted classes stop changing
    (at most 1500 passes), then prints the error rates, the final means and
    the first 20 rows of ID / class / predicted class.
    """
    #suppress scientific notation when numpy values are printed
    np.set_printoptions(suppress = True)

    #NOTE: this silences every warning for the rest of the session
    warnings.filterwarnings("ignore")

    #read the csv into a dataframe
    df = pd.read_csv('breast_cancer_wisconsin-1.csv', na_values = '?')

    #rename the 'scn' column to ID to match the desired output
    df.rename(columns = {'scn' : 'ID'}, inplace = True)

    #replace the missing values of the a7 column with the median of that column
    df['a7'] = df['a7'].fillna(df['a7'].median(axis = 0))

    #initialize the means for each cluster
    u2, u4 = initialize(df)

    #predicted classes from the previous pass; only the latest one is needed
    #to detect convergence
    previous_pred = None

    #run up to 1500 times (it won't take that many)
    for _ in range(1500):
        #assign each point to a cluster using the current means
        pred_class = assignment(u2, u4, df)

        #no point changed cluster since the last pass, so the means (already
        #recalculated from exactly these classes) are final
        if pred_class == previous_pred:
            break
        previous_pred = pred_class

        #add / refresh the Predicted Class column of our dataframe
        df['Predicted Class'] = pred_class

        #recalculate the means
        u2, u4 = recalculation(df, u2, u4)

    #calculate the error rates
    errorB, errorM, total_error = errorRate(df)

    #display the error rates rounded to 3 decimals and displayed as percents rather than decimals
    print("Error B: " + str(round(errorB * 100, 3)) + "%\t\tError M: " + str(round(errorM * 100, 3)) +"%\t\tTotal Error: " + str(round(total_error * 100, 3)) + '%')

    #print in the format of the desired output
    print("---------------------------------------Final Means-------------------------------------------")

    #rounding to 12 decimal places keeps each list of means on one line in the
    #notebook; rounding is applied for display only, never in the calculations
    #positions 1-9 are the a2-a10 means
    print("mu2: ", *u2.iloc[1:10].to_numpy().round(12))
    print("mu4: ", *u4.iloc[1:10].to_numpy().round(12))
    print('\n')
    print(df[['ID', 'class', 'Predicted Class']].head(20))
    
#run the whole clustering pipeline when this cell is executed
main()

458 241
465 234
Error B: 2.366%		Error M: 7.692%		Total Error: 4.149%
---------------------------------------Final Means-------------------------------------------
mu2:  3.043010752688 1.301075268817 1.443010752688 1.337634408602 2.088172043011 1.296774193548 2.103225806452 1.251612903226 1.109677419355
mu4:  7.149572649573 6.777777777778 6.713675213675 5.726495726496 5.457264957265 7.837606837607 6.089743589744 6.076923076923 2.542735042735


         ID  class  Predicted Class
0   1000025      2                2
1   1002945      2                4
2   1015425      2                2
3   1016277      2                4
4   1017023      2                2
5   1017122      4                4
6   1018099      2                2
7   1018561      2                2
8   1033078      2                2
9   1033078      2                2
10  1035283      2                2
11  1036172      2                2
12  1041801      4                2
13  1043999      2                2
14  1044572 

In [None]:
458 241
463 236
Error B: 2.376%		Error M: 6.78%		Total Error: 3.863%
---------------------------------------Final Means-------------------------------------------
mu2:  2.956331877729 1.325327510917 1.443231441048 1.364628820961 2.120087336245 1.336244541485 2.100436681223 1.2903930131 1.063318777293
mu4:  7.195020746888 6.572614107884 6.560165975104 5.547717842324 5.298755186722 7.572614107884 5.979253112033 5.863070539419 2.589211618257
