In [1]:
import cluster #another py file in same directory
import random, pylab, numpy

In [7]:
class Patient(cluster.Example):
    """Example subclass used for the records built from the cardiac data file.

    Adds no attributes or methods of its own; all behaviour is inherited
    from cluster.Example.
    """

def scaleAttrs(vals):
    """Z-scale a sequence of numbers: subtract the mean, divide by the std dev.

    vals: list (or array) of numeric values for one feature.
    Returns a numpy float array with mean 0 and, when the input is not
    constant, population standard deviation 1.

    Edge cases:
      * empty input      -> empty float array
      * constant feature -> all zeros (dividing by a zero standard deviation
        would otherwise yield NaN, and NaN features make every distance
        computed from them NaN as well)
    """
    # numpy.array is the same function pylab re-exports; numpy is already
    # imported by this notebook.
    vals = numpy.array(vals)
    if vals.size == 0:
        return numpy.array([], dtype=float)
    centered = vals - vals.mean()  # centre the distribution around 0
    sd = vals.std()                # population standard deviation (ddof = 0)
    if sd == 0:
        # Zero variance: every value equals the mean, so the scaled feature
        # carries no information; report it as all zeros.
        return numpy.zeros_like(centered, dtype=float)
    return centered / sd           # rescale so the standard deviation is 1

def getData(toScale = False, fileName = 'cardiacData.txt'):
    """Read the cardiac data file and build one Patient per record.

    toScale: when True, each of the four feature columns is passed through
             scaleAttrs before the patients are built; the class label
             column is never scaled.
    fileName: path of a comma-separated text file whose first five fields on
              each line are integers: heart rate, ST elevation, age,
              previous ACS, class label.

    Returns a list of Patient objects named 'P0', 'P1', ... in file order.
    Each feature vector is ordered [heart rate, previous ACS, ST elevation,
    age], which differs from the column order in the file.

    Raises ValueError if a non-blank line contains a non-integer field and
    IndexError if it has fewer than five fields.
    """
    hrList, stElevList, ageList, prevACSList, classList = [],[],[],[],[]
    # The context manager guarantees the file is closed, even when a
    # malformed line raises part-way through.
    with open(fileName, 'r') as cardiacData:
        for line in cardiacData:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            fields = line.split(',')
            hrList.append(int(fields[0]))
            stElevList.append(int(fields[1]))
            ageList.append(int(fields[2]))
            prevACSList.append(int(fields[3]))
            classList.append(int(fields[4]))
    if toScale:
        hrList = scaleAttrs(hrList)
        stElevList = scaleAttrs(stElevList)
        ageList = scaleAttrs(ageList)
        prevACSList = scaleAttrs(prevACSList)
    # Build points
    points = []
    for i in range(len(hrList)):
        # numpy.array is the same function pylab re-exports.
        features = numpy.array([hrList[i], prevACSList[i],
                                stElevList[i], ageList[i]])
        points.append(Patient('P' + str(i), features, classList[i]))
    return points
    
def kmeans(examples, k, verbose = False):
    """Partition examples into k clusters with the k-means procedure.

    examples: sequence of objects offering distance(other)
    k: number of clusters to build
    verbose: when True, print every cluster after each iteration

    Returns the list of k cluster.Cluster objects once an iteration leaves
    every centroid unchanged.
    Raises ValueError('Empty Cluster') if an iteration assigns no example to
    one of the clusters.
    """
    # Start from k randomly drawn examples, each seeding its own cluster
    clusters = [cluster.Cluster([seedExample])
                for seedExample in random.sample(examples, k)]

    numIterations = 0
    while True:
        numIterations += 1

        # k independent, initially empty membership lists
        buckets = [[] for _ in range(k)]

        # Put each example in the bucket of its nearest centroid; on a tie
        # the lowest-numbered cluster wins
        for example in examples:
            nearest = 0
            shortest = example.distance(clusters[0].getCentroid())
            for j, candidate in enumerate(clusters[1:], start=1):
                gap = example.distance(candidate.getCentroid())
                if gap < shortest:
                    nearest, shortest = j, gap
            buckets[nearest].append(example)

        # Refuse to continue with an empty cluster
        if any(len(bucket) == 0 for bucket in buckets):
            raise ValueError('Empty Cluster')

        # Update every cluster (no short-circuit), then test for movement
        shifts = [clusters[j].update(buckets[j]) for j in range(k)]
        converged = not any(shift > 0.0 for shift in shifts)

        if verbose:
            print('Iteration #' + str(numIterations))
            for c in clusters:
                print(c)
            print('') #add blank line

        if converged:
            return clusters

def trykmeans(examples, numClusters, numTrials, verbose = False,
              maxFailures = 100):
    """Calls kmeans numTrials times and returns the result with the
          lowest dissimilarity

    examples, numClusters, verbose: forwarded unchanged to kmeans.
    numTrials: number of successful kmeans runs to compare; at least one
               run is always performed, even when numTrials < 1.
    maxFailures: number of consecutive failed kmeans calls tolerated before
                 giving up.

    A kmeans call that raises ValueError (e.g. 'Empty Cluster') is not
    counted as a trial and is retried. This now applies to the first trial
    as well, which previously aborted the whole search. After maxFailures
    consecutive failures the last ValueError is re-raised, so a request that
    can never succeed terminates instead of retrying forever.
    """
    best = None
    minDissimilarity = None
    trial = 0
    failures = 0
    # 'best is None' keeps the guarantee of at least one successful run.
    while trial < numTrials or best is None:
        try:
            clusters = kmeans(examples, numClusters, verbose)
        except ValueError:
            failures += 1
            if failures >= maxFailures:
                raise
            continue #If failed, try again
        failures = 0
        currDissimilarity = cluster.dissimilarity(clusters)
        if best is None or currDissimilarity < minDissimilarity:
            best = clusters
            minDissimilarity = currDissimilarity
        trial += 1
    return best

def printClustering(clustering):
    """Assumes: clustering is a sequence of clusters
       Prints information about each cluster
       Returns array of fraction of pos cases in each cluster

       Each cluster must offer members(), an iterable of objects with
       getLabel(); a member counts as positive when its label equals 1.
       A cluster without members is reported with fraction NaN instead of
       raising ZeroDivisionError."""
    posFracs = []
    for c in clustering:
        # Materialise once: members() may be a one-shot iterator.
        labels = [p.getLabel() for p in c.members()]
        numPts = len(labels)
        numPos = sum(1 for label in labels if label == 1)
        fracPos = numPos/numPts if numPts > 0 else float('nan')
        posFracs.append(fracPos)
        print('Cluster of size', numPts, 'with fraction of positives =',
              round(fracPos, 4))
    # numpy.array is the same function pylab re-exports; numpy is already
    # imported by this notebook.
    return numpy.array(posFracs)

def testClustering(patients, numClusters, seed = 0, numTrials = 5):
    random.seed(seed)
    bestClustering = trykmeans(patients, numClusters, numTrials)
    posFracs = printClustering(bestClustering)
    return posFracs

In [13]:
# Baseline: cluster the raw (unscaled) features into two groups.
patients = getData(toScale = False)
for k in [2]:
    print('\n     Test k-means (k = ' + str(k) + ')')
    posFracs = testClustering(patients, k, seed = 2, numTrials = 5)


     Test k-means (k = 2)
Cluster of size 118 with fraction of positives = 0.3305
Cluster of size 132 with fraction of positives = 0.3333


In [16]:
# Same two-cluster experiment, this time on z-scaled features.
patients = getData(toScale = True)
for k in [2]:
    print('\n     Test k-means (k = ' + str(k) + ')')
    posFracs = testClustering(patients, k, seed = 2, numTrials = 5)


     Test k-means (k = 2)
Cluster of size 224 with fraction of positives = 0.2902
Cluster of size 26 with fraction of positives = 0.6923


In [18]:
# Z-scaled features again, comparing several cluster counts.
patients = getData(toScale = True)
for k in [2, 4, 6]:
    print('\n     Test k-means (k = ' + str(k) + ')')
    posFracs = testClustering(patients, k, seed = 2, numTrials = 5)


     Test k-means (k = 2)
Cluster of size 224 with fraction of positives = 0.2902
Cluster of size 26 with fraction of positives = 0.6923

     Test k-means (k = 4)
Cluster of size 26 with fraction of positives = 0.6923
Cluster of size 86 with fraction of positives = 0.0814
Cluster of size 76 with fraction of positives = 0.7105
Cluster of size 62 with fraction of positives = 0.0645

     Test k-means (k = 6)
Cluster of size 49 with fraction of positives = 0.0204
Cluster of size 26 with fraction of positives = 0.6923
Cluster of size 45 with fraction of positives = 0.0889
Cluster of size 54 with fraction of positives = 0.0926
Cluster of size 36 with fraction of positives = 0.7778
Cluster of size 40 with fraction of positives = 0.675


In [17]:
# Tally the patients whose label is 1 in the currently loaded data set.
numPos = 0
for p in patients:
    numPos += int(p.getLabel() == 1)
print('Total number of positive patients =', numPos)

Total number of positive patients = 83


In [5]:
#scaling features

import numpy as np

def interpolate_scaling(vals, new_min=0, new_max=1):
    """Linearly rescale vals so its minimum maps to new_min and its maximum
    to new_max (min-max scaling).

    vals: list or array of numeric values.
    new_min, new_max: bounds of the target range.

    Returns a numpy array with the same shape as vals.

    Edge cases:
      * empty input    -> empty float array (np.min would raise otherwise)
      * constant input -> every value mapped to new_min (the original range
        has zero width, so the interpolation would divide by zero and
        produce NaN)
    """
    vals = np.array(vals)
    if vals.size == 0:
        return vals.astype(float)

    # Original range (min and max values of vals)
    old_min, old_max = np.min(vals), np.max(vals)
    if old_max == old_min:
        return np.full(vals.shape, float(new_min))

    # Interpolate and scale the values to the new range
    return new_min + (vals - old_min) * (new_max - new_min) / (old_max - old_min)

# Example usage: rescale a small sample onto the default [0, 1] range.
original_values = [2, 5, 8, 12, 18]
scaled_values = interpolate_scaling(original_values, new_min=0, new_max=1)

print("Original Values:", original_values)
print("Scaled Values:", scaled_values)

2
18
Original Values: [2, 5, 8, 12, 18]
Scaled Values: [0.     0.1875 0.375  0.625  1.    ]
