# Question 3: Redo Question 1 with clustering (classification label unknown), compare the 2 results.
## Methodology: 
### K-means

In [1]:
# Import lib
# ===========================================================
import csv
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
import random
import time
import collections
import math
import sys
from tqdm import tqdm
from time import sleep
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

from datascience import *
from scipy import stats

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Procedure of K-means
1. Select the number of clusters you want to identify in your data 
2. Randomly select k distinct data points 
3. Measure the distance between the 1st point and the k initial clusters 
4. Assign the 1st point to the nearest cluster 
5. Iterate through all points and do step 3 & 4 
6. Calculate the mean of each cluster 
7. Use the calculated mean of each cluster as k new initial data points and restart from 3 
8. Loop until the mean converge 
9. Do Step 1 - 8 for n times, select the best one 

### 1. Init Data

In [18]:
# Initialize useful data
# ===========================================================
df = pd.read_csv('clinvar_conflicting_mapped.csv', low_memory=False)
df = df.fillna(value=0)
columns_backup = df.columns
attributes = list(df.columns)
attribute_dimension = len(attributes) - 1 # eliminate the CLASS column
all_rows = df.values.tolist()
row_num = len(all_rows)
df = df.sample(n = df.shape[0])
columns_backup = df.columns
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDN,CLNHGVS,...,Codons,STRAND,BAM_EDIT,SIFT,PolyPhen,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
50633,0.625,0.815749,0.815242,0.746725,0.0,0.0,0.0,0.663201,0.906263,0.461389,...,0.10986,0.9375,0.666667,0.8,0.8,0.420858,0.002976,0.523939,0.0,0
5345,0.0,0.602646,0.684758,0.799127,0.0,0.0,0.0,0.637643,0.740497,0.356385,...,0.238631,0.0625,0.333333,0.6,0.2,0.122966,0.860503,0.440743,0.999949,0
62672,0.833333,0.249576,0.684758,0.799127,0.0,0.772911,0.0,0.946177,0.448596,0.188424,...,0.340387,0.9375,0.333333,0.8,0.8,0.378698,0.943548,0.2382,0.0,0
4358,0.875,0.817888,0.684758,0.799127,0.066502,0.0,0.0,0.78969,0.787257,0.473722,...,0.328231,0.9375,0.333333,0.0,0.2,0.481324,0.718414,0.33377,0.0,1
46170,0.958333,0.488885,0.815242,0.661572,0.0,0.763312,0.0,0.144033,0.298488,0.450881,...,0.884737,0.9375,0.666667,0.4,0.6,0.858358,0.508833,0.101934,2.5e-05,0


### 2. Balancing

In [20]:
# my balancing
# ===========================================================
df_zero = df.loc[df['CLASS'] == 0]
df_zero = df_zero.sample(n=10000)
df_one = df.loc[df['CLASS'] == 1]
df_one = df_one.sample(n=10000)

df = pd.concat([df_zero, df_one])
df = df.sample(n = df.shape[0])
all_rows = df.values.tolist()
row_num = len(all_rows)
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDN,CLNHGVS,...,Codons,STRAND,BAM_EDIT,SIFT,PolyPhen,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
24695,0.333333,0.519718,0.578522,0.746725,0.228712,0.337483,0.06804,0.559887,0.147948,0.561499,...,0.444845,0.0625,0.666667,0.8,0.8,0.110762,0.9327,0.487742,0.0,0
22386,0.541667,0.680599,0.578522,0.746725,0.066502,0.653367,0.0,0.552848,0.89838,0.163772,...,0.425484,0.9375,0.666667,0.8,0.8,0.973558,0.590246,0.150104,0.0,1
3644,0.875,0.871568,0.684758,0.746725,0.066502,0.550472,0.0,0.940113,0.941037,0.336902,...,0.330032,0.0625,0.666667,0.8,0.8,0.008691,0.16321,0.544526,0.0,1
63445,0.041667,0.618268,0.815242,0.567686,0.066502,0.763312,0.187829,0.552848,0.89838,0.568924,...,0.486267,0.0625,0.333333,0.6,0.2,0.244268,0.508929,0.682271,0.000429,1
45200,0.958333,0.743025,0.764434,0.567686,0.0,0.0,0.0,0.552848,0.89838,0.533871,...,0.336335,0.9375,0.333333,0.8,0.8,0.423077,0.321909,0.496017,0.0,1


In [None]:
# One hot encoding
# ===========================================================
X = df.drop('CLASS', axis=1)
y = df['CLASS']

X = pd.get_dummies(X, drop_first=True)
y = pd.get_dummies(y, drop_first=True)



### 3. Split Train Test

In [23]:
# Divide whole dataset into training set and testing set
# ===========================================================
training_percentage = 0.5  # percent of partition of training dataset
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size
training_data = all_rows[: training_size]  # training data should include header row
testing_data = all_rows[training_size: ]   # testing data don't need to include header row

### 4. K-means Structure Implementation

In [25]:
# i have rescaled all data to confirm that each input dimension ranges within [0, 1]

def gen_random_pivots(k, dim):
    return [np.random.rand(1, dim) for _ in range(k)]

def euclidean_distance(pivot, point):
    # point can have a CLASS entry, we don't want to use it, so eliminate it outside of the function
    
    return np.linalg.norm(np.subtract(pivot, point))

def find_nearest_pivot(point):
    winner = np.random.rand(1, attribute_dimension)
    min_dist = float('inf')
    for i in range(len(pivots)):
        pivot = pivots[i][0]
        temp_dist = euclidean_distance(pivot, point[: -1])
        idx = -2
        if temp_dist < min_dist:
            winner, min_dist, idx = pivot, temp_dist, i
    return winner, idx

def cluster_mean_point(cluster):
    mean_point = np.zeros((1, attribute_dimension))
    for point in cluster:
        mean_point = np.add(mean_point, point[: -1])
    mean_point = np.divide(mean_point, len(cluster) + 0.00000001)
    return mean_point

def clusters_acc(clusters):
    counter1, counter2 = collections.Counter([point[-1] for point in clusters[0]]), collections.Counter([point[-1] for point in clusters[1]])
    
    one_rate1 = counter1.get(1, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    zero_rate1 = counter1.get(0, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    one_rate2 = counter2.get(1, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    zero_rate2 = counter2.get(0, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    dataset_size = sum(counter1.values()) + sum(counter2.values())
    if one_rate1 > one_rate2:
        # counter1 -> label 1, counter2 -> label 0
        return (counter1.get(1, 0) + counter2.get(0, 0)) / dataset_size
    else:
        return (counter1.get(0, 0) + counter2.get(1, 0)) / dataset_size

### 5. Train

In [26]:
# unsupervised clustering
# ==============================================
K = 2
n = 20 # number of try to run
converge_radius = 0.002

print("Training_size: %d" % training_size)
winner = []
min_var = float('inf')
max_acc = -float('inf')

for i in range(n):
    
    # randomly select K distinct data points
    pivots = gen_random_pivots(K, attribute_dimension)
    
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Training: [%-20s] %d%%" % ('='*int((i + 1) * (100 / n) / 5), int((i + 1) * (100 / n))))
    sys.stdout.flush()
#     sleep(0.25)
    
    while True:
        # clusters = dict() # maybe it would be better to instantize this with a dictionary
        clusters = [[] for _ in range(K)] # init K empty clusters
    
        for point in training_data:
            
            # each row of training_data is a high-dimensional point
            _, idx_of_pivot = find_nearest_pivot(point)
            clusters[idx_of_pivot].append(point)
        temp_pivots = [cluster_mean_point(cluster) for cluster in clusters]
        
        # loop until init_pivots converge
        if np.linalg.norm(np.subtract(pivots, temp_pivots)) < converge_radius:
            break
        pivots = list(temp_pivots)
    
    # pick the best clustering with the most difference between each other
    cluster_sizes = [len(cluster) for cluster in clusters]
    temp_var = np.var(cluster_sizes)
    temp_acc = clusters_acc(clusters)
    if temp_acc > max_acc:
        winner = list(clusters)
        max_acc = temp_acc

'''
* TP: Prediction is True + Predicted value is Positive
* FP: Prediction is False + Predicted value is Positive
* TN: Prediction is True + Predicted value is Negative
* FN: Prediction is False + Predicted value is Negative

* Accuracy = $\frac{TP + TN}{TP + FN + FP + TN}$
* Sensitivity (TPR) = $\frac{TP}{TP + FN}$
* Specificity (FPR) = $\frac{TN}{TN + FP}$
'''

# cause we don't really have cutoff points here, so computing TP, .. would be meaningless
# here we only care about Acc
print("\nwinner: ", len(winner[0]), len(winner[1]))
print("Training Acc: ", max_acc)
counter1, counter2 = collections.Counter([point[-1] for point in winner[0]]), collections.Counter([point[-1] for point in winner[1]])
print(counter1, counter2)

Training_size: 10000
winner:  5100 4900
Training Acc:  0.5208
Counter({1.0: 2661, 0.0: 2439}) Counter({0.0: 2547, 1.0: 2353})


### 6. Testing

In [27]:
# testing
# ===================================================================
# pivots
test_clusters = [[] for _ in range(K)]
cutoff = 0.5
for point in testing_data:
    if euclidean_distance(pivots[0], point[: -1]) / euclidean_distance(pivots[1], point[: -1]) < cutoff / (1 - cutoff):
        test_clusters[0].append(point)
    else:
        test_clusters[1].append(point)
print("testing Acc", clusters_acc(test_clusters))
print(len(test_clusters[0]), len(test_clusters[1]))

testing Acc 0.5087
4369 5631


In [36]:
# Compute TN, TP, FN, FP, etc.
# ===========================================================
ROC = Table(make_array('CUTOFF', 'TN', 'FN', 'FP', 'TP', 'ACC'))
step_size = 0.05
for cutoff in np.arange(0, 1 + step_size, step_size):
    
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Testing: [%-20s] %d%%" % ('='*int(cutoff * 100 / 5), int(cutoff * 100)))
    sys.stdout.flush()
    
    test_clusters = clusters = [[] for _ in range(K)]
    for point in testing_data:
        if euclidean_distance(pivots[0], point[: -1]) / euclidean_distance(pivots[1], point[: -1]) < cutoff / (1 - cutoff):
            test_clusters[0].append(point)
        else:
            test_clusters[1].append(point)
    counter1, counter2 = collections.Counter([point[-1] for point in test_clusters[0]]), collections.Counter([point[-1] for point in test_clusters[1]])
    one_rate1 = counter1.get(1, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    zero_rate1 = counter1.get(0, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    one_rate2 = counter2.get(1, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    zero_rate2 = counter2.get(0, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    dataset_size = sum(counter1.values()) + sum(counter2.values())
    if one_rate1 > one_rate2:
        # counter1 -> label 1, counter2 -> label 0
        TP = counter1.get(1, 0)
        FN = counter1.get(0, 0)
        FP = counter2.get(1, 0)
        TN = counter2.get(0, 0)
    else:
        # counter1 -> label 0, counter2 -> label 1
        TP = counter1.get(1, 0)
        FN = counter1.get(0, 0)
        FP = counter2.get(1, 0)
        TN = counter2.get(0, 0)
    output = [cutoff, TN, FN, FP, TP]
    acc = (output[1] + output[4]) / (sum(output[1:]) + 0.00000001)
    output.append(acc)
    ROC = ROC.with_row(output)
ROC = ROC.with_columns('SENSITIVITY', ROC.apply(lambda TP, FN: TP / (TP + FN + 0.00000001), 'TP', 'FN'))
ROC = ROC.with_columns('FPR', ROC.apply(lambda TN, FP: FP / (TN + FP + 0.00000001), 'TN', 'FP'))
ROC = ROC.with_column('FMEAS', ROC.apply(lambda TP, FP, FN: 2 * (TP / (TP + FN)) * (TP / (TP + FP)) / (TP / (TP + FN) + TP / (TP + FP)), 'TP', 'FP', 'FN'))



  


In [37]:
ROC.show()

CUTOFF,TN,FN,FP,TP,ACC,SENSITIVITY,FPR,FMEAS
0.0,5014,0,4986,0,0.5014,0.0,0.4986,
0.05,5014,0,4986,0,0.5014,0.0,0.4986,
0.1,5014,0,4986,0,0.5014,0.0,0.4986,
0.15,5014,0,4986,0,0.5014,0.0,0.4986,
0.2,5014,0,4986,0,0.5014,0.0,0.4986,
0.25,5014,0,4986,0,0.5014,0.0,0.4986,
0.3,5014,0,4986,0,0.5014,0.0,0.4986,
0.35,5014,0,4986,0,0.5014,0.0,0.4986,
0.4,5009,5,4984,2,0.5011,0.285714,0.498749,0.000801122
0.45,4138,876,4179,807,0.4945,0.479501,0.502465,0.242015


In [38]:
# Acc Curve by cutoff
# ===========================================================
matplotlib.use('TkAgg')
fig = plt.figure()
plt.xlabel('Cutoff')
plt.ylabel('Accuracy')
plt.title('Accuracy - Cutoff of K means')
plt.plot(np.arange(0, 1.1, 0.1), [0.5 for i in np.arange(0, 1.1, 0.1)], color='black')
plt.plot(ROC.column('CUTOFF'), ROC.column('ACC'), color='orange')
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('K means ACC.png', bbox_inches='tight')

In [39]:
# ROC_CURVE
# ===========================================================
fig = plt.figure()
plt.xlabel('False Positive Rate')
plt.ylabel('Sensitivity')
plt.title('ROC - Curve of K means')
plt.plot(ROC.column('FPR'), ROC.column('SENSITIVITY'), color='orange')
plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1), color='black')
plt.legend(['K means', 'Null'])
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('K means ROC.png', bbox_inches='tight')

In [40]:
# Compute AUC
# ===========================================================
length = len(ROC.column('FPR'))
auc = 0
for i in range(length - 1):
    auc += 0.5 * abs(ROC.column('FPR')[i + 1] - ROC.column('FPR')[i]) * (ROC.column('SENSITIVITY')[i] + ROC.column('SENSITIVITY')[i + 1])
print("auc = %.03f" %auc)

auc = 0.757


### Apparently it's not working, just as discussed in the presentation, 
### I also tried to drop categorical data and train only with numerical data.
### The results are similar

### 1. Init and Drop Categorical Data

In [41]:
# Initialize useful data
# ===========================================================
cate_columns = [0, 2, 3, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 29, 30, 31]
df = pd.read_csv('clinvar_conflicting_clean.csv', low_memory=False)
df = df.fillna(value=0)
columns_backup = df.columns
df = df.drop([columns_backup[i] for i in cate_columns], axis=1) # eliminate categorical columns
attributes = list(df.columns)
attribute_dimension = len(attributes) - 1 # eliminate the CLASS column
all_rows = df.values.tolist()
row_num = len(all_rows)
df = df.sample(n = df.shape[0])
columns_backup = df.columns
df.head()

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,EXON,INTRON,STRAND,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
41076,32912935,0.0,0.0,0.0,1.0,0.407407,0.0,1.0,0.0896,0.0,0.0,0.0,0
16678,149356516,0.0,0.0,0.0,1.0,0.222222,0.0,-1.0,0.0298,0.098,-0.644187,0.0,0
3266,201063084,0.0007,0.00029,0.0,1.0,0.068182,0.0,-1.0,0.03,9.556,0.814941,0.0,1
11726,207012392,0.0023,0.00195,0.001,1.0,0.0,0.333333,-1.0,0.208,10.93,1.048666,0.0,1
62394,29091761,0.0,0.0,0.0,1.0,0.75,0.0,-1.0,0.356,11.77,1.203834,-3.0,0


### 2. Balancing

In [42]:
# my balancing
# ===========================================================
df_zero = df.loc[df['CLASS'] == 0]
df_zero = df_zero.sample(n=10000)
df_one = df.loc[df['CLASS'] == 1]
df_one = df_one.sample(n=10000)

df = pd.concat([df_zero, df_one])
df = df.sample(n = df.shape[0])
all_rows = df.values.tolist()
row_num = len(all_rows)
df.head()

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,EXON,INTRON,STRAND,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
47409,3646275,0.0044,0.00664,0.008,1.0,0.533333,0.0,-1.0,0.0,12.43,1.330972,0.0,0
63240,51065757,0.0,4e-05,0.0,1.0,0.333333,0.0,-1.0,0.00558,29.9,6.454018,-1.0,0
57146,8367427,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,0.125,7.78,0.546868,1.0,0
32971,6638370,0.0,2e-05,0.0,1.0,0.461538,0.0,-1.0,0.115,33.0,7.097407,-3.0,0
42128,32971146,0.0,0.0,0.0,1.0,0.962963,0.0,1.0,0.0896,21.0,2.730815,-1.0,0


### 3. Split Train Test

In [43]:
# Divide whole dataset into training set and testing set
# ===========================================================
training_percentage = 0.5  # percent of partition of training dataset
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size
training_data = all_rows[: training_size]  # training data should include header row
testing_data = all_rows[training_size: ]   # testing data don't need to include header row
# np.bincount([row[-1] for row in training_data])
# np.bincount([row[-1] for row in testing_data])

### 4. K-means Structure Implementation

In [44]:
# i have rescaled all data to confirm that each input dimension ranges within [0, 1]

def gen_random_pivots(k, dim):
    return [np.random.rand(1, dim) for _ in range(k)]

def euclidean_distance(pivot, point):
    # point can have a CLASS entry, we don't want to use it, so eliminate it outside of the function
    
    return np.linalg.norm(np.subtract(pivot, point))

def find_nearest_pivot(point):
    winner = np.random.rand(1, attribute_dimension)
    min_dist = float('inf')
    for i in range(len(pivots)):
        pivot = pivots[i][0]
        temp_dist = euclidean_distance(pivot, point[: -1])
        idx = -2
        if temp_dist < min_dist:
            winner, min_dist, idx = pivot, temp_dist, i
    return winner, idx

def cluster_mean_point(cluster):
    mean_point = np.zeros((1, attribute_dimension))
    for point in cluster:
        mean_point = np.add(mean_point, point[: -1])
    mean_point = np.divide(mean_point, len(cluster) + 0.00000001)
    return mean_point

def clusters_acc(clusters):
    counter1, counter2 = collections.Counter([point[-1] for point in clusters[0]]), collections.Counter([point[-1] for point in clusters[1]])
    
    one_rate1 = counter1.get(1, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    zero_rate1 = counter1.get(0, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    one_rate2 = counter2.get(1, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    zero_rate2 = counter2.get(0, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    dataset_size = sum(counter1.values()) + sum(counter2.values())
    if one_rate1 > one_rate2:
        # counter1 -> label 1, counter2 -> label 0
        return (counter1.get(1, 0) + counter2.get(0, 0)) / dataset_size
    else:
        return (counter1.get(0, 0) + counter2.get(1, 0)) / dataset_size
    '''
    return (counter1.get(0, 0) + counter2.get(1, 0)) / (sum(counter1.values()) + sum(counter2.values()))
    '''

### 5. Train

In [45]:
# unsupervised clustering
# ==============================================
K = 2
n = 20 # number of try to run
converge_radius = 0.002

print("Training_size: %d" % training_size)
winner = []
min_var = float('inf')
max_acc = -float('inf')

for i in range(n):
    
    # randomly select K distinct data points
    pivots = gen_random_pivots(K, attribute_dimension)
    
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Training: [%-20s] %d%%" % ('='*int((i + 1) * (100 / n) / 5), int((i + 1) * (100 / n))))
    sys.stdout.flush()
#     sleep(0.25)
    
    while True:
        # clusters = dict() # maybe it would be better to instantize this with a dictionary
        clusters = [[] for _ in range(K)] # init K empty clusters
    
        for point in training_data:
            
            # each row of training_data is a high-dimensional point
            _, idx_of_pivot = find_nearest_pivot(point)
            clusters[idx_of_pivot].append(point)
        temp_pivots = [cluster_mean_point(cluster) for cluster in clusters]
        
        # loop until init_pivots converge
        if np.linalg.norm(np.subtract(pivots, temp_pivots)) < converge_radius:
            break
        pivots = list(temp_pivots)
    
    # pick the best clustering with the most difference between each other
    cluster_sizes = [len(cluster) for cluster in clusters]
    temp_var = np.var(cluster_sizes)
    temp_acc = clusters_acc(clusters)
    if temp_acc > max_acc:
        winner = list(clusters)
        max_acc = temp_acc

'''
* TP: Prediction is True + Predicted value is Positive
* FP: Prediction is False + Predicted value is Positive
* TN: Prediction is True + Predicted value is Negative
* FN: Prediction is False + Predicted value is Negative

* Accuracy = $\frac{TP + TN}{TP + FN + FP + TN}$
* Sensitivity (TPR) = $\frac{TP}{TP + FN}$
* Specificity (FPR) = $\frac{TN}{TN + FP}$
'''

# cause we don't really have cutoff points here, so computing TP, .. would be meaningless
# here we only care about Acc
print("\nwinner: ", len(winner[0]), len(winner[1]))
print("Training Acc: ", max_acc)
counter1, counter2 = collections.Counter([point[-1] for point in winner[0]]), collections.Counter([point[-1] for point in winner[1]])
print(counter1, counter2)

Training_size: 10000
winner:  3567 6433
Training Acc:  0.5068
Counter({0.0: 1794, 1.0: 1773}) Counter({1.0: 3274, 0.0: 3159})


### 6. Test

In [46]:
# testing
# ===================================================================
# pivots
test_clusters = [[] for _ in range(K)]
cutoff = 0.5
for point in testing_data:
    if euclidean_distance(pivots[0], point[: -1]) / euclidean_distance(pivots[1], point[: -1]) < cutoff / (1 - cutoff):
        test_clusters[0].append(point)
    else:
        test_clusters[1].append(point)
print("testing Acc", clusters_acc(test_clusters))
print(len(test_clusters[0]), len(test_clusters[1]))

testing Acc 0.5047
3520 6480


In [47]:
# Compute TN, TP, FN, FP, etc.
# ===========================================================
ROC = Table(make_array('CUTOFF', 'TN', 'FN', 'FP', 'TP', 'ACC'))
step_size = 0.05
for cutoff in np.arange(0, 1 + step_size, step_size):
    
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Testing: [%-20s] %d%%" % ('='*int(cutoff * 100 / 5), int(cutoff * 100)))
    sys.stdout.flush()
    
    test_clusters = clusters = [[] for _ in range(K)]
    for point in testing_data:
        if euclidean_distance(pivots[0], point[: -1]) / euclidean_distance(pivots[1], point[: -1]) < cutoff / (1 - cutoff):
            test_clusters[0].append(point)
        else:
            test_clusters[1].append(point)
    counter1, counter2 = collections.Counter([point[-1] for point in test_clusters[0]]), collections.Counter([point[-1] for point in test_clusters[1]])
    one_rate1 = counter1.get(1, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    zero_rate1 = counter1.get(0, 0) / (counter1.get(1, 0) + counter1.get(0, 0) + 0.00000001)
    one_rate2 = counter2.get(1, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    zero_rate2 = counter2.get(0, 0) / (counter2.get(1, 0) + counter2.get(0, 0) + 0.00000001)
    dataset_size = sum(counter1.values()) + sum(counter2.values())
    if one_rate1 > one_rate2:
        # counter1 -> label 1, counter2 -> label 0
        TP = counter1.get(1, 0)
        FN = counter1.get(0, 0)
        FP = counter2.get(1, 0)
        TN = counter2.get(0, 0)
    else:
        # counter1 -> label 0, counter2 -> label 1
        TP = counter1.get(1, 0)
        FN = counter1.get(0, 0)
        FP = counter2.get(1, 0)
        TN = counter2.get(0, 0)
    output = [cutoff, TN, FN, FP, TP]
    acc = (output[1] + output[4]) / (sum(output[1:]) + 0.00000001)
    output.append(acc)
    ROC = ROC.with_row(output)
ROC = ROC.with_columns('SENSITIVITY', ROC.apply(lambda TP, FN: TP / (TP + FN + 0.00000001), 'TP', 'FN'))
ROC = ROC.with_columns('FPR', ROC.apply(lambda TN, FP: FP / (TN + FP + 0.00000001), 'TN', 'FP'))
ROC = ROC.with_column('FMEAS', ROC.apply(lambda TP, FP, FN: 2 * (TP / (TP + FN)) * (TP / (TP + FP)) / (TP / (TP + FN) + TP / (TP + FP)), 'TP', 'FP', 'FN'))



  


In [48]:
ROC.show()

CUTOFF,TN,FN,FP,TP,ACC,SENSITIVITY,FPR,FMEAS
0.0,5047,0,4953,0,0.5047,0.0,0.4953,
0.05,4838,209,4735,218,0.5056,0.510539,0.49462,0.0810409
0.1,4742,305,4629,324,0.5066,0.515103,0.493971,0.116087
0.15,4553,494,4461,492,0.5045,0.498986,0.494897,0.165684
0.2,4207,840,4049,904,0.5111,0.518349,0.490431,0.269972
0.25,4092,955,3920,1033,0.5125,0.519618,0.489266,0.297652
0.3,3931,1116,3764,1189,0.512,0.515835,0.489149,0.327638
0.35,3685,1362,3531,1422,0.5107,0.510776,0.489329,0.367584
0.4,3463,1584,3426,1527,0.499,0.490839,0.497315,0.37872
0.45,3348,1699,3338,1615,0.4963,0.487326,0.499252,0.39071


In [49]:
# Acc Curve by cutoff
# ===========================================================
matplotlib.use('TkAgg')
fig = plt.figure()
plt.xlabel('Cutoff')
plt.ylabel('Accuracy')
plt.title('Accuracy - Cutoff of K means')
plt.plot(np.arange(0, 1.1, 0.1), [0.5 for i in np.arange(0, 1.1, 0.1)], color='black')
plt.plot(ROC.column('CUTOFF'), ROC.column('ACC'), color='orange')
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('K means ACC - numer only.png', bbox_inches='tight')

In [50]:
# ROC_CURVE
# ===========================================================
fig = plt.figure()
plt.xlabel('False Positive Rate')
plt.ylabel('Sensitivity')
plt.title('ROC - Curve of K means')
plt.plot(ROC.column('FPR'), ROC.column('SENSITIVITY'), color='orange')
plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1), color='black')
plt.legend(['K means', 'Null'])
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('K means ROC - numer only.png', bbox_inches='tight')

In [51]:
# Compute AUC
# ===========================================================
length = len(ROC.column('FPR'))
auc = 0
for i in range(length - 1):
    auc += 0.5 * abs(ROC.column('FPR')[i + 1] - ROC.column('FPR')[i]) * (ROC.column('SENSITIVITY')[i] + ROC.column('SENSITIVITY')[i + 1])
print("auc = %.03f" %auc)

auc = 0.306


### It's still not working, maybe it's because the dataset mainly relies on categorical data, and numerical data columns are not giving much useful information