In [2]:
# Import lib
# ===========================================================
import csv
import pandas as pd
import numpy as np
import random
import time
import collections
import math
import sys
from tqdm import tqdm
from time import sleep

import matplotlib.pyplot as plt
# %matplotlib inline
plt.style.use('fivethirtyeight')

from datascience import *
from scipy import stats

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [29]:
# Initialize useful data
# ===========================================================
# Fixed seed so that Restart & Run All always yields the same row order.
shuffle_seed = 0

df = pd.read_csv('clinvar_conflicting_mapped.csv', low_memory=False)

# Shuffle BEFORE the rows are materialised below. Shuffling afterwards would
# only reorder `df` for display and leave `all_rows` (which feeds the
# train/test split) in file order.
df = df.sample(frac=1, random_state=shuffle_seed)

# NOTE(review): `attributes` holds every CSV column. If 'Unnamed: 0' is just a
# saved row index it is being counted (and later used) as a feature -- confirm.
attributes = list(df.columns)
attribute_dimension = len(attributes) - 1  # every column except the trailing CLASS label
all_rows = df.values.tolist()  # one plain Python list per row, CLASS value last
row_num = len(all_rows)
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDN,CLNHGVS,...,Codons,STRAND,BAM_EDIT,SIFT,PolyPhen,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
31271,0.583333,0.888061,0.578522,0.746725,0.066502,0.333733,0.0,0.470327,0.165551,0.336626,...,0.339937,0.9375,0.333333,0.0,0.0,0.264423,0.261809,0.841436,0.0,0
59240,0.5,0.470649,0.578522,0.746725,0.292752,0.555122,0.095352,0.351852,0.879698,0.830015,...,0.662765,0.0625,0.666667,0.8,0.8,0.608543,0.868568,0.444518,0.0,0
56618,0.5,0.208144,0.764434,0.567686,0.0,0.550472,0.0,0.773879,0.037581,0.344174,...,0.622692,0.0625,0.666667,0.8,0.8,0.362426,0.080645,0.627075,2.5e-05,0
17193,0.291667,0.837582,0.815242,0.661572,0.180859,0.291135,0.643987,0.552848,0.89838,0.403893,...,0.577668,0.9375,0.333333,0.6,0.2,0.420673,0.584197,0.6999,0.0,1
28561,0.791667,0.806845,0.684758,0.799127,0.0,0.0,0.0,0.37167,0.929266,0.038259,...,0.143629,0.0625,0.666667,0.0,0.6,0.817308,0.167819,0.325202,0.999924,1


In [37]:
# Divide whole dataset into training set and testing set
# ===========================================================
# Fraction (0..1) of the rows assigned to the training set. With the value 1
# every row is used for training and the testing set is empty.
training_percentage = 1
training_size = int(training_percentage * row_num)
testing_size = row_num - training_size
# Rows keep the order they have in all_rows: the first `training_size` rows
# form the training set and the remainder forms the testing set.
training_data, testing_data = all_rows[:training_size], all_rows[training_size:]

# Instruction
1. Select the number of clusters, k, that you want to identify in your data
2. Randomly select k distinct data points as the initial cluster centers
3. Measure the distance between the 1st point and each of the k cluster centers
4. Assign the 1st point to the nearest cluster
5. Iterate through all remaining points, repeating steps 3 & 4 for each
6. Calculate the mean of each cluster
7. Use the calculated means as the k new cluster centers and restart from step 3
8. Loop until the means converge
9. Repeat steps 1 - 8 n times and select the best result

In [31]:

# All data has been rescaled so that every input dimension lies within [0, 1].

def gen_random_pivots(k, dim):
    """Draw ``k`` random starting pivots.

    Args:
        k: number of pivots to generate.
        dim: number of coordinates per pivot.

    Returns:
        A list of ``k`` arrays, each of shape ``(1, dim)``, sampled with
        ``np.random.rand`` (uniform over [0, 1)).
    """
    drawn = []
    for _ in range(k):
        drawn.append(np.random.rand(1, dim))
    return drawn

def euclidean_distance(pivot, point):
    """Return the Euclidean (L2) distance between ``pivot`` and ``point``.

    The last entry of ``point`` is dropped before measuring, since a data row
    may carry its CLASS value there and that must not influence the distance.

    Args:
        pivot: coordinates of the pivot (one value per feature).
        point: a data row; everything except its last entry is compared.

    Returns:
        The norm of ``pivot - point[:-1]`` as computed by ``np.linalg.norm``.
    """
    features = point[:-1]
    offset = np.subtract(pivot, features)
    return np.linalg.norm(offset)

def find_nearest_pivot(point):
    """Find the pivot closest to ``point``.

    The candidates are read from the module-level ``pivots`` list, whose
    entries are indexed as ``pivots[i][0]`` (i.e. each entry wraps one
    coordinate row). Distances come from ``euclidean_distance``, which ignores
    the last entry of ``point``.

    Args:
        point: a data row; its last entry is not used for the distance.

    Returns:
        A tuple ``(winner, idx)``: the coordinates of the nearest pivot and
        its position in ``pivots``. On ties the earliest pivot wins.

    Raises:
        ValueError: if no pivot is at a comparable finite distance, e.g.
            ``pivots`` is empty or every distance is NaN/inf.
    """
    winner, idx = None, None
    min_dist = float('inf')
    for i, candidate in enumerate(pivots):
        pivot = candidate[0]
        temp_dist = euclidean_distance(pivot, point)
        # idx is only ever updated when a strictly closer pivot is found; it
        # must not be reset per iteration, otherwise the index of an earlier
        # winner is lost whenever a later pivot is farther away.
        if temp_dist < min_dist:
            winner, min_dist, idx = pivot, temp_dist, i
    if idx is None:
        raise ValueError(
            "find_nearest_pivot: no pivot at a finite distance "
            "(empty `pivots`, or NaN/inf in the data)"
        )
    return winner, idx

def cluster_mean_point(cluster):
    """Return the mean feature vector of ``cluster`` as a ``(1, dim)`` array.

    ``dim`` is the module-level ``attribute_dimension``. The last entry of
    every row is left out of the sum. The divisor is the cluster size plus a
    tiny constant, so an empty cluster yields an all-zero vector instead of a
    division by zero.

    Args:
        cluster: iterable of rows (sized, so ``len`` works on it).

    Returns:
        Array of shape ``(1, attribute_dimension)``.
    """
    running_total = np.zeros((1, attribute_dimension))
    for row in cluster:
        features = row[: -1]
        running_total = np.add(running_total, features)
    divisor = len(cluster) + 1e-8
    return np.divide(running_total, divisor)

def clusters_acc(clusters):
    """Score a two-cluster split against the labels stored in each row.

    Only ``clusters[0]`` and ``clusters[1]`` are inspected. The label of a row
    is its last entry. The cluster with the higher share of label 1 is treated
    as the "predicted 1" cluster and the other as the "predicted 0" cluster;
    the score is the fraction of all rows whose label matches that assignment.
    Rows with a label other than 0/1 count toward the total but never as
    correct.

    Args:
        clusters: sequence whose first two items are collections of rows.

    Returns:
        Accuracy in [0, 1]; ``0.0`` when both clusters are empty.
    """
    # Keeps the label-1 rates well defined for a cluster without 0/1 labels.
    eps = 0.00000001

    counter1 = collections.Counter(point[-1] for point in clusters[0])
    counter2 = collections.Counter(point[-1] for point in clusters[1])
    ones1, zeros1 = counter1.get(1, 0), counter1.get(0, 0)
    ones2, zeros2 = counter2.get(1, 0), counter2.get(0, 0)

    dataset_size = sum(counter1.values()) + sum(counter2.values())
    if dataset_size == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    one_rate1 = ones1 / (ones1 + zeros1 + eps)
    one_rate2 = ones2 / (ones2 + zeros2 + eps)
    if one_rate1 > one_rate2:
        # cluster 0 -> label 1, cluster 1 -> label 0
        return (ones1 + zeros2) / dataset_size
    # cluster 0 -> label 0, cluster 1 -> label 1
    return (zeros1 + ones2) / dataset_size

In [38]:
# unsupervised clustering
# ==============================================
K = 2                    # number of clusters (clusters_acc scores the first two)
n = 20                   # number of random restarts
converge_radius = 0.04   # stop once all pivots together move less than this
max_iterations = 100     # safety cap so a restart can never loop forever
random_seed = 0          # fixed so the random restarts are reproducible
np.random.seed(random_seed)

print("training_size: %d" % training_size)
winner = []
min_var = float('inf')   # not used by the accuracy-based selection below
max_acc = -float('inf')

for i in range(n):

    # draw K random starting pivots (random coordinates, not rows of the data)
    pivots = gen_random_pivots(K, attribute_dimension)

    for iteration in range(max_iterations):
        clusters = [[] for _ in range(K)]  # init K empty clusters

        for point in training_data:

            # each row of training_data is a high-dimensional point;
            # find_nearest_pivot reads the module-level `pivots` set above
            _, idx_of_pivot = find_nearest_pivot(point)
            clusters[idx_of_pivot].append(point)
        temp_pivots = [cluster_mean_point(cluster) for cluster in clusters]

        # converged: the norm of the stacked pivot displacements is small
        if np.linalg.norm(np.subtract(pivots, temp_pivots)) < converge_radius:
            break
        pivots = list(temp_pivots)
    else:
        print("warning: restart %d hit max_iterations=%d before converging" % (i, max_iterations))

    # keep the restart whose clusters agree best with the CLASS labels
    # (this uses the labels for model selection only, not for clustering)
    cluster_sizes = [len(cluster) for cluster in clusters]
    temp_acc = clusters_acc(clusters)
    if temp_acc > max_acc:
        winner = list(clusters)
        max_acc = temp_acc

# Reference definitions:
# * TP: prediction is correct and the predicted value is positive
# * FP: prediction is wrong and the predicted value is positive
# * TN: prediction is correct and the predicted value is negative
# * FN: prediction is wrong and the predicted value is negative
#
# * Accuracy = $\frac{TP + TN}{TP + FN + FP + TN}$
# * Sensitivity (TPR) = $\frac{TP}{TP + FN}$
# * Specificity (TNR) = $\frac{TN}{TN + FP}$

# cause we don't really have cutoff points here, so computing TP, .. would be meaningless
# here we only care about Acc
print("winner: ", len(winner[0]), len(winner[1]))
print("Acc: ", max_acc)
counter1, counter2 = collections.Counter([point[-1] for point in winner[0]]), collections.Counter([point[-1] for point in winner[1]])
print(counter1, counter2)

training_size: 65188
winner:  48053 17135
Acc:  0.6217555378290482
Counter({0.0: 36075, 1.0: 11978}) Counter({0.0: 12679, 1.0: 4456})


651