In [1]:
# Import lib
# ===========================================================
import csv
import pandas as pd
import numpy as np
import random
import time
import collections
import math
import sys
from tqdm import tqdm
from time import sleep

import matplotlib.pyplot as plt
# %matplotlib inline
plt.style.use('fivethirtyeight')

from datascience import *
from scipy import stats

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [2]:
# Initialize useful data
# ===========================================================
# The CSV carries an 'Unnamed: 0' column whose displayed values are 0, 1, 2, ...
# -- it looks like a row index that an earlier `to_csv` call serialized
# (NOTE(review): confirm against the script that produced the file). It is not
# a feature, so it is dropped here; otherwise it would be counted in
# `attribute_dimension` and fed into every distance computation.
# `errors='ignore'` keeps the cell working if the file is regenerated without
# that column.
df = (
    pd.read_csv('clinvar_conflicting_mapped.csv', low_memory=False)
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
attributes = list(df.columns)

# Downstream code strips the label with `point[:-1]`, which is only correct
# when CLASS is the last column.
assert attributes[-1] == 'CLASS', "CLASS must be the last column"

attribute_dimension = len(attributes) - 1 # eliminate the CLASS column
all_rows = df.values.tolist()  # one plain Python list per row, label last
row_num = len(all_rows)
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDN,CLNHGVS,...,Codons,STRAND,BAM_EDIT,SIFT,PolyPhen,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CLASS
0,0.875,0.293401,0.578522,0.799127,0.0,0.0,0.0,0.239983,0.194276,0.527505,...,0.806394,0.0625,0.666667,0.8,0.8,0.535503,0.001056,0.650358,0.999949,0
1,0.875,0.293908,0.578522,0.661572,0.0,0.515074,0.379013,0.552848,0.89838,0.452553,...,0.588023,0.0625,0.666667,0.8,0.8,0.535503,0.931644,0.090793,0.0,0
2,0.875,0.294209,0.578522,0.799127,0.0,0.822409,0.771921,0.239983,0.194276,0.759112,...,0.366952,0.0625,0.666667,0.8,0.8,0.535503,0.259697,0.077541,2.5e-05,1
3,0.875,0.312446,0.815242,0.661572,0.865939,0.701065,0.51126,0.239983,0.194276,0.467724,...,0.766772,0.0625,0.666667,0.8,0.8,0.535503,0.884697,0.25908,0.0,0
4,0.875,0.469365,0.815242,0.661572,0.0,0.084446,0.643987,0.552848,0.89838,0.519298,...,0.21837,0.0625,0.666667,0.8,0.8,0.535503,0.261041,0.072687,0.0,1


In [3]:
# Divide whole dataset into training set and testing set
# ===========================================================
# Fraction of all rows (0.001 == 0.1 %) that goes into the training split.
training_percentage = 0.001
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size

# `all_rows` holds data rows only (no header row). The split is positional:
# the first `training_size` rows train, every remaining row tests.
training_data, testing_data = all_rows[:training_size], all_rows[training_size:]

# Instruction
1. Select the number of clusters, k, that you want to identify in your data.
2. Randomly select k distinct data points as the initial cluster centres.
3. Measure the distance between the 1st point and each of the k cluster centres.
4. Assign the 1st point to the nearest cluster.
5. Iterate through all remaining points, repeating steps 3 and 4 for each.
6. Calculate the mean of each cluster.
7. Use the calculated mean of each cluster as the k new cluster centres and repeat from step 3.
8. Loop until the means converge.
9. Repeat steps 1-8 n times and keep the best result.

In [4]:

# The input data has already been rescaled so that every input dimension lies within [0, 1].

def gen_random_pivots(k, dim):
    """Draw ``k`` random starting pivots.

    Args:
        k: number of pivots to generate.
        dim: number of coordinates per pivot.

    Returns:
        A list of ``k`` arrays of shape ``(1, dim)``, each sampled with
        ``np.random.rand`` (uniform over ``[0, 1)``).
    """
    drawn = []
    for _ in range(k):
        drawn.append(np.random.rand(1, dim))
    return drawn

def euclidean_distance(pivot, point):
    """Return the Euclidean (L2) distance between ``pivot`` and ``point``.

    The last entry of ``point`` is skipped because it can hold the CLASS
    label, so ``pivot`` is compared against ``point[:-1]`` only.
    """
    features = point[: -1]
    offset = np.subtract(pivot, features)
    return np.linalg.norm(offset)

def find_nearest_pivot(point):
    """Find the pivot closest to ``point``.

    Reads the module-level ``pivots`` list; each entry is indexed with ``[0]``
    to obtain the coordinate vector before measuring the distance with
    ``euclidean_distance`` (which ignores the last entry of ``point``).

    Args:
        point: a data row whose last entry is the label.

    Returns:
        ``(winner, idx)`` where ``winner`` is the coordinate vector of the
        nearest pivot and ``idx`` is its position in ``pivots``. If no pivot
        produces a distance smaller than infinity (e.g. ``pivots`` is empty or
        every distance is NaN), ``winner`` is a random placeholder and ``idx``
        is the sentinel ``-2``.
    """
    # Fallback values, only returned when the loop never finds a candidate.
    winner = np.random.rand(1, attribute_dimension)
    min_dist = float('inf')
    # The sentinel has to be set once, before the loop. Resetting it on every
    # iteration (as the previous version did) discarded the best index found
    # so far whenever a later pivot was farther away, so the function returned
    # -2 unless the last pivot happened to be the nearest one.
    idx = -2
    for i, wrapped_pivot in enumerate(pivots):
        pivot = wrapped_pivot[0]
        temp_dist = euclidean_distance(pivot, point)
        if temp_dist < min_dist:
            winner, min_dist, idx = pivot, temp_dist, i
    return winner, idx

def cluster_mean_point(cluster):
    """Return the mean feature vector of ``cluster`` as a ``(1, attribute_dimension)`` array.

    The last entry of every point is left out of the sum. A tiny constant is
    added to the divisor so that an empty cluster yields an all-zero vector
    instead of a division by zero.
    """
    accumulator = np.zeros((1, attribute_dimension))
    for row in cluster:
        features = row[: -1]
        accumulator = np.add(accumulator, features)
    divisor = len(cluster) + 0.00000001
    return np.divide(accumulator, divisor)

In [13]:
# unsupervised clustering
# ==============================================
K = 2   # number of clusters
n = 10  # number of random restarts

print("training_size: %d" % training_size)
winner = []
min_var = float('inf')

for i in range(n):
    # Fresh random starting pivots for this restart. `pivots` has to stay a
    # module-level name because find_nearest_pivot() reads it as a global.
    pivots = gen_random_pivots(K, attribute_dimension)
    print("pivots:", pivots)

    while True:
        # Every pass starts from K empty clusters.
        clusters = [[] for _ in range(K)]

        # Each row of training_data is one high-dimensional point; file it
        # under the pivot it is closest to.
        for point in training_data:
            idx_of_pivot = find_nearest_pivot(point)[1]
            clusters[idx_of_pivot].append(point)

        # The cluster means are the candidate pivots for the next pass.
        temp_pivots = list(map(cluster_mean_point, clusters))
        pivot_shift = np.linalg.norm(np.subtract(pivots, temp_pivots))
        if not pivot_shift < 0.1:
            # Pivots still moved noticeably: adopt the means and go again.
            pivots = list(temp_pivots)
            continue
        break

    # Keep the restart whose cluster sizes have the smallest variance.
    cluster_sizes = list(map(len, clusters))
    print(cluster_sizes)
    temp_var = np.var(cluster_sizes)
    if temp_var < min_var:
        winner, min_var = list(clusters), temp_var

print("winner:", len(winner[0]), len(winner[1]))
# Show the label (last entry) of every point in the first winning cluster.
for point in winner[0]:
    print(point[-1])

training_size: 65
pivots: [array([[3.18003550e-01, 2.17696297e-01, 9.48091181e-01, 8.78752195e-01,
        2.18202555e-01, 1.33609249e-01, 3.80668236e-01, 5.69853212e-01,
        5.30136438e-01, 9.49368270e-01, 2.25282623e-01, 1.73957070e-02,
        5.26425961e-01, 3.11164347e-02, 4.49468128e-01, 7.30783355e-01,
        2.29475414e-01, 1.42105580e-01, 5.76756470e-01, 1.17900808e-01,
        6.96351400e-01, 2.23281182e-01, 9.39543277e-01, 1.70626919e-04,
        9.71100722e-01, 7.00333874e-01, 2.29772737e-01, 1.82039973e-01,
        9.88322223e-01, 1.62990795e-01, 1.58441269e-01, 4.92435117e-01,
        4.85090383e-01, 9.64375386e-01, 1.22919369e-01, 4.72999077e-01]]), array([[0.32045071, 0.78342707, 0.99996467, 0.93327034, 0.56403276,
        0.46543535, 0.32942179, 0.49131686, 0.84908677, 0.6795334 ,
        0.89600242, 0.31448221, 0.17003332, 0.86986931, 0.05864774,
        0.6755308 , 0.72179166, 0.22918899, 0.74451131, 0.04505028,
        0.94817193, 0.52392991, 0.45000271, 0.0045

In [6]:
# Scratch check: variance of two example cluster sizes, the same quantity the
# clustering cell computes with np.var(cluster_sizes) to rank restarts.
np.var([556, 549])

12.25

In [7]:
# Scratch lookup: IPython `?` help for np.var (leftover from development).
np.var?

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0mvar[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mddof[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mkeepdims[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute the variance along the specified axis.

Returns the variance of the array elements, a measure of the spread of a
distribution.  The variance is computed for the flattened array by
default, otherwise over the specified axis.

Parameters
----------
a : array_like
    Array containing numbers whose variance is desired.  If `a` is not an
    array, a conversion is attempted.
axis : None or int or tuple of ints, optional
    Axis or axes along which the variance is computed.  The default is to
    compute the variance of the flattened array.

    .. versionadd