In [1]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

import random
from scipy.spatial.distance import cdist
from sklearn.preprocessing import Imputer
from collections import defaultdict
import itertools
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the patient table from CSV. low_memory=False makes pandas parse each
# column in one pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv('allPatients.csv',low_memory=False)

In [3]:
# Split the table into the label column (Y) and the predictor columns (X).
Y = df['Classes']
X = df.drop('Classes', axis=1)
features = X.columns

# '?' cells are rewritten to 'NaN' so that the imputer treats them as missing
# and fills them with the mean of their column (axis=0).
X = X.replace(to_replace='?', value='NaN')
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# The imputer returns a bare array; wrap it again with the original column names.
X = pd.DataFrame(imp.fit_transform(X), columns=features)

In [4]:
# Random forest with entropy splits; each split considers 100 candidate features.
forestParams = dict(n_estimators=300, criterion='entropy', max_features=100)
clf = RandomForestClassifier(**forestParams)
# Kept as the last expression so the cell still displays the fitted estimator.
clf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=100, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [5]:
# Count, per feature column, how many split nodes across all trees of the
# forest test that feature. The counter is sized from X instead of a
# hard-coded column count so the cell keeps working if the data set changes.
nFeatures = X.shape[1]
freqFeatures = np.zeros(shape=nFeatures, dtype=int)
for tree in clf:
    nodeFeatures = tree.tree_.feature
    # Leaf nodes hold the sentinel -2 instead of a feature index; skip them.
    splitFeatures = nodeFeatures[nodeFeatures >= 0]
    freqFeatures += np.bincount(splitFeatures, minlength=nFeatures)

In [14]:
# Column positions ordered by split count, most frequently used first; keep 200.
# (The counts are non-negative, so no abs() is needed before sorting.)
top200Features = freqFeatures.argsort()[::-1][:200]
# Persist the positions of the 100 most frequently used features.
pd.DataFrame(top200Features[:100]).to_csv('top100Features.csv')
# argsort yields positions, not column labels, so select strictly by position.
# (.ix guessed between label and position and no longer exists in pandas >= 1.0.)
featureSelectedX = X.iloc[:, top200Features]

array([5248, 7369, 1577, 4820, 5182, 2221, 2606, 6535, 3756, 7413, 4668,
       8064, 8035, 4081, 5125, 4100,  616, 5640, 3418,  429, 3272, 5991,
       3745, 2410, 1812, 4470, 2017, 7175, 2991, 6036, 2282, 2485, 6062,
       1409,  683, 6258, 6557, 2435,  604, 5327, 6115, 4561,  352, 4920,
       7284, 3817, 6841, 5323,  802, 2855, 6737,  520, 7754, 2734, 7247,
       6717, 1630, 7549, 6796, 3109, 5708, 2159, 4072, 4658, 3192, 6755,
        320, 3789, 6361,  177, 5440, 1016, 5948, 1665, 1025,  416, 5978,
       2831, 1671,  974, 3633, 3643, 8512, 1586, 7324, 1573, 4864, 4127,
       3650, 2096, 5892, 8338, 1454, 3388, 1832, 6536, 5486,  446, 4446,
       5506, 8330, 3822, 6364, 5497, 2153, 3436, 8320, 1305, 1306, 4434,
       8218, 1777, 7582, 5584, 7540, 7478, 3401, 6481,  421, 6489,  411,
       5053, 1553, 1805, 2430, 6496, 3387, 4507, 7537, 3408, 5529, 2194,
       7492, 7495, 2671, 6532, 7398, 1545, 5076, 1814, 1349,  125, 2271,
       5850, 1903, 7897, 4775, 5996, 3635, 3638, 48

In [16]:
def getClusterGroups(labels,data,k):
    """Split the rows of ``data`` into one DataFrame per cluster label.

    Parameters
    ----------
    labels : array-like of int
        Cluster label of each row of ``data``, in row order.
    data : pandas.DataFrame
        The clustered observations, one row per entry of ``labels``.
    k : int
        Number of clusters; labels 0 .. k-1 are extracted.

    Returns
    -------
    dict
        Maps "cluster<i>" to the rows of ``data`` whose label equals i.
        A label with no rows maps to an empty DataFrame.
    """
    labels = np.asarray(labels)
    # Labels are aligned with row order, so rows are picked by position
    # (.iloc) and the result does not depend on what the index holds.
    return {
        "cluster" + str(i): data.iloc[np.flatnonzero(labels == i)]
        for i in range(k)
    }

def getDistances(clusters,distanceMeasure):
    """Compute inter-cluster distances for every unordered pair of clusters.

    Parameters
    ----------
    clusters : dict
        Maps a cluster name to a DataFrame whose rows are that cluster's points.
    distanceMeasure : str or callable
        Metric forwarded unchanged to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    dict
        For each pair "<a> <b>" four entries, keyed by the pair followed by
        " Single Link" (smallest point-to-point distance), " complete Link"
        (largest), " average" (mean over all point pairs) and " centriod"
        (distance between the two cluster means).
    """
    distDict = dict()
    for first, second in itertools.combinations(clusters, 2):
        pairName = first + " " + second
        dist = cdist(clusters[first], clusters[second], distanceMeasure)

        # NaN distances are ignored when taking the extremes.
        distDict[pairName + " Single Link"] = np.nanmin(dist)
        distDict[pairName + " complete Link"] = np.nanmax(dist)

        # For the average, non-finite entries are replaced by nan_to_num
        # (NaN becomes 0) and still count towards the number of point pairs.
        distDict[pairName + " average"] = np.nan_to_num(dist).sum() / dist.size

        # Each centroid is the column-wise mean, reshaped to a one-row frame
        # so cdist returns a 1x1 matrix; amax extracts its only value.
        centroidFirst = clusters[first].mean().to_frame().transpose()
        centroidSecond = clusters[second].mean().to_frame().transpose()
        distDict[pairName + " centriod"] = np.amax(
            cdist(centroidFirst, centroidSecond, distanceMeasure))
    return distDict

In [17]:
# Fit one standard (Euclidean) k-means model per cluster count, keyed by name.
estimatorsEuclidean = {
    'k_means_euclidean_{0}'.format(nClusters): KMeans(n_clusters=nClusters).fit(featureSelectedX)
    for nClusters in (2, 3, 4)
}

def new_euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False): 
    """Cosine distance exposed with the signature of ``euclidean_distances``.

    Parameters
    ----------
    X, Y : array-like
        Row vectors to compare; forwarded to ``cosine_similarity``.
    Y_norm_squared, squared
        Accepted only for signature compatibility; both are ignored.

    Returns
    -------
    ndarray
        Pairwise cosine distances, i.e. ``1 - cosine_similarity(X, Y)``.
    """
    # A distance must grow as vectors get less alike. Returning the raw
    # similarity would rank the least similar vector as the "nearest" one.
    return 1.0 - cosine_similarity(X,Y)

# Cosine-based clustering.
# Assigning a function to KMeans.euclidean_distances does not change the metric:
# KMeans calls the euclidean_distances function of its own module and never
# looks up an attribute of that name, so such a patch leaves plain Euclidean
# k-means in place. Instead, scale every row to unit length before clustering:
# for unit vectors ||a - b||^2 = 2 * (1 - cos(a, b)), so Euclidean k-means on
# the scaled rows assigns each point to its nearest centre by cosine distance.
rowNorms = np.linalg.norm(featureSelectedX.values, axis=1)
rowNorms[rowNorms == 0] = 1.0  # leave all-zero rows as they are instead of dividing by zero
normalizedX = featureSelectedX.div(rowNorms, axis=0)
estimatorsDot = {
    'k_means_dot_{0}'.format(nClusters): KMeans(n_clusters=nClusters).fit(normalizedX)
    for nClusters in (2, 3, 4)
}

In [18]:
# For every fitted model, regroup the rows by cluster label, measure the
# inter-cluster distances with the matching metric, and write them to one CSV
# per cluster count. Each entry: (fitted models, cdist metric, file pattern).
exportPlan = (
    (estimatorsEuclidean, 'euclidean', 'ResultsEuclideanK({0}).csv'),
    (estimatorsDot, 'cosine', 'ResultsDotK({0}).csv'),
)
for estimators, metric, filePattern in exportPlan:
    for cluster in estimators:
        clusterLabels = estimators[cluster].labels_
        # The cluster count is taken from the labels actually assigned.
        k = len(np.unique(clusterLabels))
        groups = getClusterGroups(clusterLabels, featureSelectedX, k)
        dist = getDistances(groups, metric)
        # One row per distance entry: the dict becomes a single column.
        pd.DataFrame([dist]).transpose().to_csv(filePattern.format(k))
