In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the main table from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('df_1518.pkl')

In [3]:
# Load the pickled spatial labels; presumably this supplies the 'spatial_label' column
# filtered on further down — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
spatial_label = pd.read_pickle('../data_processeing/spatial_labels.pkl')

In [5]:
# Place the spatial labels next to the main table column-wise. pd.concat(axis=1) aligns rows
# on the index, so both objects are assumed to share the same index — TODO confirm.
df_new = pd.concat([df,spatial_label], axis=1)

In [6]:
# Keep only the rows with stars >= 4, spatial_label == 7 and cuisine_Chinese == 2.
# NOTE(review): what the value 2 encodes for cuisine_Chinese is not visible in this notebook —
# confirm it is the intended category.
df_select = df_new[(df_new['stars'] >= 4) & (df_new['spatial_label'] == 7) & (df_new['cuisine_Chinese'] == 2)]

# Spectral Clustering

In [7]:
# Build the feature matrix: every column from 'AgesAllowed' through
# 'review_count_greater_median' (a label slice, so both ends are included), plus 'stars'.
X = df_select.loc[:, 'AgesAllowed': 'review_count_greater_median']
X = pd.concat([X, df_select[['stars']]], axis=1)
# Cast stars to strings so that get_dummies treats it as a categorical column.
X['stars'] = X['stars'].apply(str)
# One-hot encode the string/categorical columns, dropping the first level of each
# and creating no extra indicator column for missing values.
X = pd.get_dummies(X, dummy_na=False, drop_first=True)
def true_false(x):
    """Return 1 when ``x == True`` holds, and 0 for any other value."""
    return 1 if x == True else 0
# Recode the flag column as integers: 1 where the value equals True, 0 otherwise
# (presumably the column is boolean — TODO confirm).
X['review_count_greater_median'] = X['review_count_greater_median'].apply(true_false)

In [8]:
# Try 2, 3 and 4 spectral clusters. For each run print the silhouette score
# (cityblock metric) followed by the number of points carrying each label 0-4.
for n_clusters in range(2,5):
    spectural_clustering = SpectralClustering(n_clusters=n_clusters, random_state=100, affinity='sigmoid').fit(X)
    labels = spectural_clustering.labels_
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print(silhouette_score(X, labels, metric='cityblock'))
    label_list = list(labels)
    # Labels that do not occur for this n_clusters simply print 0.
    for cluster_id in range(5):
        print(label_list.count(cluster_id))

0.0604747245331
161
8
0
0
0
0.0179897292672
160
8
1
0
0
0.043740098015
7
153
8
1
0


In [9]:
# Fit the final 2-cluster spectral model and keep its labels.
# NOTE(review): random_state is 999 here but 100 in the scan over n_clusters, so this fit
# is not guaranteed to reproduce the 2-cluster split reported by that scan.
spectural_clustering = SpectralClustering(n_clusters=2, random_state=999, affinity='sigmoid').fit(X)
labels_spectural = spectural_clustering.labels_

In [10]:
df_select.shape

(169, 67)

# KMeans

In [11]:
## function that gets silhouette scores for a clustering method
def get_silhouette_score(X, cluster_method, method='complete'):
    """Print the average silhouette score for every cluster count from 2 to 9.

    Parameters
    ----------
    X : array-like
        Data handed both to the clustering model and to ``silhouette_score``.
    cluster_method : {'kmeans', 'gaussian_mix', 'hierarchical'}
        Clustering algorithm to evaluate.
    method : str, default 'complete'
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``; only
        used when ``cluster_method == 'hierarchical'``.

    Returns
    -------
    None
        One line per cluster count is printed.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported names (previously
        such a call silently did nothing).
    """
    supported_methods = ('kmeans', 'gaussian_mix', 'hierarchical')
    if cluster_method not in supported_methods:
        raise ValueError(
            "cluster_method must be one of {}, got {!r}".format(
                supported_methods, cluster_method))

    # Range of cluster counts to try.
    range_n_clusters = range(2, 10)

    if cluster_method == 'hierarchical':
        # Imported here because the notebook's import cell does not provide
        # these names; without it this branch failed with a NameError.
        from scipy.cluster.hierarchy import fcluster, linkage
        # The linkage matrix does not depend on the number of clusters, so it
        # is computed once and cut at different levels below.
        Z = linkage(X, method)

    for n_clusters in range_n_clusters:
        if cluster_method == 'kmeans':
            km_result = KMeans(n_clusters=n_clusters, random_state=22).fit(X)
            cluster_labels = km_result.labels_
        elif cluster_method == 'gaussian_mix':
            gm = GaussianMixture(n_components=n_clusters, random_state=22).fit(X)
            cluster_labels = gm.predict(X)
        else:  # 'hierarchical'
            cluster_labels = fcluster(Z, n_clusters, criterion='maxclust')

        silhouette_avg = silhouette_score(X, cluster_labels, random_state=22)
        print("For n_clusters ={},{} - average silhouette_score :{}".format(
            n_clusters, cluster_method, silhouette_avg))

In [12]:
## choose the number of clusters using silhouette method
get_silhouette_score(X, 'kmeans')

For n_clusters =2,kmeans - average silhouette_score :0.153933490359
For n_clusters =3,kmeans - average silhouette_score :0.0895060167385
For n_clusters =4,kmeans - average silhouette_score :0.0960596183143
For n_clusters =5,kmeans - average silhouette_score :0.0941486644697
For n_clusters =6,kmeans - average silhouette_score :0.0916804580175
For n_clusters =7,kmeans - average silhouette_score :0.0956989968796
For n_clusters =8,kmeans - average silhouette_score :0.0840999060067
For n_clusters =9,kmeans - average silhouette_score :0.0849166946159


In [13]:
n_clusters = 2  # number of clusters

# Train the final k-means model and keep the label assigned to each row.
km=KMeans(n_clusters=n_clusters, random_state=22).fit(X)
labels_km = km.labels_

# Cluster sizes. print(...) with one argument behaves the same on Python 2 and Python 3.
print(list(labels_km).count(0))
print(list(labels_km).count(1))

114
55


In [14]:
## Distance of every point to the centre of its own cluster
# Fit once and reuse the labels. km was built with a fixed random_state, so the second
# fit_predict call that used to sit in the concat below only repeated the same fit.
scor=km.fit_predict(X)

res_p=pd.DataFrame(km.transform(X))  ## distances of each data point to each cluster center
res_p=pd.concat((res_p, pd.DataFrame(scor)),axis=1)

res_p.columns=list(range(n_clusters))+["cluster"]
# For each row, pick the distance column that matches the row's assigned cluster.
res_p.loc[:,"score"]=res_p.apply(lambda x: x[int(x["cluster"])],axis=1)
# Show the five points that lie farthest from their own centre.
res_p.sort_values("score",ascending=False)[:5]

Unnamed: 0,0,1,cluster,score
69,3.367713,3.04555,1,3.04555
83,3.033338,3.125105,0,3.033338
92,3.421975,2.898732,1,2.898732
46,2.930376,2.730786,1,2.730786
166,3.182917,2.710738,1,2.710738


In [15]:
X_df = X.copy()

In [16]:
X_df['km'] = labels_km

In [17]:
X_df['distance_KM'] = res_p.score.values

In [18]:
## Get Anomalies
def get_anomaly(df, label_col, dist_or_likelihood, thres):
    """Return the rows whose score is an outlier within their own cluster.

    A row is kept when the absolute deviation of its ``dist_or_likelihood``
    value from its cluster's mean, divided by that cluster's standard
    deviation, is greater than ``thres``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the cluster labels and the score column.
    label_col : str
        Name of the column with the cluster label of each row.
    dist_or_likelihood : str
        Name of the column with the per-row score.
    thres : float
        Threshold on the standardised deviation.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` with their original index, columns and
        dtypes, grouped by label in order of first appearance. Empty when
        nothing exceeds the threshold.
    """
    scores_by_cluster = df.groupby(label_col)[dist_or_likelihood]
    clustermean = scores_by_cluster.mean()
    clusterstd = scores_by_cluster.std()

    # Collect the per-cluster selections and concatenate once at the end.
    # Seeding the result with an empty, object-typed frame and concatenating
    # inside the loop is quadratic and can degrade the column dtypes.
    pieces = []
    for label in df[label_col].unique():
        deviation = abs(df[dist_or_likelihood] - clustermean[label]) / clusterstd[label]
        pieces.append(df[(df[label_col] == label) & (deviation > thres)])

    if not pieces:
        # No labels at all (empty input): hand back an empty frame with df's schema.
        return df.iloc[0:0].copy()
    return pd.concat(pieces, axis=0)

In [19]:
## get anomalies based on kmeans distance
km_anomalies = get_anomaly(X_df, 'km', 'distance_KM', 2)

In [20]:
km_anomalies['distance_KM']

y8d90Pt16Nip-B5UXWBP-w    2.650604
UTXNr62dGSK-tet8OeXUcQ    3.033338
Ec9CBmL3285XkeHaNp-bSQ    2.549389
A5Rkh7UymKm0_Rxm9K2PJw    2.653911
-Uix-n4Jqo4W7ERagC5qAA    2.590350
2xbxXWeu3tpZEJdGTTGbLg    1.579099
zjvnqTjBp56NhMp1GrlO5g    3.045550
3pSUr_cdrphurO6m1HMP9A    2.898732
Name: distance_KM, dtype: float64

# Gaussian Mixture

In [21]:
##### Gaussian Mixture #########

## choose the number of clusters using silhouette method
get_silhouette_score(X, 'gaussian_mix')

For n_clusters =2,gaussian_mix - average silhouette_score :0.177347637193
For n_clusters =3,gaussian_mix - average silhouette_score :0.102471274593
For n_clusters =4,gaussian_mix - average silhouette_score :0.0660608778246
For n_clusters =5,gaussian_mix - average silhouette_score :0.0610811965445
For n_clusters =6,gaussian_mix - average silhouette_score :0.0764446920948
For n_clusters =7,gaussian_mix - average silhouette_score :0.0604045046468
For n_clusters =8,gaussian_mix - average silhouette_score :0.0647091305484
For n_clusters =9,gaussian_mix - average silhouette_score :0.0650270024766


In [22]:
# Fit the final Gaussian mixture with k components and label every row of X
# with the component predicted for it.
k = 2
GM=GaussianMixture(n_components=k,random_state=22)
GM.fit(X)

label_gm = GM.predict(X)

In [28]:
# Number of rows per Gaussian-mixture label 0-4; labels that do not occur print 0.
# NOTE(review): the output saved with this cell sums to 176 (54 + 122) while df_select has
# 169 rows, and the execution count (28) is out of sequence — re-run from a fresh kernel.
label_gm_list = list(label_gm)
for cluster_id in range(5):
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(label_gm_list.count(cluster_id))

54
122
0
0
0


# Isolation Forest

In [23]:
### 3) Isolation Forest
# Fit an isolation forest on X and score every row with it.
from sklearn.ensemble import IsolationForest

# fit the model (seeded so the scores are repeatable)
ISF = IsolationForest(max_samples=100, random_state=22)
ISF.fit(X)

## compute anomaly score of the input. The lower, the more abnormal.
score_isf = ISF.decision_function(X)

In [24]:
score_isf.argmin()

69

# KNN Distance

In [25]:
def dist2knn(x, nn, k):
    """Sum the ``k + 1`` smallest Euclidean distances from ``x`` to the rows of ``nn``.

    Parameters
    ----------
    x : 1-D sequence of numbers
        The query point.
    nn : pandas.DataFrame
        Candidate points, one per row (rows are read with ``.iloc``).
    k : int
        Number of neighbours to count in addition to the closest entry.

    Returns
    -------
    The sum of the ``k + 1`` smallest distances.
    """
    distances = sorted(
        distance.euclidean(x, nn.iloc[row, :]) for row in range(len(nn))
    )
    # k + 1 terms: when x is itself a row of nn, its zero self-distance takes one slot.
    return sum(distances[:k + 1])

In [26]:
import sys  # needed only for the in-place progress indicator in this cell

# For every row of X, sum its distances to the nearest rows of X (k = 5).
dist_sum_knn = []
n_rows = len(X)
for i in range(n_rows):
    # '\r' returns to the start of the line so the percentage overwrites itself.
    # sys.stdout.write works on Python 2 and 3, unlike the `print ...,` statement.
    sys.stdout.write('\r{}%'.format(100.0*(i+1)/n_rows))
    dist_sum_knn.append(dist2knn(X.iloc[i,:], X, 5))

100.0%


# Output

In [27]:
result = pd.DataFrame(index=X.index)

In [28]:
# Gather the output of every method in one table whose rows follow X's index.
result['clusters_sp'] = labels_spectural
result['clusters_km'] = labels_km
# km_anomalies only contains the rows flagged as anomalies and this assignment aligns on
# the index, so distance_km is NaN for every row that was not flagged.
# NOTE(review): use X_df['distance_KM'] instead if the distance is wanted for all rows.
result['distance_km'] = km_anomalies['distance_KM']
result['clusters_gm'] = label_gm
result['scores_isf'] = score_isf
result['distance_knn'] = dist_sum_knn

In [29]:
result

Unnamed: 0,clusters_sp,clusters_km,distance_km,clusters_gm,scores_isf,distance_knn
01aNlDhbMObjc9OdAHuNpQ,0,0,,0,0.089663,9.196152
VzUo-RURV3VnfNItAYM8yg,0,0,,0,0.013683,10.944272
3GfdCuI0YCc5U3rLLLPHUw,0,0,,0,0.045325,10.236068
H_eO04NZAQIDcbtFQ4BUag,1,0,,0,0.087450,8.660254
2xbxXWeu3tpZEJdGTTGbLg,0,1,1.579099,0,0.086864,9.668288
N3zuaqGESF5iZsi_md9c1Q,0,0,,0,0.072284,9.732051
BKg8YIGX_5YyUczmBAyyCQ,0,0,,0,0.105013,8.928203
kABF0hYfAEnl166mn1zR1A,0,0,,0,0.141218,7.071068
5OqrwhtZ3mcmUSwLINZTWQ,0,1,,1,-0.036444,11.391274
ky5L-EfUwU9chSPcIeXM5w,0,0,,0,0.127891,6.656854


In [30]:
# Write the combined results to CSV as UTF-8; index_label=False leaves out the header
# field for the index column.
result.to_csv('LasVegas_chi_results.csv', index_label=False, encoding='utf-8' )