### Player group (team) clusterization

In [None]:
import os
import random
from time import time
from shutil import copy2
from collections import Counter

import numpy as np
import pandas as pd
import cv2
from cv2 import compareHist
from scipy.spatial.distance import (
    cosine, euclidean,
    braycurtis, canberra, chebyshev, correlation, dice, hamming, jaccard,
    kulsinski, mahalanobis, matching, minkowski, rogerstanimoto, russellrao, 
    seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule
)
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics import pairwise_distances
from sklearn import manifold
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import LocalOutlierFactor
import warnings; warnings.filterwarnings('ignore')
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# Feature-vector switch: True selects the embedding pickles ("X_df_embed"/"y_df_embed")
# and the cosine metric; False selects the histogram pickles ("X_df"/"y_df")
# and the Hellinger metric.
use_embeddings = False

In [None]:
# Load the feature vectors (X) and their labels (y); the file pair depends on `use_embeddings`.
# NOTE(review): read_pickle executes pickle data - only load files from a trusted source.
x_data, y_data = ("X_df_embed", "y_df_embed") if use_embeddings else ("X_df", "y_df")
X_df = pd.read_pickle(x_data)
y_df = pd.read_pickle(y_data)

# Keep only the samples labelled 1 or 2 and pair them with their labels.
team_mask = y_df.isin([1, 2])
X_12 = X_df[team_mask]
y_12 = list(y_df[team_mask])
Xy_12_df = X_12.assign(target=y_12)
# Label frequencies: full dataset vs. the 1/2 subset.
Counter(y_df), Counter(y_12)

### TSNE

In [None]:
def draw_tsne(X_12, y_12, perplexity, n_components):
    '''
    Embeds the feature vectors with t-SNE and scatter-plots the first two embedding dimensions.

    X_12: feature vectors, one row per sample.
    y_12: label of every sample; each label must be 0, 1 or 2 (the keys of the colour map below).
    perplexity: perplexity passed to TSNE.
    n_components: dimensionality of the embedding (only dimensions 0 and 1 are drawn).
    Returns the embedded coordinates produced by fit_transform.
    '''
    tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0, perplexity=perplexity)
    X_tsne = tsne.fit_transform(X_12)

    print('Samples in each team', X_12.shape[0] // 2, ', perplexity', perplexity)
    color_dict = {1: 'red', 2: 'yellow', 0: 'black'}
    colors = [color_dict[label] for label in y_12]
    # column slices of the embedding replace the per-row Python loops
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], color=colors)
    return X_tsne

In [None]:
# 2-D t-SNE of the first 200 team samples with perplexity 5.
# NOTE(review): the variable name says 15 but the perplexity passed is 5 - confirm which is intended.
X_tsne_15 = draw_tsne(X_12[0:200], y_12[:200], 5, 2)

In [None]:
# 2-D t-SNE of the first 210 team samples with perplexity 10.
X_tsne_10 = draw_tsne(X_12[0:210], y_12[:210], 10, 2)

In [None]:
# Sanity check: shape of the team feature table vs. number of team labels.
X_12.shape, len(y_12)

### PCA

In [None]:
# Project the first 210 team samples onto 5 principal components and
# draw three pairs of components side by side, coloured by true label.
color_dict = {1: 'red', 2: 'yellow', 0: 'black', 3: 'orange', 4: 'green'}
colors = [color_dict[label] for label in y_12[:210]]
pca = PCA(n_components=5).fit_transform(X_12[:210])  # `pca` holds the projected samples
fig = plt.figure(figsize=(20, 5))
component_pairs = (
    (0, 1, 'PCA 1st and 2nd components'),
    (1, 2, 'PCA 2nd and 3rd components'),
    (0, 2, 'PCA 1st and 3rd components'),
)
for position, (first, second, title) in enumerate(component_pairs, start=1):
    a = fig.add_subplot(1, 3, position)
    a.scatter(pca[:, first], pca[:, second], color=colors)
    a.set_title(title)

## Clusterization

!["cluster-comparison"](extras/cluster_comparison_shadow.png)

### Clusterization - KMeans

In [None]:
def clusterization(model, group_size, number_of_images=1000):
    '''
    Splits the feature dataset X_12 into consecutive windows of 2 * group_size rows,
    fits the model on every window and reports how its cluster labels compare
    with the true labels stored in Xy_12_df.
    '''
    window = 2 * group_size
    for start in range(0, number_of_images, window):
        stop = start + window
        model = model.fit(X_12[start:stop])
        evaluate(Xy_12_df[start:stop], model.labels_)
        print('===========================================================')

def print_clusters(person_type, labels):
    '''
    Prints how many of the given images landed in each cluster.
    Nothing is printed when there are no labels.
    '''
    total = len(labels)
    if total == 0:
        return
    print(person_type, total, 'images:')
    summary = '; '.join(
        f'cluster_{cluster}: {size}' for cluster, size in Counter(labels).items()
    )
    print('>>  ' + summary)
    print('-----------------------------------')

def evaluate(df, labels):
    '''
    Compares clusterization labels with true labels: for the referee (target 0)
    and each team (targets 1 and 2) prints how its members were spread over the clusters.

    df: frame with a 'target' column holding the true label of every row.
    labels: cluster label per row of df; must support boolean-mask indexing.
    '''
    true_groups = ((0, 'Referee:'), (1, 'Team1 -'), (2, 'Team2 -'))
    for target, title in true_groups:
        print_clusters(title, labels[df['target'] == target])

In [None]:
# 2-cluster KMeans, initially fitted on the first 200 team samples;
# clusterization() refits the model on every group it processes.
kmeans = KMeans(max_iter = 100, n_clusters=2, random_state=0).fit(X_12[:200])

In [None]:
# KMeans on the first 100 samples, fitted separately on 5 windows of 20 rows (2 * group_size).
clusterization(kmeans, group_size=10, number_of_images=100)

In [None]:
# KMeans on the first 1000 samples, fitted separately on 5 windows of 200 rows (2 * group_size).
clusterization(kmeans, group_size=100, number_of_images=1000)

### Clusterization - DBSCAN

In [None]:
# Brute-force DBSCAN using the SciPy cosine distance function as the metric.
eps = 0.1
dbscan = DBSCAN(eps=eps, metric=cosine, algorithm='brute', n_jobs=4)
print(f'results for epsilon {eps}\n')
clusterization(dbscan, group_size=10, number_of_images=100)

In [None]:
# Same DBSCAN model on the first 1000 samples, fitted separately on 5 windows of 200 rows.
clusterization(dbscan, group_size=100, number_of_images=1000)

### Clusterization - Agglomerative

![](extras/Hierarchical-Clustering-Dendrogram.png)

[Histogram Comparison in OpenCV](https://docs.opencv.org/2.4/modules/imgproc/doc/histograms.html)

#### Hellinger distance (Bhattacharyya distance)

![HELLINGER](extras/metrics_hellinger.png)
where

H1 – 1st compared histogram,

H2 – 2nd compared histogram of the same size as H1,

N – total number of histogram bins.




In [None]:
def metric_hellinger(x, y):
    '''
    Hellinger comparison of two histograms, computed by OpenCV's compareHist
    on float32 copies of the inputs.
    '''
    return compareHist(x.astype(np.float32), y.astype(np.float32), cv2.HISTCMP_HELLINGER)


def affinity(x, m=metric_hellinger):
    '''
    Pairwise distance matrix between the rows of x under metric m (Hellinger by default).
    '''
    return pairwise_distances(x, metric=m)

In [None]:
# Average-linkage agglomerative clustering through scikit-learn.
print('with *scikit*')
if use_embeddings:
    # embeddings are compared with the built-in cosine affinity instead of the Hellinger callable
    affinity = 'cosine'
# Report the metric that is actually used; the label used to say HELLINGER even for embeddings.
print('Metric COSINE' if use_embeddings else 'Metric HELLINGER')
# NOTE(review): newer scikit-learn releases rename the `affinity` argument to `metric` -
# confirm against the installed version before upgrading.
agglo = AgglomerativeClustering(n_clusters=2, affinity=affinity, memory=None, linkage='average')
clusterization(agglo, 100, 1000)

In [None]:
# Metric (and its display name) used by the cells below:
# cosine for embeddings, Hellinger for histograms.
if use_embeddings:
    default_metric, default_metric_name = 'cosine', 'COSINE'
else:
    default_metric, default_metric_name = metric_hellinger, 'HELLINGER'

In [None]:
# Average-linkage clustering done directly with SciPy on windows of 2 * group_size rows.
print('With *scipy*')
print(f'Metric {default_metric_name}')
group_size = 100
number_of_images = 1000
print('Team size: ', group_size, '\n')
window = 2 * group_size
for i in range(0, number_of_images, window):
    rows = slice(i, i + window)
    y = pdist(X_12[rows], default_metric)
    linkage_matrix = linkage(y, method='average', optimal_ordering=True)
    dd = dendrogram(linkage_matrix, no_plot=True)
    idx = dd['leaves']
    # The first `group_size` leaves (in dendrogram order) are labelled 1, the rest 2;
    # sorting by leaf id puts the labels back into the original row order.
    labels = [1] * group_size + [2] * group_size
    idx_df = pd.Series(labels, index=idx).sort_index()
    evaluate(Xy_12_df[rows], idx_df.values)
    print('============================================================================================')

In [None]:
# Draw the average-linkage dendrogram for the 20 team samples in rows 120-139.
y = pdist(X_12[120: 140], default_metric)
linkage_matrix = linkage(y, method='average', optimal_ordering=True)
dd = dendrogram(linkage_matrix, no_plot=False)

### Clusterization - Spectral

In [None]:
# Spectral clustering into 2 clusters, evaluated on windows of 200 rows.
# NOTE(review): `affinity` is given the same object that is used as a distance elsewhere
# (Hellinger callable or 'cosine'); spectral clustering presumably expects a similarity
# here - confirm that passing a distance is intended.
# NOTE(review): `n_neighbors` presumably only matters for a nearest-neighbours affinity - verify.
spectral = SpectralClustering(n_clusters=2, n_init=100, affinity=default_metric, n_neighbors=10, n_jobs=4)
clusterization(spectral, 100, 1000)

### Referee and Team identification

In [None]:
def agglomerative_clustering(vectors, metric):
    '''
    Builds an average-linkage hierarchy over the vectors and returns the
    sample indices in dendrogram leaf order (used to identify teams).
    '''
    tree = linkage(vectors, metric=metric, method="average", optimal_ordering=True)
    leaf_layout = dendrogram(tree, no_plot=True)
    return leaf_layout["leaves"]

def find_outlier(X, metric):
    '''
    Applies Outlier model to identify referee.

    X: feature vectors of one group.
    metric: distance metric handed to LocalOutlierFactor.
    Returns the position (row number in X) of the sample with the lowest,
    i.e. most abnormal, local-outlier-factor score.
    '''
    referee_classifier = LocalOutlierFactor(n_neighbors=3, contamination=0.04, metric=metric)
    referee_classifier.fit(X)
    # Pick the single most abnormal sample from the raw scores. The thresholded
    # predictions only say "outlier / inlier": their argmin is the first flagged row
    # (not necessarily the most abnormal one) and row 0 when nothing is flagged.
    return referee_classifier.negative_outlier_factor_.argmin()

def cluster(X, y, group_size, outlier_metric, team_metric):
    '''
    Clusters X feature dataset on 2 teams and referee. Returns whether referee and teams were identified.

    X: feature vectors of one group (both teams plus the referee).
    y: true label per row of X: 0 - referee, 1 and 2 - teams.
    group_size: number of rows expected in the group; the leaf order of the team
        dendrogram is cut after (group_size - 1) // 2 leaves.
    outlier_metric: metric passed to find_outlier.
    team_metric: metric passed to agglomerative_clustering.
    Returns a pair (referee_identified, teams_identified).
    Raises IndexError when the group contains no row labelled 0.
    '''
    team_1_features = X[y == 1]
    team_2_features = X[y == 2]
    # position of the (first) referee row inside the group
    referee_true_id = np.flatnonzero(np.asarray(y) == 0)[0]

    referee_predicted_id = find_outlier(X, outlier_metric)
    referee_identified = referee_true_id == referee_predicted_id

    # The dendrogram leaves index rows of `features` (team 1 stacked on top of team 2),
    # so the true ids must use that same numbering. Deriving them from row positions
    # in X is only right when the rows of X are already grouped by team.
    features = np.concatenate([team_1_features, team_2_features])
    team_1_size = len(team_1_features)
    team_1_true_ids = set(range(team_1_size))
    team_2_true_ids = set(range(team_1_size, len(features)))

    idx = agglomerative_clustering(features, team_metric)
    split = (group_size - 1) // 2
    team_1_predicted_ids = set(idx[:split])
    team_2_predicted_ids = set(idx[split:])
    # cluster numbering is arbitrary, so accept either assignment of clusters to teams
    team_clusters_case_1 = (team_1_predicted_ids == team_1_true_ids) and (
        team_2_predicted_ids == team_2_true_ids)
    team_clusters_case_2 = (team_1_predicted_ids == team_2_true_ids) and (
        team_2_predicted_ids == team_1_true_ids)
    return referee_identified, team_clusters_case_1 or team_clusters_case_2

In [None]:
def indentify_people(metrics, group_size=21):
    '''
    Evaluates accuracy of the team and referee clusterization.

    metrics: dict with 'outlier_metric' and 'team_metric' entries, forwarded to cluster().
    group_size: number of consecutive rows of X_df / y_df treated as one group
        (default 21, the value that used to be hard-coded).
    Prints how many groups had the referee identified and how many had both teams clustered.
    '''
    clusterization_results = [
        cluster(X_df[i: i + group_size], y_df[i: i + group_size], group_size, **metrics)
        for i in range(0, len(X_df), group_size)
    ]
    total = len(clusterization_results)
    referees_found = sum(referee_ok for referee_ok, _ in clusterization_results)
    teams_found = sum(teams_ok for _, teams_ok in clusterization_results)

    print(f"Referee identified: {referees_found} out of {total}")
    print(f"2 teams clustered: {teams_found} out of {total}")

#### Canberra distance
![](extras/canberra.png)
where

u - 1st histogram,

v - 2nd histogram.

In [None]:
# Referee detection and team clustering, both with the SciPy Canberra distance.
metrics = {'outlier_metric': canberra, 'team_metric': canberra}
indentify_people(metrics)

In [None]:
# Referee detection and team clustering, both with the SciPy cosine distance.
metrics = {'outlier_metric': cosine, 'team_metric': cosine}
indentify_people(metrics)

In [None]:
# Referee detection and team clustering with the notebook's default metric
# (Hellinger for histograms, cosine for embeddings).
metrics = {'outlier_metric': default_metric, 'team_metric': default_metric}
indentify_people(metrics)