<a href="https://colab.research.google.com/github/gali1998/t-sne-tool/blob/master/Tool_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to run the notebook?

1.  Click "Runtime"->"run all" in the upper toolbar of the notebook.
2.  Under "Run for GUI", click the link and choose the Google account whose Google Drive contains the files.
3.  Click "Allow", copy the code into the text box under "Enter your authorization code" in the notebook, and press Enter.
4.	Upload the data file:
* Option one – upload the file to your Colab local directory (note that uploaded files are automatically deleted when the runtime is recycled), then choose the file from the dropdown menu.
>For newly uploaded files to appear in the dropdown menu – run the third section under 'Code Section' again.
* Option two – copy path from your drive:
Find the file in the directory UI on the left side of the window – "drive" folder displays your google drive directories.
Then right click on the file or folder and 'copy path', paste it to the text box.

5.	Skip this step if data file is in .csv format.
If file is in .xlsx format, write the name of the sheet that contains the data matrix.
6.	Choose method and follow the next instruction according to the method you chose.

**Please note –** plots saved in the Colab local directory are overwritten when the same visualization method is applied to the same data file – download the plot file first if you do not want to lose it.


# Code Section

In [None]:

#@title Code
from tqdm.notebook import tqdm
import os
import sys
import matplotlib.pyplot as plt
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox
import matplotlib.image as mpimg
import matplotlib.patches as patches
import scipy.spatial.distance as ssd
from collections import Counter
from PIL import Image, ImageOps
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.manifold import TSNE
import sklearn.cluster
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score, normalized_mutual_info_score, auc
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots
import csv

def resize_images_func(size, path):
    """
    Resize all the pictures in path to sizeXsize and save them as JPEG files.
    Every regular file in the folder is opened as an image, resized and written
    to "<file name without extension>.jpg" in the same folder (a file that
    already has the .jpg extension is therefore overwritten).
    :param size: the pictures' length and width will be adjusted to sizeXsize
    :param path: the path to the pictures' folder
    
    :return: None. The program exits through sys.exit if a file in the folder
             cannot be opened as an image.
    """

    dirs = os.listdir(path)
    print('Resizing images for TSNE visualization...')
    for item in tqdm(dirs):
        item_path = f"{path}/{item}"
        if not os.path.isfile(item_path):
            continue
        try:
            im = Image.open(item_path)
        except Exception:
            sys.exit("Error: failed open images. Make sure path " + \
                     " to pictures folder is corrrect")
        # the context manager releases the underlying file handle once the
        # resized copy has been written
        with im:
            f, e = os.path.splitext(item_path)
            # JPEG cannot store alpha / palette modes, so convert them first
            source = im if im.mode in ('L', 'RGB', 'CMYK') else im.convert('RGB')
            # Image.ANTIALIAS was an alias of LANCZOS and was removed in
            # Pillow 10; LANCZOS is the same filter under its current name
            imResize = source.resize((size, size), Image.LANCZOS)
            imResize.save(f"{f}.jpg", 'JPEG', quality=90)
    print('Done resizing.')



def plot_grid_faces_per_face(data_df, num_faces, pictures_folder_path, query_face):
    """
    a plotting function to plot the closest faces to the query_face
    The rows with the num_faces smallest values in the query_face column are
    selected and handed to show_image_selection (without buttons).
    :param data_df = the data frame
    :param num_faces = the number of faces to plot in the grid-view
    :param pictures_folder_path = the path to the pictures folder
    :param query_face = the column name of the identity whose most similar
                        images should be displayed
    
    :return: None
    """

    # nsmallest keeps the rows ordered from the smallest value upwards
    closest_faces = data_df.nsmallest(num_faces, query_face)
    show_image_selection(closest_faces, pictures_folder_path, False)



def get_real_clusters(df):
  """
  Map every identity that appears in df to a cluster number.
  Notice: the identity of an image is the first two characters of its
  row name. Numbers start at 1 and follow the order in which identities
  are first met in the index.
  :param df: a dataframe in which each row represents an image

  :return: a dictionary in which keys are identities (two characters) and
           values are cluster numbers.
  """

  identity_numbers = {}
  for picture_name in df.index.values:
    # setdefault leaves an already numbered identity untouched
    identity_numbers.setdefault(picture_name[:2], len(identity_numbers) + 1)

  return identity_numbers



def give_number_to_identity(df, labels_dic):
    """
    Translate every picture in labels_dic into the number of its real identity.
    The identity -> number mapping is built by get_real_clusters(df) and is also
    stored in the global REAL_IDENTITIES.
    :param df: dataframe in which each row represents an identity
    :param labels_dic: a dic with key=index, value=picture name
    
    :return: a list with the real cluster labels, in the order of labels_dic's values
    """
    global REAL_IDENTITIES
    REAL_IDENTITIES = get_real_clusters(df)
    identity_numbers = REAL_IDENTITIES

    # the identity of a picture is the first two characters of its name
    return [identity_numbers[picture[:2]] for picture in labels_dic.values()]



def plot_color_ledgend(colors, title):
    """
    Adds a color legend below the current matplotlib plot, one square marker
    per color. When the global COLOR_REAL is set the entries are labelled with
    the identities stored in the global REAL_IDENTITIES, otherwise with the
    numbers 1..len(colors).
    :param colors: a list with number of colors that match to the number of clusters
    :param title: not used by this function
    
    :return: None
    """
    num_colors = len(colors)
    cluster_numbers = range(1, num_colors + 1)

    # empty plots: they only exist to supply a handle for each legend entry
    handles = [plt.plot([], [], marker="s", color=shade, ls="none")[0]
               for shade in colors]

    if COLOR_REAL:
      cluster_to_identity = {cluster: name
                             for name, cluster in REAL_IDENTITIES.items()}
      labels = [cluster_to_identity[number] for number in cluster_numbers]
    else:
      labels = list(cluster_numbers)

    plt.legend(handles, labels, loc='upper center',
               bbox_to_anchor=(0.5, -0.08), ncol=num_colors)



def tsne_scatter(x, pictures_folder_path, index_to_picture_name, title,
                 clusters_array, scale_tsne):
    """
    Draws the TSNE embedding with each point shown as its picture, saves the
    figure to "TSNE (<title>).png" and returns the matplotlib objects.
    :param x =  a two dimensional array, X[:,0] is x_axis, X[:,1] is y_axis
    :param pictures_folder_path = the path of the pictures that correspond to
                                  each x,y point in the plot
    :param index_to_picture_name = the labels_dic - a dic with key=index,
                                   value=picture name
    :param title = title for saving the plot
    :param clusters_array = the array with the clustering labels (1-based),
                            or None for pictures without a colored frame
    :param scale_tsne = when true the axes are fixed to the range [-500, 500],
                        otherwise the axes limits are not set here
    :return: (figure, axes, scatter collection)
    """
    global NUM_CLUSTERS

    palette = ['crimson', 'blue', 'yellowgreen','orange', 'turquoise',
               'pink',  'gold', 'silver', 'orchid', 'black'] # 10 colors
    colored = clusters_array is not None
    xs, ys = x[:, 0], x[:, 1]

    # base scatter plot, the pictures are layered on top of it
    figure = plt.figure(figsize=(10, 10))
    axes = plt.subplot(aspect='equal')
    points = axes.scatter(xs, ys, lw=0, s=40)
    axes.axis('on')
    axes.set_facecolor('xkcd:white')
    axes.grid(color='xkcd:light grey')
    if scale_tsne:
        axes.set(xlim=(-500, 500), ylim=(-500, 500))

    axes.set_title(f'TSNE for {title}', fontsize=12)
    print('Plotting the TSNE plot...')
    for index, pic_name in tqdm(index_to_picture_name.items()):
        picture = try_imread(f'{pictures_folder_path}/{pic_name}')
        thumbnail = OffsetImage(picture, zoom=0.3)

        # the frame color is only specified when clusters are given
        frame_options = {}
        if colored:
            frame_color = palette[clusters_array[index]-1]
            frame_options['bboxprops'] = dict(edgecolor=frame_color)

        axes.add_artist(AnnotationBbox(thumbnail, xy=(xs[index], ys[index]),
                                       pad=0.07, **frame_options))

    if colored:
        if COLOR_REAL:
            # one legend entry per real identity
            NUM_CLUSTERS = len({cluster: name
                                for name, cluster in REAL_IDENTITIES.items()})

        plot_color_ledgend(palette[:NUM_CLUSTERS], title)

    show_inline_matplotlib_plots()
    figure.savefig(f"TSNE ({title}).png")

    return (figure, axes, points)



def prepare_df_for_linkage_clustering(df):
    """
    Prepares the distances for linkage clustering by passing the square
    (redundant) distance matrix through scipy's squareform.
    :param df = the data frame holding the square distance matrix
    
    :return: the array returned by squareform for df
    """

    return ssd.squareform(df)



def prepare_df_for_spectral_clustering(df):
    """
    preparing the df for spectral clustering
    the fit function of spectral clustering needs to get affinity matrix
    thus, every distance d is replaced by the similarity 1/(1+d), so that a
    small distance is represented by a larger value of affinity, and a big
    distance is represented by a smaller value of affinity
    :param df = the data frame
    
    :return: a new data frame with the same index and columns holding the
             similarity values (df itself is not modified)
    """
    # one vectorized expression instead of a per-cell loop; the result is
    # always floating point, also when df holds integer distances
    return 1.0 / (1.0 + df)



def majority_identity(images_list):
    """
    finds the majority identity in the images_list
    The identity of an image is the first two characters of its name. When
    several identities are equally common, the one that appears first in the
    list is returned.
    :param images_list = a list with the images names 
    
    :return: the identity (two characters) that most images in the list share
    :raises ValueError: if images_list is empty
    """

    identity_counts = Counter(img[:2] for img in images_list)
    if not identity_counts:
        raise ValueError("images_list must contain at least one image name")
    # single counting pass; most_common keeps first-seen order among ties
    return identity_counts.most_common(1)[0][0]



def plot_clustered_faces(data_df, pictures_folder_path ,labels_dic, clusters):
    """
    a plotting function to plot the result of clustered data
    One row of pictures is drawn per cluster. A picture gets a green border
    when its identity (first two characters of its name) is the majority
    identity of its cluster, and a red border otherwise.
    :param data_df = the data frame (not used by this function)
    :param pictures_folder_path = the path of the pictures folder
    :param labels_dic = a dic with key=index, value=picture name
    :param clusters = an array with the clustering labels; clusters[i] is the
                      cluster of the picture labels_dic[i]
    
    :return: None. The program exits through sys.exit if a picture cannot be
             opened.
    """

    # group the picture indices by the cluster they were actually assigned to
    # (clusters are kept in the order in which they are first met)
    members_by_cluster = {}
    for index, cluster in enumerate(clusters):
        members_by_cluster.setdefault(cluster, []).append(index)

    for member_indices in members_by_cluster.values():
        names = [labels_dic[index] for index in member_indices]
        representative = majority_identity(names)
        cluster_size = len(names)

        # squeeze=False always returns a 2D array of axes, so a cluster with a
        # single picture is handled by the same code as a larger one
        fig, axes = plt.subplots(ncols=cluster_size,
                                 figsize=(cluster_size, cluster_size),
                                 squeeze=False)
        fig.subplots_adjust(hspace=0, wspace=0)
        for ax, name in zip(axes[0], names):
            ax.xaxis.set_major_locator(plt.NullLocator())
            ax.yaxis.set_major_locator(plt.NullLocator())
            try:
                img = Image.open(f'{pictures_folder_path}/{name}').convert('RGB')
            except Exception:
                show_inline_matplotlib_plots()
                sys.exit("Error: Failed open images. Make sure path to picture " + \
                         "is corrrect")
            color = 'green' if name[:2] == representative else 'red'
            img_with_border = ImageOps.expand(img, border=7, fill=color)
            ax.set_xlabel(name[:2], size='x-small')
            ax.imshow(img_with_border, cmap="bone")

    show_inline_matplotlib_plots()



def create_labels_pred_and_true(df, labels_dic, clusters):
    """
    creates a list of labels_pred and a list of labels_true
        labels_true = a list with labels that match the real identities
        labels_pred = a list with labels that match the result of the
                      clustering method: every picture gets the label of the
                      majority identity of the cluster it was assigned to
    :param df = the data frame, used to number the real identities
    :param labels_dic = a dic with key=index, value=picture name
    :param clusters = an array with the clustering labels; clusters[i] is the
                      cluster of the picture labels_dic[i]

    :return: two lists (labels_pred, labels_true)
    """

    name_to_label_dic = get_real_clusters(df)

    # group the picture indices by the cluster they were actually assigned to
    members_by_cluster = {}
    for index, cluster in enumerate(clusters):
        members_by_cluster.setdefault(cluster, []).append(index)

    labels_pred = labels_dic.copy()
    for member_indices in members_by_cluster.values():
        representative = majority_identity(
            [labels_dic[index] for index in member_indices])
        predicted_label = name_to_label_dic[representative]
        for index in member_indices:
            labels_pred[index] = predicted_label
    labels_pred = list(labels_pred.values())

    # the real identity of a picture is the first two characters of its name
    labels_true = [name_to_label_dic[name[:2]] for name in labels_dic.values()]

    return labels_pred, labels_true



def find_FP_FN(df, labels, threshold):
    """
    calculates the fractions of TP, FP, TN and FN over all the (row, column)
    pairs of df for a given threshold

    A pair is "positive" when its distance is <= threshold, and it is a true
    match when both pictures share the same identity (the first two characters
    of their names). Pairs of a picture with itself are counted as well.

    :param df = the data frame
    :param labels = a dic with key=index, value=picture name
    :param threshold = the current threshold for deciding if it is fp or fn
    
    :return: the fraction of tp (true positive), fp (false positive),
             tn (true negative), fn (false negative) out of all pairs, and
             total (the number of pairs that were checked)
    :raises ZeroDivisionError: if df has no cells
    """

    n_rows, n_cols = df.shape[0], df.shape[1]
    cnt_total = n_rows * n_cols

    # identities are turned into integer codes so the whole comparison can be
    # done on arrays instead of one cell at a time
    codes = {}
    row_ids = np.array([codes.setdefault(labels[i][:2], len(codes))
                        for i in range(n_rows)], dtype=int)
    col_ids = np.array([codes.setdefault(labels[j][:2], len(codes))
                        for j in range(n_cols)], dtype=int)
    same_identity = row_ids[:, None] == col_ids[None, :]

    dist = df.to_numpy()
    within = dist <= threshold
    beyond = dist > threshold

    # the negations mirror the original if/else branches, so a NaN distance
    # is counted as tn (different identities) or tp (same identity)
    cnt_fp = int(np.count_nonzero(~same_identity & within))
    cnt_tn = int(np.count_nonzero(~same_identity & ~within))
    cnt_fn = int(np.count_nonzero(same_identity & beyond))
    cnt_tp = int(np.count_nonzero(same_identity & ~beyond))

    return cnt_tp/cnt_total, cnt_fp/cnt_total, \
           cnt_tn/cnt_total, cnt_fn/cnt_total, \
           cnt_total



def plot_ROC(fpr, tpr, title):
    """
    Draws the ROC curve, writes its AUC under the x axis label and saves the
    plot to "ROC (<title>).png"

    :param fpr: array of false positive rate values
    :param tpr: array of true positive rate values
    :param title: title for plot
    :return: None
    """
    # order the (fpr, tpr) points by fpr before computing the area
    ordered_points = sorted(zip(fpr, tpr))
    fpr, tpr = zip(*ordered_points)
    area = auc(fpr, tpr)

    plt.plot(fpr, tpr, color='orange')
    plt.xlabel(f'False Positive Rate\n\nAUC = {area}')
    plt.ylabel('True Positive Rate')
    plt.title(f"ROC ({title})")
    plt.savefig(f"ROC ({title}).png", bbox_inches='tight')
    show_inline_matplotlib_plots()



def plot_error_rate_for_recognition(df, sheet_name, labels, title):
    """
    a plotting function to plot the error rate for the recognition task
        xlabel = thresholds
        ylabel = % Mistakes
        in red fp, in blue fn, in purple the sum of them

    In addition, plots the ROC curve with its AUC
    and creates a comma separated text file for the data, in which each line
    represents a threshold. Columns are:
        threshold, tp (true positive), fp (false positive),
        tn (true negative), fn (false negative),
        TPR (true positive rate), FPR (false positive rate)

    The 100 thresholds are evenly spaced between the smallest and the largest
    value of df.

    :param df: the data frame
    :param sheet_name: the sheet name of the excel file
    :param labels: a dic with key=index, value=picture name
    :param title: title for plot
    :return: None
    """

    max_dist = df.max().max()
    min_dist = df.min().min()
    thresholds = list(np.linspace(min_dist, max_dist, num=100))

    tps, fps, tns, fns, sums = [], [] ,[], [] ,[]
    tprs, fprs = [], []
    for threshold in tqdm(thresholds):
        tp, fp, tn, fn, _ = find_FP_FN(df, labels, threshold)
        tps.append(tp)
        fps.append(fp)
        tns.append(tn)
        fns.append(fn)
        tprs.append(tp/(tp+fn))
        fprs.append(fp/(tn+fp))
        sums.append(fp+fn)  # sum false
        
    plt_title = f'Error Rate In Recognition Task: {sheet_name}'
    plt.plot(thresholds, fps, color='red')
    plt.plot(thresholds,fns , color='blue')
    plt.plot(thresholds, sums, color='purple')
    plt.legend(['fp','fn','sum'], loc='upper left')
    plt.xlabel('Threshold')
    plt.ylabel('% Mistakes')
    plt.title(plt_title)
    plt.savefig(f"Error Rate ({title}).png", bbox_inches='tight')
    show_inline_matplotlib_plots()

    plot_ROC(fprs, tprs, title)

    # save data to txt file
    # newline='' lets the csv module control the line endings itself
    with open(f'Error Rate data ({title}).txt', mode='w', newline='') as csv_file:
      csv_writer = csv.writer(csv_file, delimiter=',')
      # the second column holds the false positives
      csv_writer.writerow(['threshold', 'tp', 'fp', 'tn', 'fn', 'TPR', 'FPR'])
      csv_writer.writerows(zip(thresholds, tps, fps, tns, fns, tprs, fprs))



def try_imread(image_path):
    """
    trying to read the image
    if unable to read - trying to read the file that has the same name with
    'jpg' in its extension replaced by 'png'

    :param image_path = the path of the image file

    :return: the image array returned by matplotlib's imread
    """

    try:
      arr_img = mpimg.imread(image_path)  # open the image
    except Exception:
      # only the extension is rewritten, so a folder whose name happens to
      # contain 'jpg' is left untouched
      root, extension = os.path.splitext(image_path)
      fallback_path = root + extension.replace('jpg', 'png')
      arr_img = mpimg.imread(fallback_path, format='jpg')
    return arr_img



def validate_df(df):
    """
    A function to validate a dataframe
    if the df is a triangular matrix - makes it symmetrical and take all the 
                                       NaN values that are not in the triangle 
                                       and fill them with the mean
                                       (the result gets a default integer
                                       index and columns)
    otherwise the df just has missing values - fill them with the mean

    Now, if the df has any negative elements we assume normalization in subject,
    so we make transformation to distance value (max - value).
    And if the diagonal is missing fill it with 0 - it's for pairs of the same pictures.
    :param df = the data frame

    :return: the new df (df itself is not modified)
    """

    copy_df = df.copy()
    nan_map = df.isna()
    if nan_map.values.any(): # if the df has any NaN values
      print('data has missing values')
      zero_filled = copy_df.fillna(0)
      zero_values = zero_filled.values
      if np.allclose(zero_values, np.tril(zero_values)) or np.allclose(zero_values, np.triu(zero_values)):
        print('data is a triangular matrix, making it symmetrical')
        ## If the df is upper triangular or lower triangular - fill with zeroes and add the transpose ##
        symmetric = zero_filled.to_numpy().T + zero_filled.to_numpy()
        # off-diagonal cells that are missing on both sides of the diagonal
        # cannot be recovered from the transpose - they get the mean
        missing = nan_map.to_numpy()
        both_missing = missing & missing.T
        np.fill_diagonal(both_missing, False)
        if both_missing.any():
          symmetric[both_missing] = (df.mean()).mean()
        copy_df = pd.DataFrame(symmetric)
      else:
        # otherwise the df just has missing values- fill with mean
        copy_df = copy_df.fillna((df.mean()).mean())

    # if the df has any negative elements we assume normalization in subject
    if (df < 0).any().any():
      print('data has negative values')
      copy_df = copy_df- max(df.max())
      copy_df = -1*copy_df

    # if the diagonal is missing fill it with 0 - it's for pairs of the same pictures
    if np.diag(nan_map.to_numpy()).any():
      # written through iloc because DataFrame.values is not guaranteed to be
      # a writable view of the frame
      for k in range(min(copy_df.shape)):
        copy_df.iloc[k, k] = 0

    return copy_df



def visualize(excel_filename, sheet_name, visualization_method, **kwargs):
    """
    Plots a visualization based on the distances given in the excel file.
    The visualization is chosen by the "visualization_method".
    The validated distance matrix is also stored in the global DF.
    :param excel_filename: the filename of the data file (.csv, or an excel
                           file) containing the distance matrix between all
                           faces relevant for analysis. The first row and the
                           first column hold the pictures' names.
    :param sheet_name: the sheet we want to plot (ignored for .csv files)
    :param visualization_method: one of the following options:
                1. 'T-SNE' - shows a tsne plot based on distances
                   if the global COLOR_TSNE is True - shows a tsne plot with 
                   colors according to the method in the global LINKAGE 
                   ('linkage', 'spectral', otherwise kmeans on the tsne plot)
                   if the global COLOR_REAL is True - shows a tsne plot with 
                   colors according to the real identities
                2. 'Closest faces' - shows all faces with a button for each; 
                    choosing a face shows a grid of its most similar faces.
                3. 'RDM' - shows a RDM matrix
                4. 'Clustering' - shows the clusters according to chosen 
                   clustering method, and prints the silhouette and NMI scores
                5. 'Error rate for recognition task' - shows an error rate and
                   ROC plots of recognition task
    :param kwargs: optional parameters to specify for certain visualization methods.
          1. pictures_folder_path: used in grid-view, tsne, cluster methods
                the path to the folder containing the faces' pictures.
                The names of the pictures should correspond to the names in
                the data file
          2. cluster_method: what clustering method to use in 'Clustering'
                ('Linkage', anything else means spectral)
          3. scale_tsne: a boolean parameter passed on to tsne_scatter 
                         (True - display axes in range [-500, 500])
          The number of clusters is read from the global NUM_CLUSTERS.

    :return: None
    """

    sns.set_style('darkgrid')
    sns.set_palette('muted')
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
      
    # create a dataframe from the excel_filename and sheet_name
    if excel_filename.endswith('.csv'):
        df = pd.read_csv(excel_filename, index_col=0, header=0)
        # anything that is not a number becomes NaN and is handled by validate_df
        for col in df.columns:
          df[col] = pd.to_numeric(df[col], errors='coerce')

        title = excel_filename.split('/')[-1]
    else:
        df = pd.read_excel(excel_filename, sheet_name=sheet_name,
                           index_col=0, header=0) # dependency- xlrd
        title = f"{excel_filename.split('/')[-1]}_{sheet_name}"

    labels_dic = {} # label is picture name
    for i in range(df.shape[0]):
        labels_dic[i] = df.columns[i]
    df = validate_df(df)
    df = df.rename(labels_dic, axis='index')
    df = df.rename(labels_dic, axis='columns')

    global DF
    DF = df

    if visualization_method == 'RDM':
        print("Working on RDM Matrix visualization...")
        f, ax = plt.subplots(figsize=(len(df)/2, len(df)/2.8))
        sns.heatmap(df, ax=ax, xticklabels=True, yticklabels=True, square=True)
        ax.set_title(f'RDM for {title}') # Set the title
        ax.xaxis.tick_top() # change the location of x-axis
        plt.xticks(rotation=90)
        show_inline_matplotlib_plots()
        f.savefig(f'RDM ({title}).png', dpi=100, bbox_inches='tight')

    elif visualization_method == 'Closest faces':
        print("Working on 'Closest faces' visualization...")
        pictures_folder_path = kwargs.get('pictures_folder_path', '')
        show_image_selection(df, pictures_folder_path, True)

    elif visualization_method == 'T-SNE':
        print("Working on TSNE visualization... (might take a couple of minutes)")
        pictures_folder_path = kwargs.get('pictures_folder_path', '')
        scale_tsne = kwargs.get('scale_tsne', True)
        RS=1
        # init='random' is stated explicitly: a PCA initialization cannot be
        # used with a precomputed distance matrix
        tsne_object = TSNE(method= "exact", metric="precomputed", random_state=RS, 
                           perplexity=5, init='random')
        fashion_tsne = tsne_object.fit_transform(df)
        resize_images_func(100, pictures_folder_path)
  
        if COLOR_TSNE:
            if LINKAGE == 'linkage':
              prepared_df = prepare_df_for_linkage_clustering(df)
              clusters_array = fcluster(linkage(prepared_df, method='complete'), 
                                        t=NUM_CLUSTERS, criterion='maxclust')
            elif LINKAGE =='spectral':
              copy_df = prepare_df_for_spectral_clustering(df)
              clustering = SpectralClustering(n_clusters=NUM_CLUSTERS, 
                                              affinity='precomputed').fit(copy_df)
              # tsne_scatter expects 1-based cluster labels
              clusters_array = clustering.labels_ + 1
            else: # LINKAGE == 'T-SNE clusters'
              kmeans = sklearn.cluster.KMeans(n_clusters=NUM_CLUSTERS, 
                                              random_state=0).fit(fashion_tsne)
              clusters_array = kmeans.labels_ + 1
        elif COLOR_REAL:
            clusters_array = give_number_to_identity(df, labels_dic)
        else:
            clusters_array = None
        
        tsne_scatter(fashion_tsne, pictures_folder_path=pictures_folder_path,  
                     index_to_picture_name=labels_dic, title=title, 
                     clusters_array=clusters_array, scale_tsne=scale_tsne)
      
    elif visualization_method == 'Clustering':
        print("Working on visualization...")
        cluster_method = kwargs.get('cluster_method', 'Linkage')
        pictures_folder_path = kwargs.get('pictures_folder_path', '')

        if cluster_method == 'Linkage':
            prepared_df = prepare_df_for_linkage_clustering(df)
            clustered_data = fcluster(linkage(prepared_df, method='complete'), 
                                      t=NUM_CLUSTERS, criterion='maxclust')
        else: # cluster_method == 'Spectral'
            copy_df = prepare_df_for_spectral_clustering(df)
            clustering = SpectralClustering(n_clusters=NUM_CLUSTERS, 
                                        affinity='precomputed').fit(copy_df)
            clustered_data = clustering.labels_

        plot_clustered_faces(df, pictures_folder_path, labels_dic, 
                             clustered_data)
        
        # silhouette score
        # df already holds pairwise distances, so it is passed as a
        # precomputed matrix instead of being treated as feature vectors;
        # the self-distances are zeroed on a copy, as that mode requires
        distances = df.to_numpy(copy=True)
        np.fill_diagonal(distances, 0)
        score = silhouette_score(distances, clustered_data, metric='precomputed')
        print(f"Silhouette score: {score}")

        # NMI score
        labels_pred, labels_true = create_labels_pred_and_true(df, labels_dic, 
                                                               clustered_data)
        NMI_score = normalized_mutual_info_score(labels_true, labels_pred)
        print(f'NMI score: {NMI_score}')
      

    elif visualization_method == 'Error rate for recognition task':
        print("Working on visualization...")
        plot_error_rate_for_recognition(df, sheet_name, labels_dic, title)
        


In [None]:
#@title GUI code section
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, \
                       Button, VBox, Label, Box, HBox, Output
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import output


# Shared `style=` argument for the widgets built in the GUI functions below
style = {'description_width': 'initial'}

def chose_image(item):
    """
    Button callback: shows a grid of the NUM_FACES images that are most similar
    to the image named in the item's description. The data frame, the pictures
    folder and the number of faces are read from the globals DF, PIC_PATH and
    NUM_FACES.
    :param item = widget item with an image name as a description
    
    :return: None
    """

    plot_grid_faces_per_face(DF, NUM_FACES, PIC_PATH, item.description)



def show_image_selection(df, pictures_folder_path, with_buttons):
    """
    Display all images in data frame as a grid of up to 12 images per row.
    The displayed grid is also appended to the global list GRID.
    :param df = a data frame
    :param pictures_folder_path = a path to the pictures folder contains images
                                  with names as df's rows - each row title
                                  should have an image named as that title
    :param with_buttons = if true, display a button for each image, when clicked
                          displays a grid of #num_faces images most similar to
                          chosen image.
                          if false, each image is labelled with the first two
                          characters of its name, the first image gets an
                          orange border, and explanation lines are added above
                          and below the grid

    :return: None
    """

    images_names = df.index.values
    num_images = len(images_names)
    hb_arr = []
    button_layout = Layout(height='30px', width='100px')

    if with_buttons:
        print(f'\n\nchoose an image to display most similar:')
        buttons = [Button(layout=button_layout, description=name)
                   for name in images_names]
    else:
        pic_labels = [widgets.Label(name[:2]) for name in images_names]

    pics = []
    for i, name in enumerate(images_names):
        # read through a context manager so the file handle is always closed
        with open(f'{pictures_folder_path}/{name}', 'rb') as image_file:
            img = image_file.read()
        w_img = widgets.Image(value=img)
        w_img.layout.object_fit = 'contain'
        if i == 0 and not with_buttons:
            w_img.layout.border = '2px solid orange'
        pics.append(w_img)

    inner_box_layout = Layout(border='1px solid black', flex_flow='column',
                              display='center')
    num_cols = 12
    # stepping by num_cols avoids building an empty last row when the number
    # of images is an exact multiple of num_cols
    for row_start in range(0, num_images, num_cols):
        carousel = []
        for cur in range(row_start, min(row_start + num_cols, num_images)):
            if with_buttons:
                buttons[cur].on_click(chose_image)
                caption = buttons[cur]
            else:
                caption = pic_labels[cur]
            carousel.append(Box(children=[pics[cur], caption],
                                layout=inner_box_layout))
        hb_arr.append(HBox(children=carousel))

    if not with_buttons:
        item1 = widgets.Label(f'{NUM_FACES} most similar faces are:')
        item2 = widgets.Label('Notice: you can select another image and change ' \
                              '# of faces.\nNo need to run again.\n')
        box1 = widgets.VBox([item1])
        box2 = widgets.VBox([item2])
        hb_arr = [widgets.HBox([box1])] + hb_arr + [widgets.HBox([box2])]

    vb = VBox(children=hb_arr)
    GRID.append(vb)
    display(vb)


def collect_data():
    """
    Builds the general input form and hands every change of it to enter_names:
        data file name - two options: 
            1. dropdown menu of the csv and xlsx files that glob finds in the
               current directory (each one is offered as '/content/<file name>')
            2. entering a full path
        sheet name
        visualization method

    :return: None
    """

    # the first dropdown entry is a placeholder with an empty value
    data_file_options = [("choose from local directory...", "")]
    for pattern in ("*.csv", "*.xlsx"):
        data_file_options.extend((found, '/content/'+found)
                                 for found in glob.glob(pattern))

    method_options = ['T-SNE','RDM', 'Closest faces', 'Clustering', 
                      'Error rate for recognition task']

    file_dropdown = widgets.Dropdown(options=data_file_options,
                                     description='Data file:',
                                     style=style)
    file_path_box = widgets.Text(description='* or enter data file path:',
                                 style=style)
    sheet_box = widgets.Text(description='Sheet name in file:', style=style)
    method_dropdown = widgets.Dropdown(options=method_options,
                                       description='Visualization method:',
                                       style=style)

    interact(enter_names,
             excel_filename_dropdown=file_dropdown,
             excel_filename_path=file_path_box,
             sheet_name=sheet_box,
             visualization_method=method_dropdown)


def collect_tsne_clusters(num_clusters, linkage):
    """
    store the settings of the cluster-colored tsne plot in global params
    :param num_clusters = the number of clusters the user wants
    :param linkage = the clustering method to color the tsne plot by
                     (collect_tsne_data feeds it from a dropdown whose
                     options are 'linkage', 'spectral' and 'T-SNE clusters')

    :return: None
    """
    global NUM_CLUSTERS, LINKAGE
    NUM_CLUSTERS, LINKAGE = num_clusters, linkage


def collect_tsne_data(pictures_folder_path, scale_tsne, color_method):
    """
    store the tsne settings in global params and show the follow-up widgets
    :param pictures_folder_path = the path to the pictures folder
    :param scale_tsne = stored as-is in SCALE_TSNE (enter_names feeds it from
                        the 'Display plot in [-500,500] axes' checkbox)
    :param color_method = how to color the tsne plot:
                            'Clusters' - COLOR_TSNE is set to True and two
                                         more fields are displayed:
                                num_clusters - slider from 1 to 10,
                                               default is 5
                                clustering method - dropdown
                            'Real identities' - COLOR_REAL is set to True
                            any other value - both flags are False

    :return: None
    """
    global PIC_PATH, COLOR_TSNE, COLOR_REAL, SCALE_TSNE
    PIC_PATH = pictures_folder_path
    COLOR_TSNE = color_method == 'Clusters'
    COLOR_REAL = color_method == 'Real identities'
    SCALE_TSNE = scale_tsne

    if not COLOR_TSNE:
        # no extra input is needed for this coloring method
        interact(empty_collect_data)
        return

    clusters_slider = widgets.IntSlider(value=5, min=1, max=10,
                                        description='# of clusters:',
                                        style=style)
    method_dropdown = widgets.Dropdown(
        options=['linkage', 'spectral', 'T-SNE clusters'],
        description='Choose the clustering method:',
        style=style)
    interact(collect_tsne_clusters,
             num_clusters=clusters_slider,
             linkage=method_dropdown)


def empty_collect_data():
    """
    no-op callback: takes no input and does nothing
    it is handed to interact() wherever no further input is collected

    :return: None
    """
    return None


def collect_grid_view_data(pictures_folder_path, num_faces):
    """
    store the grid view settings in global params
    :param pictures_folder_path = the path to the pictures folder,
                                  kept in PIC_PATH
    :param num_faces = the number of faces the user wants in the grid-view,
                       kept in NUM_FACES

    :return: None
    """
    global PIC_PATH, NUM_FACES
    PIC_PATH, NUM_FACES = pictures_folder_path, num_faces


def collect_cluster_linkage_data(pictures_folder_path, num_clusters, 
                                 cluster_method):
    """
    store the clustering settings in global params
    :param pictures_folder_path = the path to the pictures folder,
                                  kept in PIC_PATH
    :param num_clusters = the number of clusters the user wants,
                          kept in NUM_CLUSTERS
    :param cluster_method = which clustering method to apply, kept in
                            CLUSTER_METHOD (enter_names feeds it from radio
                            buttons offering 'Linkage' and 'Spectral')

    :return: None
    """
    global PIC_PATH, NUM_CLUSTERS, CLUSTER_METHOD
    PIC_PATH, NUM_CLUSTERS, CLUSTER_METHOD = (
        pictures_folder_path, num_clusters, cluster_method)


def collect_cluster_spectral_data(pictures_folder_path, num_clusters):
    """
    store the spectral clustering settings in global params
    :param pictures_folder_path = the path to the pictures folder,
                                  kept in PIC_PATH
    :param num_clusters = the number of clusters the user wants,
                          kept in NUM_CLUSTERS

    :return: None
    """
    global PIC_PATH, NUM_CLUSTERS
    PIC_PATH, NUM_CLUSTERS = pictures_folder_path, num_clusters


# Best-effort cleanup for re-runs of this cell: close whatever widgets are
# still listed in GRID. clear_grid is defined further down, so on a first run
# this lookup is expected to fail; any ordinary error is deliberately ignored,
# but KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    clear_grid(GRID)
except Exception:
    pass
GRID = []
def enter_names(excel_filename_dropdown, excel_filename_path, sheet_name, 
                visualization_method):
    """
    collect the general data into global parameters and display the input
    widgets that belong to the chosen visualization method
    :param excel_filename_dropdown = if not empty, the data file name is taken
                                     from this param
    :param excel_filename_path = used as the data file name only when
                                 excel_filename_dropdown is the empty string
    :param sheet_name = the sheet name, stored in SHEET_NAME
    :param visualization_method = the visualization method, stored in
                                  VIS_METHOD; it also selects which follow-up
                                  widgets are shown:
                                    'T-SNE' - pictures path, axes checkbox,
                                              coloring method
                                    'Closest faces' - pictures path,
                                                      # of faces
                                    'Clustering' - pictures path,
                                                   # of clusters,
                                                   clustering method
                                    'RDM' / 'Error rate for recognition task'
                                            - no further input
                                    anything else - nothing is displayed

    :return: None
    """
    global EXCEL_FILENAME, SHEET_NAME, VIS_METHOD

    if excel_filename_dropdown == "":
        EXCEL_FILENAME = excel_filename_path
    else:
        EXCEL_FILENAME = excel_filename_dropdown

    SHEET_NAME = sheet_name
    VIS_METHOD = visualization_method

    # PIC_PATH may not be defined yet; reading an undefined global can only
    # raise NameError, so nothing broader is caught
    try:
        prev_pic_path = PIC_PATH
    except NameError:
        prev_pic_path = ""

    def pictures_path_box():
        # a fresh text box per form, pre-filled with the last known path
        return widgets.Text(value=prev_pic_path,
                            description='Path to pictures:',
                            style=style)

    # the method names are mutually exclusive, so a single chain is enough
    if visualization_method == 'T-SNE':
        interact(collect_tsne_data,
                 pictures_folder_path=pictures_path_box(),
                 scale_tsne=widgets.Checkbox(value=True,
                                             description='Display plot in [-500,500] axes',
                                             indent=False),
                 color_method=widgets.RadioButtons(options=['None', 'Real identities', 'Clusters'],
                                                   value='None',
                                                   style=style,
                                                   description='Coloring method:',
                                                   indent=False))
    elif visualization_method == 'Closest faces':
        interact(collect_grid_view_data,
                 pictures_folder_path=pictures_path_box(),
                 num_faces=widgets.IntSlider(2, description='# of most similar faces:',
                                             style=style))
    elif visualization_method == 'Clustering':
        interact(collect_cluster_linkage_data,
                 pictures_folder_path=pictures_path_box(),
                 num_clusters=widgets.IntSlider(value=5, min=1, max=10,
                                                description='# of clusters:',
                                                style=style),
                 cluster_method=widgets.RadioButtons(options=['Linkage', 'Spectral'],
                                                     value='Linkage',
                                                     style=style,
                                                     description='Clustering method:',
                                                     indent=False))
    elif visualization_method in ('RDM', 'Error rate for recognition task'):
        # these methods need no extra input
        interact(empty_collect_data)


def clear_grid(grid):
  """
  close every item of the given list and reset the global GRID to an
  empty list; when the list is empty nothing happens and GRID is left as is
  :param grid = list of objects that expose a close() method

  :return: None
  """
  global GRID
  if not grid:
    return

  for shown in grid:
    shown.close()
  GRID = []


# everything below runs inside `with out:` so that print() is captured there
def on_button_clicked(b):
    """
    handler for a click on the 'Run!' button
    clears the previous output and grid, then calls the visualize function
    with the collected global params of the chosen method
    nothing is visualized when EXCEL_FILENAME or VIS_METHOD is empty
    :param b = the relevant button

    :return: None
    """

    with out:
        clear_output(wait=True)
        output.clear()
        clear_grid(GRID)

        print(f'Working on {VIS_METHOD}...')
        if not (EXCEL_FILENAME and VIS_METHOD):
            return

        # method-specific globals are read only inside the branch that needs
        # them, since the other methods' globals may not have been set
        if VIS_METHOD in ('RDM', 'Error rate for recognition task'):
            visualize(EXCEL_FILENAME, SHEET_NAME, VIS_METHOD)
        elif VIS_METHOD == 'T-SNE':
            visualize(EXCEL_FILENAME, SHEET_NAME, VIS_METHOD,
                      pictures_folder_path=PIC_PATH,
                      scale_tsne=SCALE_TSNE)
        elif VIS_METHOD == 'Closest faces':
            visualize(EXCEL_FILENAME, SHEET_NAME, VIS_METHOD,
                      pictures_folder_path=PIC_PATH,
                      num_faces=NUM_FACES)
        elif VIS_METHOD == 'Clustering':
            visualize(EXCEL_FILENAME, SHEET_NAME, VIS_METHOD,
                      pictures_folder_path=PIC_PATH,
                      num_clusters=NUM_CLUSTERS,
                      cluster_method=CLUSTER_METHOD)


# the 'Run!' button; a click invokes on_button_clicked
button_run = widgets.Button(description="Run!")
button_run.on_click(on_button_clicked)


In [None]:
#@title Run for GUI
import glob
from google.colab import drive
from ipywidgets import Output

# mount Google Drive under /content/drive
print("Mount Google Drive- use the account that holds the pictures")
drive.mount('/content/drive')
print("Done mounting Google Drive! \n")

# show the input form, followed by the 'Run!' button
print("Please enter the following:")
collect_data()
display(button_run)

# output area: on_button_clicked runs inside `with out:`
out = Output()
display(out)


Mount Google Drive- use the account that holds the pictures
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Done mounting Google Drive! 

Please enter the following:


interactive(children=(Dropdown(description='Data file:', options=(('choose from local directory...', ''),), st…

Button(description='Run!', style=ButtonStyle())

Output()

In [None]:
# stand-alone Drive mount; the "Run for GUI" cell performs the same call
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
