In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from koe.model_utils import get_or_error
from koe.models import Database
from koe.sequence_utils import get_sequences
from koe.storage_utils import get_sids_tids
from root.models import User
from koe.cluster_analysis_utils import SimpleNameMerger, NameMerger, get_syllable_labels
from koe.sequence_utils import songs_to_syl_seqs
from koe.sequence_utils import calc_class_ajacency, calc_class_dist_by_adjacency

from nltk import ngrams

from koe.models import Segment, AudioFile
from root.models import ExtraAttr, ExtraAttrValue
from collections import Counter

import os
from django.conf import settings
from koe.ts_utils import extract_tensor_metadata, write_metadata, bytes_to_ndarray, get_rawdata_from_binary
from koe.model_utils import natural_order
from scipy.cluster.hierarchy import cut_tree

import colorlover as cl
nCategoricalColours = 11

from PIL import Image


In [None]:
# Analysis configuration: which database to read and whose labels to use.
annotator_name = 'wesley'
label_level = 'label'

database = get_or_error(Database, dict(name='Bellbird_TMI'))
sids, tids = get_sids_tids(database)
annotator = get_or_error(User, dict(username__iexact=annotator_name))

# label_arr holds the label names; syl_label_enum_arr holds, per entry of `sids`,
# an index into label_arr (presumably -- this is how both are used below; confirm
# against get_syllable_labels).
label_arr, syl_label_enum_arr = get_syllable_labels(annotator, label_level, sids)

enum2label = dict(enumerate(label_arr))
sid2enumlabel = dict(zip(sids, syl_label_enum_arr))


def _class_adjacency(count_style):
    """Return the class adjacency matrix of `database` for one `count_style`.

    Only the first element returned by `calc_class_ajacency` is kept. The second
    element is bound to a local name on purpose: assigning it to a notebook-level
    `_` would shadow IPython's "last output" variable.
    """
    adjacency_mat, _unused = calc_class_ajacency(
        database, syl_label_enum_arr, enum2label, sid2enumlabel,
        count_style=count_style, count_circular=False,
    )
    return adjacency_mat


# One matrix per counting style; all three are computed without circular counting.
adjacency_mat_sym = _class_adjacency('symmetric')
adjacency_mat_asym = _class_adjacency('asymmetric')
adjacency_mat_sep = _class_adjacency('separate')

In [None]:
# How many syllables carry each enumerated label, ordered by label enum.
counter = Counter(syl_label_enum_arr)
nlabels = len(counter)
frequencies = np.array([counter[label_enum] for label_enum in range(nlabels)])

# Divide every row of each adjacency matrix by the frequency of that row's label.
# A column vector is used so the division broadcasts across the columns.
freq_column = frequencies.reshape(-1, 1)
adjacency_mat_sym_norm, adjacency_mat_asym_norm, adjacency_mat_sep_norm = (
    raw_mat / freq_column
    for raw_mat in (adjacency_mat_sym, adjacency_mat_asym, adjacency_mat_sep)
)

# adjacency_mat_bw_norm = adjacency_mat / frequencies
# df_norm = pd.DataFrame(distmat, columns=label_arr, index=label_arr)
# df_norm.style.background_gradient(cmap='Blues')

In [None]:
import plotly.plotly as py
import plotly.figure_factory as ff
from scipy.cluster import hierarchy

import numpy as np


def plot_spectrogram(Z, name, color_threshold=None, pdf=None):
    """Draw the dendrogram of linkage matrix ``Z`` on a fresh 18x18 inch figure.

    Despite its name, this function plots a dendrogram, not a spectrogram.

    :param Z: linkage matrix handed straight to ``hierarchy.dendrogram``.
    :param name: text used as the axes title.
    :param color_threshold: forwarded to ``hierarchy.dendrogram`` unchanged.
    :param pdf: when truthy, the figure is saved through ``pdf.savefig`` and then
        closed; otherwise the figure is shown with ``plt.show()``.

    Leaf labels come from the notebook-level ``label_arr``.
    """
    figure = plt.figure(figsize=(18, 18))
    axes = figure.gca()
    axes.set_title(name)

    dendrogram_options = dict(
        labels=label_arr,
        leaf_font_size=9.,
        orientation='right',
        color_threshold=color_threshold,
    )
    # No explicit axes is passed, so the dendrogram lands on the current axes,
    # i.e. the one created just above.
    hierarchy.dendrogram(Z, **dendrogram_options)

    if not pdf:
        plt.show()
        return

    pdf.savefig(figure)
    plt.close()

# Normalised adjacency matrices keyed by a display name; later cells iterate
# over this mapping as well.
mats = {
    'Symmetric': adjacency_mat_sym_norm,
    'Asymmetric': adjacency_mat_asym_norm,
    'Separate': adjacency_mat_sep_norm,
}

# One dendrogram page per matrix. PdfPages is used as a context manager so the
# file is closed (and therefore finalised) even if linkage or plotting raises.
with PdfPages('dendrogram-new.pdf') as pdf:
    for name, mat in mats.items():
        Z = hierarchy.linkage(mat, 'average')
        plot_spectrogram(Z, name, pdf=pdf)

In [None]:
def visualise_syntactical_similar_syllables(merged_classes_names, pdf=None):
    """Scatter-plot the ordination, highlighting the given classes.

    Each class in ``merged_classes_names`` is drawn as large, outlined markers in
    its own colour; every other point is drawn small and faint under the legend
    entry ``'other'``.

    :param merged_classes_names: label names to highlight.
    :param pdf: when truthy, the figure is saved through ``pdf.savefig`` and then
        closed; otherwise the figure is shown with ``plt.show()``.

    Reads the notebook-level ``label_col``, ``ordination_data`` and ``sids``
    (plus ``nCategoricalColours``); columns 0 and 1 of ``ordination_data`` are
    used as x and y.
    """
    # One colour per highlighted class plus one (index 0) for the 'other' points.
    n_colours = len(merged_classes_names) + 1
    if n_colours > nCategoricalColours:
        # More colours needed than the largest scale used here: interpolate it.
        base_scale = cl.scales[str(nCategoricalColours)]['div']['Spectral']
        palette = cl.to_numeric(cl.interp(base_scale, n_colours))
    else:
        palette = cl.to_numeric(cl.scales[str(n_colours)]['div']['Spectral'])
    # Rescale the 0-255 channel values to the 0-1 range.
    palette = (np.array(palette) / 255.).tolist()

    fig = plt.figure(figsize=(10, 10))
    ax = fig.gca()

    # 1 marks a point not yet claimed by any highlighted class.
    unclaimed = np.ones((len(sids),))
    for cls, colour in zip(merged_classes_names, palette[1:]):
        members = np.where(label_col == cls)
        unclaimed[members] = 0
        ax.scatter(
            x=ordination_data[members, 0],
            y=ordination_data[members, 1],
            s=100, c=[colour], edgecolors=(0, 0, 0), linewidths=1, label=cls, alpha=0.5,
        )

    leftovers = np.where(unclaimed == 1)
    ax.scatter(
        x=ordination_data[leftovers, 0],
        y=ordination_data[leftovers, 1],
        s=10, c=[palette[0]], linewidths=0, label='other', alpha=0.2,
    )
    plt.legend(loc=2)

    if not pdf:
        plt.show()
        return

    pdf.savefig(fig)
    plt.close()

def _read_image_size(path):
    """Return ``(width, height)`` of the image at ``path`` and close the file."""
    with Image.open(path) as img:
        return img.size


def _read_image_pixels(path):
    """Return the pixels of the image at ``path`` as an array and close the file."""
    with Image.open(path) as img:
        return np.asarray(img)


def visualise_syllables(merged_classes_names, label_col, tid_col, pdf=None):
    """Show a strip of syllable images for each class in ``merged_classes_names``.

    For every class, the images ``user_data/spect/fft/syllable/<tid>.png`` of the
    syllables whose entry in ``label_col`` equals the class are pasted left to
    right onto a white canvas 18 inches * 72 dpi wide, 20 px apart. At most two
    rows are filled per class; images that do not fit are dropped. Each class is
    then drawn in its own subplot, titled with the class name.

    :param merged_classes_names: label names to visualise, one subplot each.
    :param label_col: array of labels, aligned with ``tid_col``.
    :param tid_col: array of strings naming the image files (without ``.png``).
    :param pdf: when truthy, the figure is saved through ``pdf.savefig`` and then
        closed; otherwise the figure is shown with ``plt.show()``.
    :raises ValueError: if a class has no entry in ``label_col``.

    Image files are opened one at a time and closed immediately, so no file
    handle outlives this call and only one file is open at any moment.
    """
    fig_w_in = 18
    dpi = 72
    fig_w_px = int(fig_w_in * dpi)
    img_dir = 'user_data/spect/fft/syllable'
    offset = 20    # gap in pixels between neighbouring images and between rows
    max_rows = 2   # rows of images kept per class

    final_imgs_combs = []
    # Per class: [first grid row, one past the last grid row] of its subplot.
    subplot_cols = []
    current_subplot_col = 0
    for cls in merged_classes_names:
        subplot_cols.append([current_subplot_col])

        syl_inds = np.where(label_col == cls)
        selected_tids = tid_col[syl_inds]
        selected_syl_imgpth = [img_dir + '/' + tid + '.png' for tid in selected_tids]
        if not selected_syl_imgpth:
            raise ValueError('No syllable is labelled {!r}; nothing to visualise'.format(cls))

        # First pass reads sizes only; pixels are loaded later for placed images.
        sizes = [_read_image_size(pth) for pth in selected_syl_imgpth]
        max_height = max(height for _width, height in sizes)
        total_height = max_height

        imgs_combs = []
        imgs_comb = np.full((max_height, fig_w_px, 3), 255, dtype=np.uint8)
        current_col = 0
        row_count = 1
        for pth, (width, _height) in zip(selected_syl_imgpth, sizes):
            if current_col + width > fig_w_px:
                # The current row is full: store it and either start a new row
                # or stop once the row limit is reached.
                imgs_combs.append(imgs_comb)
                row_count += 1
                if row_count > max_rows:
                    imgs_comb = None
                    break
                imgs_comb = np.full((max_height, fig_w_px, 3), 255, dtype=np.uint8)
                total_height += (max_height + offset)
                current_subplot_col += 1
                current_col = 0

            # NOTE(review): this slice assignment assumes every image is
            # max_height pixels tall with 3 channels -- TODO confirm.
            imgs_comb[:, current_col:current_col + width] = _read_image_pixels(pth)
            current_col = current_col + width + offset

        if imgs_comb is not None:
            imgs_combs.append(imgs_comb)
        final_imgs_comb = np.full((total_height, fig_w_px, 3), 255, dtype=np.uint8)

        current_subplot_col += 1
        subplot_cols[-1].append(current_subplot_col)

        # Stack the rows vertically, `offset` pixels apart.
        current_row = 0
        for imgs_comb in imgs_combs:
            final_imgs_comb[current_row:current_row + max_height, :] = imgs_comb
            current_row = current_row + max_height + offset

        final_imgs_combs.append(Image.fromarray(final_imgs_comb))

    max_height = max([x.size[1] for x in final_imgs_combs])
    total_height_px = max_height * len(merged_classes_names)
    total_height_in = total_height_px / dpi

    fig = plt.figure(figsize=(fig_w_in, total_height_in))
    for i, cls in enumerate(merged_classes_names):
        start_col, end_col = subplot_cols[i]
        # A class that used two image rows spans two rows of the subplot grid.
        ax = plt.subplot2grid((current_subplot_col, 1), (start_col, 0), rowspan=end_col - start_col)
        ax.imshow(np.asarray(final_imgs_combs[i]))
        ax.axis('off')
        ax.set_title(cls)

    if pdf:
        pdf.savefig(fig)
        plt.close(fig)
    else:
        plt.show()

In [None]:
def analyse_percentile(pct, base_pdf_name, tree, label_col, tid_col):
    """Cut ``tree`` at a percentile of its merge heights and report merged classes.

    The cut-off is the ``pct``-th percentile of column 2 of ``tree``. Every
    cluster produced by the cut that contains more than one class is visualised
    twice (ordination scatter plot and syllable images). All pages, preceded by
    a dendrogram coloured at the cut-off, go to ``<base_pdf_name>_at_<pct>pct.pdf``.

    :param pct: percentile (0-100) of the merge heights to cut at.
    :param base_pdf_name: prefix of the output file name; also used in the
        dendrogram title.
    :param tree: linkage matrix, as produced by ``hierarchy.linkage``.
    :param label_col: passed through to ``visualise_syllables``.
    :param tid_col: passed through to ``visualise_syllables``.

    Class names are looked up in the notebook-level ``label_arr``.
    """
    heights = tree[:, 2]
    cut_off = np.percentile(heights, pct)
    # cut_tree returns one column per requested cut; only one cut is requested.
    clustering = cut_tree(tree, height=cut_off)[:, 0]

    # cluster index -> indices of the classes that fell into that cluster
    merged_classes_info = {}
    for cls_ind, clst_ind in enumerate(clustering):
        merged_classes_info.setdefault(clst_ind, []).append(cls_ind)

    # Only clusters that actually merge two or more classes are of interest.
    merged_classes_names_list = [
        [label_arr[cls_ind] for cls_ind in members]
        for members in merged_classes_info.values()
        if len(members) > 1
    ]

    # The title is derived from the arguments rather than from a notebook global,
    # so the function does not depend on which cell ran last.
    title = '{} - cut at {}pct'.format(base_pdf_name, pct)

    # The context manager closes the PDF even if one of the plots raises.
    with PdfPages('{}_at_{}pct.pdf'.format(base_pdf_name, pct)) as pdf:
        plot_spectrogram(tree, title, color_threshold=cut_off, pdf=pdf)
        for merged_classes_names in merged_classes_names_list:
            visualise_syntactical_similar_syllables(merged_classes_names, pdf)
            visualise_syllables(merged_classes_names, label_col, tid_col, pdf)

def analyse_adjacency_matrix(mat, base_pdf_name, label_col, tid_col, percentiles=(5, 10)):
    """Cluster the rows of ``mat`` and analyse the tree at several cut-offs.

    An average-linkage tree is built from ``mat`` once, then
    ``analyse_percentile`` is run on it for every value in ``percentiles``.

    :param mat: matrix handed to ``hierarchy.linkage``.
    :param base_pdf_name: passed through to ``analyse_percentile``.
    :param label_col: passed through to ``analyse_percentile``.
    :param tid_col: passed through to ``analyse_percentile``.
    :param percentiles: percentiles of the merge heights to cut the tree at;
        defaults to the 5th and the 10th.
    """
    tree = hierarchy.linkage(mat, 'average')
    for pct in percentiles:
        analyse_percentile(pct, base_pdf_name, tree, label_col, tid_col)

In [None]:
# `Ordination` is used below but is not imported by any earlier cell, so it is
# imported here to keep the notebook runnable from a fresh kernel.
# NOTE(review): assumed to live in koe.models next to Database -- confirm.
from koe.models import Ordination

for ord_id in [24, 127]:
    ordination = Ordination.objects.get(id=ord_id)

    # Both paths are resolved relative to the Django project root.
    bytes_path = os.path.join(settings.BASE_DIR, ordination.get_bytes_path())
    sids_path = os.path.join(settings.BASE_DIR, ordination.get_sids_path())

    # NOTE: this rebinds the notebook-level `sids` to the ids stored with this
    # ordination; visualise_syntactical_similar_syllables reads that global.
    sids = bytes_to_ndarray(sids_path, np.int32)
    metadata, headers = extract_tensor_metadata(sids, annotator)

    # Also read as a global by visualise_syntactical_similar_syllables.
    ordination_data = get_rawdata_from_binary(bytes_path, len(sids))

    # Locate the label and tid columns of the metadata rows by header name.
    headers_arr = np.array(headers)
    label_colid = np.where(headers_arr == label_level)[0][0]
    tid_colid = np.where(headers_arr == 'tid')[0][0]
    label_col = np.array([metadata[sid][label_colid] for sid in sids])
    tid_col = np.array([metadata[sid][tid_colid] for sid in sids])

    # One set of PDFs per (ordination, adjacency matrix) pair.
    for name, mat in mats.items():
        base_pdf_name = ordination.dm.database.name + ':' + ordination.get_name() + ':' + name
        print('===={}===='.format(base_pdf_name))
        analyse_adjacency_matrix(mat, base_pdf_name, label_col, tid_col)