In [1]:
%load_ext autoreload
%autoreload 2

In [117]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import learning_curve

# metrics
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import make_scorer

# preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# models
from sklearn.svm import SVC

# config
_RANDOM_STATE = 0

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import librosa
from IPython.display import Audio
import numpy as np

from itertools import pairwise
import collections

import networkx as nx

In [138]:
import automutualinformation
import pickle

context_dict = {
    0:'Unknown',
    1:'Separation',
    2:'Biting',
    3:'Feeding',
    4:'Fighting',
    5:'Grooming',
    6:'Isolation',
    7:'Kissing',
    8:'Landing',
    9:'Mating protest',
    10:'Threat-like',
    11:'General',
    12:'Sleeping',
}


def load_seq_data( dataset_descriptor, number_of_clusters):
    # Load The Sequences!
    with open(f'symbolic_sequences_{dataset_descriptor}_{number_of_clusters}_symseq.pkl','rb') as f:
        symbolic_sequences = pickle.load(f)

    # Load The Graph!
    with open(f'graph_symbolic_sequences_{dataset_descriptor}_{number_of_clusters}_graphsymseq.pkl','rb') as f:
        G = pickle.load(f)

    # Load The Sequences!
    with open(f"{dataset_descriptor}_{number_of_clusters}_map_complete.pkl",'rb') as f:
        seq_mapped = pickle.load(f)
        seq_mapped.drop_duplicates(subset=["voc_segments_ix"], keep="first", inplace=True)
        seq_mapped.index = range(len(seq_mapped.index))
        seq_mapped.index.set_names('segmentID', inplace=True)
        
    return symbolic_sequences, G, seq_mapped
    
#symbolic_sequences_offspring, G_offspring, seq_mapped_offspring = load_seq_data( 'offspring_relationships', 372)
#symbolic_sequences_wildbats, G_wildbats, seq_mapped_wildbats = load_seq_data( 'wildbats_relationships', 175 )
symbolic_sequences_bat_215, G_bat_215, seq_mapped_bat_215 = load_seq_data( 'bat_215', 84 )

In [139]:
# CHOOSE THE DB

#seg_mapped = seq_mapped_offspring
#symbolic_sequences = symbolic_sequences_offspring

symbolic_sequences = symbolic_sequences_bat_215
seq_mapped = seq_mapped_bat_215

## Functions

In [178]:
symbolic_sequences['context'] = symbolic_sequences['label_context'].apply(lambda x : collections.Counter(x).most_common()[0][0])

In [140]:
from collections import defaultdict
from itertools import pairwise

import numpy as np

# unigrams
_1_gram = symbolic_sequences['Seq_Syllables'].explode().value_counts().to_dict()
_1_gram = {k : v/ sum(_1_gram.values()) for k, v in _1_gram.items()}

# bigrams
_2_gram = pd.Series([p for seq in symbolic_sequences['Seq_Syllables'] for p in pairwise(seq)]).value_counts().to_dict()
_2_gram = {k : v/ sum(_2_gram.values()) for k, v in _2_gram.items()}

#_3_gram = symbolic_sequences['Seq_Syllables'].apply(lambda seq: tuple(seq[i: i + 3] for i in range(len(seq) - 3 + 1))).explode().value_counts().to_dict()
        

In [183]:
# depending on context:

_grams = {}

for context in symbolic_sequences['context']:
    
    tmp_series = symbolic_sequences[symbolic_sequences['context'] == context]['Seq_Syllables']
    
    _1_gram = tmp_series.explode().value_counts().to_dict()
    _1_gram = {k : v/ sum(_1_gram.values()) for k, v in _1_gram.items()}
    
    _2_gram = pd.Series([p for seq in tmp_series for p in pairwise(seq)]).value_counts().to_dict()
    _2_gram = {k : v/ sum(_2_gram.values()) for k, v in _2_gram.items()}
    
    _grams[context] = {
            '_1_gram' : _1_gram,
            '_2_gram' : _2_gram,
    }

  _2_gram = pd.Series([p for seq in tmp_series for p in pairwise(seq)]).value_counts().to_dict()


In [185]:
_grams;

In [251]:
def conditional_p(antecedent, current,  _1_gram_voc = _1_gram, _2_gram_voc = _2_gram):
    
    # forward transitional probability
    #https://corpustools.readthedocs.io/en/latest/transitional_probability.html
    
    #given the order: xy => p(y|x) = p(xy )/ p(x)
    try:
        # p(y|x) = p(xy )/ p(x)
        prob = _2_gram_voc[(antecedent, current)] / _1_gram_voc[(antecedent)]
    except KeyError:
        prob = 1
            
    return prob
            
          

In [252]:
def transitional_p(current, successor , _1_gram_voc = _1_gram, _2_gram_voc = _2_gram):
    
    #backword transitional probability
    #https://corpustools.readthedocs.io/en/latest/transitional_probability.html
    
    #given the order: xy => p(x|y) = p(xy )/ p(y)
    
    try:
        prob = _2_gram_voc[(current, successor)] / _1_gram_voc[(successor)]
    except KeyError:
        prob = 1
            
    return prob

In [253]:
transitional_p.__name__

'transitional_p'

In [254]:
def p_seq(seq, probability_function, vocabularies = [_1_gram, _2_gram]):  
    
    _1_gram_voc = vocabularies[0]
    _2_gram_voc = vocabularies[1]
    
    prob = 1.
    
    initial_p = seq[0]
    
    prob *= initial_p

    for p in pairwise(seq[1:]):    
        
        
        
        antecedent = p[0]
        current = p[1]
        
        if probability_function.__name__ == 'conditional_p':

            prob *= probability_function(antecedent, current, _1_gram_voc = _1_gram_voc, _2_gram_voc = _2_gram_voc )

        elif probability_function.__name__ == 'transitional_p':

            prob *= probability_function(antecedent, current, _1_gram_voc = _1_gram_voc, _2_gram_voc = _2_gram_voc)

        else:
            raise
    
    return prob


In [255]:
collections.Counter([0,1]).values()

dict_values([1, 1])

In [162]:
def syl_in_seq(seq):
    return sum( collections.Counter(seq).values() )
        

In [195]:
def entropy_seq(seq, vocabulary = _1_gram):
    
    entropy = 0
    
    for syl in seq:
        try:
            prob =  vocabulary[syl] / len(vocabulary)
        except KeyError:
            prob = 0
        
        entropy += -prob * np.log2(prob)
    return entropy

In [111]:
symbolic_sequences['Seq_Syllables'].apply(lambda x : p_seq(x, transitional_p))

0       3.078513e-01
1       3.078513e-01
2       1.000000e+00
3       4.586987e-01
4       1.000000e+00
            ...     
6074    5.794988e-66
6075    3.029826e-36
6076    1.509544e-04
6077    8.443615e-57
6078    2.159375e-09
Name: Seq_Syllables, Length: 6079, dtype: float64

In [147]:
len(_1_gram)

84

In [149]:
symbolic_sequences;

In [256]:
def prepare_data(sequences_series):
    
    columns = ['entropy', 'conditional', 'transitional']
    
    df = pd.DataFrame(columns = columns)
    
    df['syl_in_seq'] = sequences_series.apply(syl_in_seq)
    df['entropy'] = sequences_series.apply(entropy_seq)
    df['conditional'] = sequences_series.apply(lambda x : p_seq(x, conditional_p))
    df['transitional'] = sequences_series.apply(lambda x : p_seq(x, transitional_p))
    
    for k in _grams:
        vocabularies = [_grams[k]['_1_gram'], _grams[k]['_2_gram']]
        df[f'entropy_{k}'] = sequences_series.apply(lambda x : entropy_seq(x , vocabulary = vocabularies[0] ))
        df[f'conditional_{k}'] = sequences_series.apply(lambda x : p_seq(x , conditional_p, vocabularies = vocabularies))
        df[f'transitional_{k}'] = sequences_series.apply(lambda x : p_seq(x , transitional_p, vocabularies = vocabularies))

    
    return df

In [257]:
cond_1 = symbolic_sequences.label_context.apply(lambda x : len(np.unique(x)) < 2)
cond_2 = symbolic_sequences.label_context.apply(lambda x : all(i in [9, 2, 3,6, 4, 7, 5, 10] for i in x))

raw_data = symbolic_sequences.loc[cond_1 & cond_2];

X = prepare_data(raw_data['Seq_Syllables']);

X = X.fillna(X.mean(axis=0))

X.tail()

Unnamed: 0,entropy,conditional,transitional,syl_in_seq,entropy_6,conditional_6,transitional_6,entropy_12,conditional_12,transitional_12,...,transitional_7,entropy_8,conditional_8,transitional_8,entropy_10,conditional_10,transitional_10,entropy_5,conditional_5,transitional_5
6074,2.661083,5.569905e-37,7.103761999999999e-38,51,0.289441,8.124146e-09,6.743602e-13,0.213887,1.18392e-37,1.484887e-35,...,2.411947e-10,0.0,58.0,58.0,0.644132,1.527646e-10,3e-06,0.565092,5e-06,5.048023e-06
6075,1.863402,2.331523e-05,2.331523e-05,24,0.289441,1.8963e-08,5.735501e-10,1.540247,2.941155e-07,2.941155e-07,...,5.999839e-05,0.0,58.0,58.0,5.594532,3.685894e-05,0.004792,0.565092,0.000352,0.0003518366
6076,0.277605,3.634551,28.49772,4,0.382329,14.19211,14.19211,0.231358,3.85621,24.51507,...,28.74733,0.0,58.0,58.0,0.844808,2.877587,19.688756,0.801827,7.897274,30.79937
6077,2.440229,3.218542e-22,3.218542e-22,40,0.289441,5.270535e-10,1.446454e-12,2.016862,1.145707e-22,4.4336310000000005e-25,...,7.280492e-11,0.0,58.0,58.0,0.644132,7.384862e-12,4.5e-05,0.565092,1.1e-05,2.457369e-07
6078,0.609308,17.12275,17.12275,7,0.822775,0.05087722,0.05087722,0.502661,8.831951,8.831951,...,19.603,0.0,58.0,58.0,1.819685,15.86289,15.862894,1.661222,4.101705,4.101705


In [258]:
y = raw_data['label_context'].apply(lambda x : x[0])

In [259]:
# training, test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state= _RANDOM_STATE)

# training, validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.25, random_state= _RANDOM_STATE)

# binarise the classes 
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_val = label_binarizer.transform(y_val)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)


(321, 8)

In [260]:
#Scale the features
sc = StandardScaler()
sc.fit(X_train)

X_train_std = sc.transform(X_train)
X_val_std = sc.transform(X_val)
X_test_std = sc.transform(X_test)

# test permutation
#X_permuted_std = sc.transform(X_perm)
#y_permuted = permuted_pruned_df.iloc[X_test.index]['label_context'].apply(lambda x : x[0])

In [263]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=_RANDOM_STATE, n_jobs=-1)
forest.fit(X_train_std, y_train)
y_pred = forest.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('Precision: %.2f' % precision_score(y_test, y_pred, average = 'micro'))
print('Recall: %.2f' % recall_score(y_test, y_pred, average = 'micro'))
print('F1-micro: %.2f' % f1_score(y_test, y_pred, average = 'micro'))
print('F1-weighted: %.2f' % f1_score(y_test, y_pred, average = 'weighted'))

Misclassified samples: 152
Accuracy: 0.53
Precision: 0.53
Recall: 0.53
F1-micro: 0.53
F1-weighted: 0.44


In [264]:
feature_labels = X.columns

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(feature_labels[indices[f]], importances[indices[f]])

entropy_11 0.06317620005606922
entropy 0.05060204368981685
entropy_4 0.048556831617663714
entropy_12 0.04326459615364474
conditional_6 0.03980834584999589
entropy_9 0.039788463111450845
entropy_3 0.0389416260360765
entropy_2 0.036956072936662414
transitional_7 0.02918709753397129
transitional_8 0.02833794351615929
conditional_10 0.027279246818265355
entropy_10 0.027243251462768552
conditional_5 0.027216050636608858
transitional_11 0.027147243719406158
entropy_6 0.026233243412101242
transitional_2 0.025702631817772523
transitional_5 0.02514754964727888
conditional_8 0.02434458521361713
entropy_7 0.024171547498720285
transitional_10 0.023699580650591332
conditional_3 0.023515598265346525
transitional_3 0.023299387489882747
conditional_11 0.02295833763798585
entropy_5 0.022449074619853165
conditional_7 0.022281975750283015
conditional_4 0.02223887107452554
transitional_4 0.022124037186014876
conditional_9 0.021535184700590394
conditional_2 0.021180088223810255
transitional_6 0.02096403082

In [265]:
from sklearn.metrics import classification_report
print(classification_report(y_test,
                            y_pred,
                            digits=2, 
                            target_names= [context_dict[i] for i in [2,3,4,5,6,7,9,10]]
                           )
     )

from sklearn.metrics import confusion_matrix

# assuming y_true and y_pred are your true and predicted labels, respectively
cm = confusion_matrix(y_test, y_pred)

print(cm)

                precision    recall  f1-score   support

        Biting       0.42      0.15      0.22        72
       Feeding       0.20      0.04      0.07        24
      Fighting       0.00      0.00      0.00        13
      Grooming       0.00      0.00      0.00        12
     Isolation       1.00      0.80      0.89        20
       Kissing       0.00      0.00      0.00        12
Mating protest       0.53      0.90      0.66       157
   Threat-like       0.00      0.00      0.00        11

      accuracy                           0.53       321
     macro avg       0.27      0.24      0.23       321
  weighted avg       0.43      0.53      0.44       321

[[ 11   1   0   1   0   0  59   0]
 [  2   1   0   0   0   0  21   0]
 [  0   0   0   0   0   0  13   0]
 [  0   0   0   0   0   0  12   0]
 [  1   1   0   0  16   0   2   0]
 [  1   1   0   0   0   0  10   0]
 [ 11   0   2   3   0   0 141   0]
 [  0   1   0   0   0   0  10   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
