In [1]:
# Load the appropriate modules 
import os, sys, glob
import numpy as np
import cPickle as pickle
sys.path.append('../')
import config
sys.path.append(config.BASE_PATH)
from dataloader import fused_features_IEMOCAP as IEMOCAP_loader

sys.path.append(config.PATTERN_SEARCH_MDS_PATH)

In [23]:
# Build the datasets for the speaker-independent EmoDB and IEMOCAP experiments
def get_emodb_dataset(features_dic):
    """Stack the per-speaker EmoDB features into a single dataset.

    Args:
        features_dic: mapping ``speaker -> {'x': array, 'y': labels}``. The
            ``'x'`` arrays are concatenated along axis 0 and the ``'y'``
            sequences are appended to one flat list, in dictionary order.

    Returns:
        A tuple ``(X_all, Y_all, speaker_indices, number_of_speakers)`` where
        ``speaker_indices[speaker]`` is the half-open ``(start, stop)`` row
        range of that speaker inside ``X_all`` / ``Y_all``.

    Raises:
        ValueError: if ``features_dic`` is empty.
    """
    speaker_indices = {}
    x_all_list = []
    Y_all = []
    prev_ind = 0
    for te_speaker, te_data in features_dic.items():
        x_all_list.append(te_data['x'])
        Y_all += te_data['y']
        this_speaker_samples = len(te_data['y'])

        speaker_indices[te_speaker] = (prev_ind, prev_ind + this_speaker_samples)
        prev_ind += this_speaker_samples

    if not x_all_list:
        raise ValueError('features_dic is empty: there is nothing to concatenate')

    # Concatenate once, after the loop: doing it per iteration re-copies every
    # previously collected row and left X_all undefined for empty input.
    X_all = np.concatenate(x_all_list, axis=0)
    number_of_speakers = len(features_dic)
    return X_all, Y_all, speaker_indices, number_of_speakers

def fuse_excited_happiness(l):
    """Return a copy of the label list with 'excited' and 'happy' merged.

    Every label equal to 'excited' or 'happy' is replaced by the single class
    name 'happy + excited'; all other labels are passed through unchanged.
    """
    merged_labels = ('excited', 'happy')
    fused = []
    for label in l:
        if label in merged_labels:
            fused.append('happy + excited')
        else:
            fused.append(label)
    return fused

def get_iemocap_dataset(features_dic):
    """Stack the per-speaker IEMOCAP features into a single dataset.

    Args:
        features_dic: mapping ``speaker -> {'x': array, 'y': labels}``. The
            speaker key minus its last character is taken as the session
            name. ``'x'`` arrays are concatenated along axis 0 and ``'y'``
            sequences are appended to one flat list, in dictionary order.

    Returns:
        A tuple ``(X_all, Y_all, speaker_indices, number_of_sessions)`` where
        ``Y_all`` has 'excited' and 'happy' fused into 'happy + excited',
        ``speaker_indices[speaker]`` is the half-open ``(start, stop)`` row
        range of that speaker, and ``number_of_sessions`` counts the distinct
        session names.

    Raises:
        ValueError: if ``features_dic`` is empty.
    """
    speaker_indices = {}
    x_all_list = []
    Y_all = []
    prev_ind = 0
    sessions = set()
    for te_speaker, te_data in features_dic.items():
        # Drop the trailing character of the speaker key to get the session.
        sessions.add(te_speaker[:-1])
        x_all_list.append(te_data['x'])
        Y_all += te_data['y']
        this_speaker_samples = len(te_data['y'])

        speaker_indices[te_speaker] = (prev_ind, prev_ind + this_speaker_samples)
        prev_ind += this_speaker_samples

    if not x_all_list:
        raise ValueError('features_dic is empty: there is nothing to concatenate')

    # Concatenate once, after the loop: doing it per iteration re-copies every
    # previously collected row and left X_all undefined for empty input.
    X_all = np.concatenate(x_all_list, axis=0)
    return X_all, fuse_excited_happiness(Y_all), speaker_indices, len(sessions)

In [63]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.base import TransformerMixin, BaseEstimator
from tqdm import tqdm_notebook as tqdm

class IEMOCAPData(Dataset):
    """Minimal torch ``Dataset`` that serves the rows of a feature matrix."""

    def __init__(self, X,):
        # Kept by reference; no copy or dtype conversion happens here.
        self.X_high = X

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of the matrix."""
        n_samples = self.X_high.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.X_high[idx]
        return sample

class autoencoder(nn.Module):
    """Fully connected autoencoder: in_dim -> 256 -> 128 -> hidden_dim and back.

    ``forward`` returns both the reconstruction and the bottleneck code.
    """

    def __init__(self, in_dim, hidden_dim):
        super(autoencoder, self).__init__()
        # Encoder layers are created before decoder layers so that parameter
        # registration (and random initialisation) order stays the same.
        encoder_layers = [
            nn.Linear(in_dim, 256),
            nn.ReLU(True),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, hidden_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(hidden_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, in_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again; returns ``(reconstruction, code)``."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return reconstruction, code


class AE(object):
    """Autoencoder dimensionality reducer exposing fit / transform / fit_transform.

    Trains an ``autoencoder`` with an MSE reconstruction loss and uses the
    bottleneck activations as the low-dimensional embedding.

    Args:
        original_dim: number of input features.
        target_dim: size of the bottleneck (embedding dimension).
        batch_size: mini-batch size used while training.
        learning_rate: step size handed to the Adam optimizer.
        num_epochs: upper bound on the number of training epochs.
        early_stop: training stops once the summed epoch loss has increased
            this many times (the counter is cumulative, it is never reset).
    """

    def __init__(self, original_dim, target_dim, batch_size=32, learning_rate=1e-4, num_epochs=10000, early_stop=10):
        self.original_dim = original_dim
        self.target_dim = target_dim
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.early_stop = early_stop
        self.dvc = 'cpu'
        self.model = None

    def fit(self, X):
        """Train a fresh autoencoder on ``X`` and store it in ``self.model``.

        Returns:
            The trained ``autoencoder`` module (not ``self``); kept that way
            because ``fit_transform`` and existing callers rely on it.
        """
        iemo_data = IEMOCAPData(X)

        dataloader = DataLoader(iemo_data, batch_size=self.batch_size, shuffle=True)
        model = autoencoder(self.original_dim, self.target_dim).to(self.dvc)
        criterion = nn.MSELoss()
        # Pass the configured learning rate; it used to be silently ignored
        # in favour of Adam's default.
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        patience = 0
        prev_avg_loss = float('inf')
        # Guard against an empty loader so the logging below cannot divide by 0.
        n_batches = max(len(dataloader), 1)
        for epoch in range(self.num_epochs):
            avg_loss = 0
            data_loader = tqdm(iter(dataloader))

            for data in data_loader:
                data = data.float().to(self.dvc)
                # ===================forward=====================
                output, hidden = model(data)
                loss = criterion(output, data)
                # ===================backward====================
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.detach().item()
            # ===================log========================
            # avg_loss holds the *summed* batch loss of this epoch; it is only
            # divided by the number of batches for display.
            if avg_loss > prev_avg_loss:
                patience += 1
            data_loader.set_description(
                'Epoch: {}; Loss: {:.10f}; patience: {}'.format(
                    epoch + 1,
                    avg_loss / n_batches,
                    patience,
                ))
            if patience >= self.early_stop:
                break
            prev_avg_loss = avg_loss
        self.model = model
        return model

    def transform(self, X):
        """Project ``X`` onto the learned bottleneck.

        Returns:
            A NumPy array with one ``target_dim``-sized code per row of ``X``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError('AE.transform called before fit')
        inputs = torch.Tensor(X).type(torch.FloatTensor).to(self.dvc)
        # The model returns (reconstruction, code); only the code is the
        # embedding. no_grad + cpu make the tensor convertible to NumPy.
        with torch.no_grad():
            _, hidden = self.model(inputs)
        return hidden.cpu().numpy()

    def fit_transform(self, X):
        """Train on ``X`` and return the embedding of ``X``."""
        self.model = self.fit(X)
        return self.transform(X)

In [48]:
# Initialize all available manifold methods
import multidimensional
import multidimensional.common
import multidimensional.mds 
import multidimensional.smacof
import multidimensional.mds_utils as mds_utils
from sklearn import manifold, decomposition
from sklearn.decomposition import PCA
from sklearn.manifold import MDS as SMACOF
from sklearn.preprocessing import StandardScaler

class IdentityData(TransformerMixin, BaseEstimator):
    """No-op transformer that hands the data back unchanged.

    Serves as the 'Original Data' baseline next to the real dimensionality
    reduction methods, so it can be driven through the same interface.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return ``X`` itself (the same object, not a copy)."""
        return X

    def fit_transform(self, x, y=None):
        """Return ``x`` itself (the same object, not a copy); ``y`` is ignored."""
        return x

def get_manifold_methods(original_dim, target_dim):
    """Build the registry of dimensionality-reduction methods for one target size.

    Args:
        original_dim: number of input features (only the autoencoder needs it).
        target_dim: dimensionality of the embedding every method should produce.

    Returns:
        A dict ``method name -> {'results': {}, 'object': estimator}`` where
        each estimator exposes ``fit_transform``. The 'Pattern Search MDS'
        entry is configured with ``dissimilarities='precomputed'`` and so
        expects a distance matrix rather than raw features.
    """
    # Neighbourhood size shared by all LocallyLinearEmbedding variants.
    method_n_comp = 66
    radius_barrier = 1e-3
    explore_dim_percent = 1
    starting_radius = 32
    max_turns = 500
    point_filter = (multidimensional.point_filters.FixedStochasticFilter(keep_percent=1, recalculate_each=10))
    radius_update = (multidimensional.radius_updates.AdaRadiusHalving(tolerance=.5*1e-3, burnout_tolerance=100000))

    def lle(variant):
        # n_neighbors / n_components are passed by keyword: newer scikit-learn
        # releases no longer accept them positionally.
        return manifold.LocallyLinearEmbedding(n_neighbors=method_n_comp, n_components=target_dim,
                                               eigen_solver='auto', method=variant, n_jobs=8)

    manifold_methods = {
        'Pattern Search MDS': { 'results': {}, 'object': multidimensional.mds.MDS(target_dim, point_filter,
                                                         radius_update, starting_radius=starting_radius,
                                                         radius_barrier=radius_barrier, max_turns=max_turns,
                                                         keep_history=False,
                                                         dissimilarities='precomputed',
                                                         explore_dim_percent=explore_dim_percent)},
        'MDS SMACOF': { 'results': {}, 'object': SMACOF(n_components=target_dim, n_init=1,
                                                 max_iter=max_turns, dissimilarity='euclidean', n_jobs=8)},
        'LTSA': { 'results': {}, 'object': lle('ltsa')},
        'Modified LLE': { 'results': {}, 'object': lle('modified')},
        'Hessian LLE': { 'results': {}, 'object': lle('hessian')},
        'LLE': { 'results': {}, 'object': lle('standard')},
        'PCA': { 'results': {}, 'object': PCA(n_components=target_dim)},
        # Requested by the experiment configuration but previously missing
        # here, which made every 'Truncated SVD' run fail with a KeyError.
        'Truncated SVD': { 'results': {}, 'object': decomposition.TruncatedSVD(n_components=target_dim)},
        'Spectral Embedding': { 'results': {}, 'object': manifold.SpectralEmbedding(n_components=target_dim,
                                                                                    n_jobs=8)},
        'TSNE': { 'results': {}, 'object': manifold.TSNE(n_components=target_dim)},
        'ISOMAP': { 'results': {}, 'object': manifold.Isomap(n_neighbors=12, n_components=target_dim)},
        'Original Data': { 'results': {}, 'object': IdentityData()},
        'Autoencoder': { 'results': {}, 'object': AE(original_dim, target_dim) }

    }
    return manifold_methods

In [49]:
def run_DR(target_dims, methods_to_test, X_all, saveto=None):
    """Reduce ``X_all`` with every requested method and target dimension.

    Args:
        target_dims: iterable of embedding sizes to produce.
        methods_to_test: names of the methods to run; they are looked up in
            the registry returned by ``get_manifold_methods``.
        X_all: 2-D feature matrix (samples x features), used unscaled.
        saveto: optional cache file name inside ``../cache``. When the file
            exists its content is returned and nothing is recomputed;
            otherwise the fresh results are pickled there.

    Returns:
        ``reduced[target_dim][method]`` holding the embedding, or ``None``
        when the method failed (unknown name or an exception while fitting).
    """
    # Only build the cache path when caching was requested: joining with
    # None raises before the "is not None" check could ever run.
    save_file = None
    if saveto is not None:
        save_file = os.path.join('../', 'cache', saveto)
        if os.path.isfile(save_file):
            # NOTE(security): unpickling executes arbitrary code; only load
            # cache files this notebook wrote itself.
            with open(save_file, 'rb') as fd:
                reduced = pickle.load(fd)
            return reduced

    # The input is deliberately not z-normalised here:
    #     X_high = StandardScaler().fit_transform(X_all)
    X_high = X_all
    print(X_high.shape)

    reduced = {}
    original_dim = X_all.shape[1]
    for target_dim in target_dims:
        reduced[target_dim] = {}
        manifold_methods = get_manifold_methods(original_dim, target_dim)
        for selected_method in methods_to_test:
            print('Running Method: {}'.format(selected_method))

            print('Reducing Input from Dimension: {} to a Lower Embedded Manifold with dimensions: {}...'.format(
                   X_high.shape[1], target_dim))
            try:
                obj = manifold_methods[selected_method]['object']
                if selected_method == 'Pattern Search MDS':
                    # This estimator is configured for precomputed
                    # dissimilarities, so it gets a distance matrix.
                    d_goal = multidimensional.common.DISTANCE_MATRIX(X_high.astype(np.float64))
                    X_low = obj.fit_transform(d_goal)
                else:
                    X_low = obj.fit_transform(X_high)
            except Exception as e:
                # Best effort: one failing method must not abort the sweep,
                # but say which one failed (a bare KeyError prints only a name).
                print('Method {} failed for target dimension {}: {!r}'.format(
                    selected_method, target_dim, e))
                X_low = None
            reduced[target_dim][selected_method] = X_low

    if save_file is not None:
        # Create the cache directory if needed so that the results of a long
        # run are not lost on a missing folder.
        cache_dir = os.path.dirname(save_file)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        with open(save_file, 'wb') as fd:
            pickle.dump(reduced, fd)
    return reduced

In [61]:
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

def emodb_speaker_groups(speaker_indices, n_samples):
    """Build the per-sample group vector for leave-one-speaker-out CV.

    Args:
        speaker_indices: mapping ``speaker -> (start, stop)`` half-open row
            range; every speaker key must be convertible with ``int()``.
        n_samples: total number of samples.

    Returns:
        Integer array of length ``n_samples`` holding ``int(speaker)`` for
        each covered row; rows outside every range stay 0.
    """
    # The builtin int replaces the np.int alias, which newer NumPy removed.
    speaker_groups = np.zeros(n_samples, dtype=int)
    for speaker, (start, stop) in speaker_indices.items():
        speaker_groups[start:stop] = int(speaker)
    return speaker_groups

def iemocap_speaker_groups(speaker_indices, n_samples):
    """Build the per-sample group vector for leave-one-session-out CV.

    Args:
        speaker_indices: mapping ``speaker -> (start, stop)`` half-open row
            range. The second-to-last character of the speaker key must be
            the session digit, e.g. 'Ses01M' -> 1 and 'Ses05F' -> 5.
        n_samples: total number of samples.

    Returns:
        Integer array of length ``n_samples`` holding the session number of
        each covered row; rows outside every range stay 0.
    """
    # The builtin int replaces the np.int alias, which newer NumPy removed.
    speaker_groups = np.zeros(n_samples, dtype=int)
    for sp, (start, stop) in speaker_indices.items():
        # sp = Ses0XM or Ses0XF, so sp[-2] is the session digit X.
        speaker_groups[start:stop] = int(sp[-2])
    return speaker_groups

def knn_search(n_neighbors, X, y, speaker_groups):
    """Grid-search the k of a z-normalised kNN classifier with leave-one-group-out CV.

    Args:
        n_neighbors: candidate values for ``n_neighbors``.
        X: feature matrix, or ``None`` when the reduction step failed.
        y: labels, one per row of ``X``.
        speaker_groups: group id per sample; each distinct id is held out once.

    Returns:
        ``(wa, ua, neigh)``: mean test accuracy, mean test balanced accuracy
        and the tested neighbour counts, aligned by position. When ``X`` is
        ``None`` both score lists are all zeros and ``n_neighbors`` is
        returned as given.
    """
    if X is None:
        n_candidates = len(n_neighbors)
        return [0] * n_candidates, [0] * n_candidates, n_neighbors

    # Materialise the folds once so the same splits serve every candidate.
    logo_folds = list(LeaveOneGroupOut().split(X, y, speaker_groups))

    znorm_knn = Pipeline(steps=[
        ('znorm', StandardScaler()),
        ('knn', KNeighborsClassifier(p=2, metric='minkowski')),
    ])
    searcher = GridSearchCV(
        znorm_knn,
        param_grid={'knn__n_neighbors': n_neighbors},
        scoring={
            'w_acc': make_scorer(accuracy_score),
            'uw_acc': make_scorer(balanced_accuracy_score),
        },
        cv=logo_folds,
        refit=False,
        iid=False,
        n_jobs=-1,
        verbose=1,
    )
    cv_results = searcher.fit(X, y).cv_results_

    tested_k = [params['knn__n_neighbors'] for params in cv_results['params']]
    return cv_results['mean_test_w_acc'], cv_results['mean_test_uw_acc'], tested_k


def run_knn_search(reduced, y_all, speaker_groups, saveto=None, neighbor_grid=None):
    """Evaluate every reduced representation with the kNN grid search.

    Args:
        reduced: ``reduced[target_dim][method] -> embedding or None``.
        y_all: labels, one per sample.
        speaker_groups: group id per sample for leave-one-group-out CV.
        saveto: optional path; when given the results are pickled there.
        neighbor_grid: candidate neighbour counts. Defaults to the
            notebook-level ``n_neighbors``, which must then already be defined.

    Returns:
        Dict ``target_dim -> DataFrame`` with one '<method> WA' and one
        '<method> UA' row per method and one column per neighbour count,
        columns sorted ascending.
    """
    # Resolve the grid once; the global is only touched when no explicit grid
    # is supplied, so the function no longer depends on hidden state.
    grid = n_neighbors if neighbor_grid is None else neighbor_grid

    results = {}
    # .items() works on Python 2 and 3, unlike .iteritems().
    for target_dim, methods in reduced.items():
        method_results = {}
        for method, X_low in methods.items():
            wa, ua, n_neigh = knn_search(grid, X_low, y_all, speaker_groups)
            method_results['{} WA'.format(method)] = dict(zip(n_neigh, wa))
            method_results['{} UA'.format(method)] = dict(zip(n_neigh, ua))
        df = pd.DataFrame.from_dict(method_results, orient='index')
        df = df[sorted(df.columns)]
        results[target_dim] = df
    if saveto is not None:
        with open(saveto, 'wb') as fd:
            pickle.dump(results, fd)
    return results

In [62]:
def latex_preformat_print(df, columns=(1, 5, 9, 13, 17, 21)):
    """Print a results table as LaTeX with WA and UA side by side per method.

    Args:
        df: frame indexed by '<method> WA' / '<method> UA' whose columns are
            neighbour counts.
        columns: neighbour counts to include; each must be a column of ``df``.

    The printed table has one row per method: the WA values for ``columns``
    followed by the UA values for the same columns.

    Raises:
        KeyError: if a '<method> UA' row has no matching '<method> WA' row,
            or a requested column is missing.
    """
    selected = list(columns)
    methods = {}
    # The method name is the index label without its ' WA' / ' UA' suffix.
    for ind in df.index.values:
        if ind[-2:] == 'WA' and ind[:-3] not in methods:
            methods[ind[:-3]] = list(df[selected].loc[ind])
    for ind in df.index.values:
        if ind[-2:] == 'UA':
            methods[ind[:-3]] += list(df[selected].loc[ind])

    df = pd.DataFrame.from_dict(methods, orient="index")
    # Function-call form so the cell runs on Python 2 and 3 alike.
    print(df.to_latex())

In [57]:
# Experiment configuration shared by all the run cells below.

# Candidate k values for the kNN grid search: 1, 5, 9, ..., 37.
n_neighbors = np.arange(1, 40, 4)

# NOTE(review): absolute, machine-specific feature locations; adjust these
# paths before running the notebook on another machine.
IEMOCAP_data_path = '/home/geopar/all_TRUE_IEMOCAP_feats/'
# "l" = linear (emobase2010) features, "nl" = RQA features, going by the file names.
iemocap_l_feats_p = os.path.join(IEMOCAP_data_path, 'linear/IEMOCAP_linear_emobase2010')
iemocap_nl_feats_p = os.path.join(
    IEMOCAP_data_path, 
    'utterance/IEMOCAP-rqa-ad_hoc-tau-7-supremum-recurrence_rate-0.15-dur-0.03-fs-16000.dat')

berlin_data_path = '/home/geopar/all_BERLIN_features/'
berlin_l_feats_p = os.path.join(berlin_data_path, 'linear/BERLIN_linear_emobase2010')
berlin_nl_feats_p = os.path.join(
    berlin_data_path, 
    'rqa/utterance/BERLIN-rqa-ad_hoc-tau-7-manhattan-recurrence_rate-0.15-dur-0.02-fs-16000.dat')

# Names of the dimensionality-reduction methods to evaluate; run_DR looks
# each one up in the registry built by get_manifold_methods.
methods_to_test = ['Autoencoder', 'Pattern Search MDS', 'Truncated SVD', 'Spectral Embedding', 
                   'LLE', 'Modified LLE', 'ISOMAP'] 

# Embedding sizes to produce with every method.
target_dims = [100, 75, 50, 40, 25, 10, 5, 3, 2]

In [59]:
# IEMOCAP, features from iemocap_nl_feats_p only: load, reduce, then score
# with kNN using the session number as the cross-validation group.
# Note: X_all, y_all, speaker_indices, speaker_groups and reduced are
# overwritten by every later experiment cell.
data_dic = IEMOCAP_loader.get_fused_features([iemocap_nl_feats_p])
X_all, y_all, speaker_indices, number_of_sessions = get_iemocap_dataset(data_dic)
speaker_groups = iemocap_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-rqa-iemocap.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-rqa-iemocap.p')
iemocap_rqa_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-rqa-iemocap-results.p')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.8s finished


In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(iemocap_rqa_results.keys()):
    df = iemocap_rqa_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)

In [None]:
# IEMOCAP, features from iemocap_l_feats_p only: load, reduce, then score
# with kNN using the session number as the cross-validation group.
data_dic = IEMOCAP_loader.get_fused_features([iemocap_l_feats_p])
X_all, y_all, speaker_indices, number_of_sessions = get_iemocap_dataset(data_dic)
speaker_groups = iemocap_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-emobase-iemocap.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-emobase-iemocap.p')
iemocap_emobase_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-emobase-iemocap-results.p')

In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(iemocap_emobase_results.keys()):
    df = iemocap_emobase_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)

In [None]:
# IEMOCAP, both feature files passed together to get_fused_features: load,
# reduce, then score with kNN using the session number as the CV group.
data_dic = IEMOCAP_loader.get_fused_features([iemocap_l_feats_p, iemocap_nl_feats_p])
X_all, y_all, speaker_indices, number_of_sessions = get_iemocap_dataset(data_dic)
speaker_groups = iemocap_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-fused-iemocap.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-fused-iemocap.p')
iemocap_fused_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-fused-iemocap-results.p')

In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(iemocap_fused_results.keys()):
    df = iemocap_fused_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)

In [None]:
# Berlin (EmoDB), features from berlin_nl_feats_p only: load, reduce, then
# score with kNN using the integer speaker id as the cross-validation group.
# The Berlin features are read through the same IEMOCAP_loader module.
data_dic = IEMOCAP_loader.get_fused_features([berlin_nl_feats_p])
X_all, y_all, speaker_indices, number_of_speakers = get_emodb_dataset(data_dic)
speaker_groups = emodb_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-rqa-berlin.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-rqa-berlin.p')
berlin_rqa_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-rqa-berlin-results.p')

In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(berlin_rqa_results.keys()):
    df = berlin_rqa_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)

In [None]:
# Berlin (EmoDB), features from berlin_l_feats_p only: load, reduce, then
# score with kNN using the integer speaker id as the cross-validation group.
data_dic = IEMOCAP_loader.get_fused_features([berlin_l_feats_p])
X_all, y_all, speaker_indices, number_of_speakers = get_emodb_dataset(data_dic)
speaker_groups = emodb_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-emobase-berlin.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-emobase-berlin.p')
berlin_emobase_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-emobase-berlin-results.p')

In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(berlin_emobase_results.keys()):
    df = berlin_emobase_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)

In [None]:
# Berlin (EmoDB), both feature files passed together to get_fused_features:
# load, reduce, then score with kNN using the integer speaker id as CV group.
data_dic = IEMOCAP_loader.get_fused_features([berlin_l_feats_p, berlin_nl_feats_p])
X_all, y_all, speaker_indices, number_of_speakers = get_emodb_dataset(data_dic)
speaker_groups = emodb_speaker_groups(speaker_indices, X_all.shape[0])
# Reduced embeddings are cached under ../cache/dr-fused-berlin.p.
reduced = run_DR(target_dims, methods_to_test, X_all, saveto='dr-fused-berlin.p')
# Store under berlin_fused_results: the fused scores were being assigned to
# berlin_emobase_results, clobbering the emobase run and leaving the name
# read by the following display cell undefined.
berlin_fused_results = run_knn_search(reduced, y_all, speaker_groups, saveto='dr-fused-berlin-results.p')

In [None]:
# Show the kNN accuracy table and its LaTeX form for every target dimension.
for target_dim in sorted(berlin_fused_results.keys()):
    df = berlin_fused_results[target_dim]
    print("For Target Dimension: {}".format(target_dim))
    # display() renders the frame and returns None, so wrapping it in print
    # only added a stray "None" line to the output.
    display(df)
    latex_preformat_print(df)