In [1]:
import scipy.io, math, os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from utils import *
from nltk.corpus import cmudict
from numpy import linalg as LA
import seaborn as sns
import pylab
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from scipy.spatial import distance
from scipy.cluster import hierarchy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import Levenshtein
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation

In [2]:
subjects = ['F1', 'F5', 'M1', 'M3']
subj_index = 0

# for determining the number of syllables 
d = cmudict.dict()

# TODO: put this in utils file 
def get_srate(file_number):
    directory = 'data/Data/{}/mat'.format(subjects[subj_index])
    
    files = sorted(os.listdir(directory))
    try:
        files.remove('.DS_Store')
    except:
        pass
    
    file = files[file_number]
    
    f = os.path.join(directory, file)
    mat = scipy.io.loadmat(f)['usctimit_ema_{}_{:03}_{:03}'.format(subjects[subj_index].lower(), file_number*5 + 1, file_number*5 + 5)]
    
    #returns the srate which is awkwardly stored here
    return mat[0][1][1][0][0]

In [3]:
directory = 'data/Data/{}/mat'.format(subjects[subj_index])
counter = 1
UL_df, LL_df, JW_df, TD_df, TB_df, TT_df = [], [], [], [], [], []

for filename in sorted(os.listdir(directory)):
    if filename.endswith('.mat'):
        f = os.path.join(directory, filename)
        mat = scipy.io.loadmat(f)
        # takes the data that is stored at the key that precedes the data for each .mat file
        data = mat['usctimit_ema_{}_{:03}_{:03}'.format(subjects[subj_index].lower(), counter, counter + 4)]
        counter += 5

        # make dataframes of the six positions
        UL_df.append(pd.DataFrame.from_dict(data[0][1][2]))
        LL_df.append(pd.DataFrame.from_dict(data[0][2][2]))
        JW_df.append(pd.DataFrame.from_dict(data[0][3][2]))
        TD_df.append(pd.DataFrame.from_dict(data[0][4][2]))
        TB_df.append(pd.DataFrame.from_dict(data[0][5][2]))
        TT_df.append(pd.DataFrame.from_dict(data[0][6][2]))
        
dataframes = [UL_df, LL_df, JW_df, TB_df, TD_df, TT_df]

In [4]:
frames = {}
sensors = ['ULx', 'ULy', 'LLx', 'LLy', 
           'JWx', 'JWy', 'TDx', 'TDy', 
           'TBx', 'TBy', 'TTx', 'TTy']

with open('timestamps.txt', 'r') as file:
    timestamps = file.read().splitlines()
    for word_number, line in enumerate(timestamps):
        split_line = line.split(',')
        sent_number = int(split_line[-1])
        
        # find start and end by multiplying the timestamps with the sampling rate
        starting_point = math.floor(float(split_line[2]) * get_srate(int(split_line[0])))
        end_point = math.ceil(float(split_line[3]) * get_srate(int(split_line[0])))
        
        # make new dataframe for the current word
        df = pd.DataFrame()
        
        for sensor in sensors:
            # position, dimension, file_number, starting_point, end_point
            array = get_pos_list(sensor[:2], sensor[-1], int(split_line[0]), starting_point, end_point, dataframes)
            df[sensor] = pd.Series(array)
            df.word = split_line[1]
            # TODO: get rid of UserWarning 
            df.sent = int(split_line[-1])
            df.syl = nsyl(split_line[1], d)
            frames[word_number] = df

  df.syl = nsyl(split_line[1], d)


In [31]:
do_normalize = True

syl1_words, syl2_words, syl3_words = {}, {}, {}
syl4_words, syl5_words, syl6_words = {}, {}, {}

syl_frames = [syl1_words, syl2_words, 
              syl3_words, syl4_words, 
              syl5_words, syl6_words]

# fit scaler on a global level to prevent local standardization 
scaler = StandardScaler()
scaler.fit(pd.concat(frames))

for i in range(1, len(syl_frames) + 1):
    for count, frame in enumerate(frames):
        # for some reason some words store the number of syllables in an array, hence:
        if (isinstance(frames[frame].syl, list) and frames[frame].syl[0] == i) or\
            (not isinstance(frames[frame].syl, list) and frames[frame].syl == i):
            # standardize the data to have a mean of 0 and approx. a SD of 1
            if do_normalize:
                data = scaler.transform(frames[frame])
                df = pd.DataFrame(data, columns=sensors)
                syl_frames[i - 1][count] = df
                
                # set meta-data, at this point we only need the word and the sentence it came from
                syl_frames[i - 1][count].word = frames[frame].word
                syl_frames[i - 1][count].sent = frames[frame].sent
            else:
                syl_frames[i - 1][count] = frames[frame]

In [32]:
# padding 
# just an observation: running this code twice returns an exception, will look into it later 
for i, frame in enumerate(syl_frames):
    # target length is the the word with the most samples in that syllable category
    target_length = longest_frame(frame)
    for word in frame.keys():
        current_length = frame[word].shape[0]
        pad_length1 = int((target_length - current_length) / 2)
        if np.mod((target_length - current_length),2) == 1:
            pad_length2 = pad_length1 + 1
        else:
            pad_length2 = pad_length1

        frame[word] = pd.DataFrame(np.pad(frame[word].values, 
                                   ((pad_length1,pad_length2), (0,0)), 'mean'),
                                   columns=sensors).transpose()

In [30]:
labels_syl1, labels_syl2, labels_syl3 = [], [], []
labels_syl4, labels_syl5, labels_syl6 = [], [], []

labels = [labels_syl1, labels_syl2, 
          labels_syl3, labels_syl4, 
          labels_syl5, labels_syl6]

for i, frame in enumerate(syl_frames):
    for word in frame:
        labels[i].append(frame[word].word)

In [56]:
difference_matrix_1, difference_matrix_2, difference_matrix_3 = [], [], []
difference_matrix_4, difference_matrix_5, difference_matrix_6 = [], [], []

difference_matrices = [difference_matrix_1, difference_matrix_2, 
                       difference_matrix_3, difference_matrix_4, 
                       difference_matrix_5, difference_matrix_6]

for i, matrix in enumerate(syl_frames):
    for row_word in matrix.values():
        row = np.array([])
        for column_word in matrix.values():
            # calculate the frob norm for each word pair and putting it into diff matrix 
            difference_matrix = row_word.subtract(column_word).to_numpy()
            frob_norm = linalg.norm(difference_matrix)

            row = np.append(row, frob_norm)

        difference_matrices[i].append(row)
    
    # turn into correlation matrix 
    diff = np.array([difference_matrices[i]])
    df = pd.DataFrame(diff[0])
    df.columns, df.index = labels[i], labels[i]
    correlations = df.corr()
    correlations_array = np.asarray(df.corr())
    
    # replace the uncondensed matrix with the condensed one
    difference_matrices[i] = correlations

In [59]:
Levenshtein_matrix_1, Levenshtein_matrix_2, Levenshtein_matrix_3 = [], [], []
Levenshtein_matrix_4, Levenshtein_matrix_5, Levenshtein_matrix_6 = [], [], []

Levenshtein_matrices = [Levenshtein_matrix_1, Levenshtein_matrix_2, 
                       Levenshtein_matrix_3, Levenshtein_matrix_4, 
                       Levenshtein_matrix_5, Levenshtein_matrix_6]

for i, arr in enumerate(labels):
    for row_word in labels[i]:
        row = np.array([])
        for column_word in labels[i]:
            lev = Levenshtein.distance(row_word, column_word)
            row = np.append(row, lev)

        Levenshtein_matrices[i].append(row)
        
    # turn into correlation matrix 
    diff = np.array([Levenshtein_matrices[i]])
    df = pd.DataFrame(diff[0])
    df.columns, df.index = labels[i], labels[i]
    correlations = df.corr()
    correlations_array = np.asarray(df.corr())
    
    # replace the uncondensed matrix with the condensed one
    Levenshtein_matrices[i] = correlations

In [None]:
sns.heatmap(Levenshtein_matrices[4], cmap='vlag', linewidths=1)